In [None]:
%load_ext autoreload
%autoreload 2
from intedact import univariate_eda_interact
import pandas as pd
import seaborn as sns
from IPython.display import display
import ipywidgets as widgets

# Inject a CSS override so widget output areas are not collapsed:
# the scroll container's fixed height and box styling are unset, and the
# output area itself is sized automatically.
style = """
    <style>
       .jupyter-widgets-output-area .output_scroll {
            height: unset !important;
            border-radius: unset !important;
            -webkit-box-shadow: unset !important;
            box-shadow: unset !important;
        }
        .jupyter-widgets-output-area  {
            height: auto !important;
        }
    </style>
    """
# Wrap the stylesheet in an HTML widget and render it into this cell's output.
css_override = widgets.HTML(value=style)
display(css_override)

In [None]:
# These are needed for text summaries.
# 'punkt_tab' is fetched in addition to 'punkt' because NLTK >= 3.8.2 loads its
# Punkt tokenizer data from that resource instead of the pickled 'punkt' models.
# NOTE(review): presumably the text summaries tokenize through NLTK's Punkt
# tokenizer — confirm against intedact.
import nltk
for resource in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(resource)

# Example 1: Diamonds Dataset

The first example uses the classic diamonds dataset, which ships with both ggplot2 and seaborn. It is a good introduction to the basic discrete and continuous summaries.

Recommended Explorations:
  - Try playing with number of bins on carat
  - Try removing outliers for the x, y, and z variables

In [None]:
# Load the diamonds data and encode the three grade columns as ordered
# categoricals so they sort and compare by grade rather than alphabetically.
data = sns.load_dataset("diamonds")

ordered_levels = {
    # Fair is the lowest cut grade, Ideal the highest.
    "cut": ["Fair", "Good", "Very Good", "Premium", "Ideal"],
    # Colour grades kept in letter order, D through J.
    "color": ["D", "E", "F", "G", "H", "I", "J"],
    # Clarity from worst (I1) to best (IF); SI2 ranks below SI1.
    "clarity": ["I1", "SI2", "SI1", "VS2", "VS1", "VVS2", "VVS1", "IF"],
}
for column, levels in ordered_levels.items():
    data[column] = pd.Categorical(data[column], categories=levels, ordered=True)

In [None]:
# Open the interactive univariate summary for the diamonds data.
# NOTE(review): notes_file / figure_dir presumably set where notes and saved
# figures are written — confirm against the intedact documentation.
univariate_eda_interact(
    data,
    notes_file="tmp.json",
    figure_dir=".",
)

# Example 2: Tidy Tuesday GDPR Violations

Recommended Explorations:
- Try using a log transform on the price column.
- Check out the date column for an example of a datetime summary. Try setting the Lower Trim option to 20 so you can see the main time series.
- Check out the summary column for an example of a text summary. By default, it does not compute the top n-grams, so check the 'Plot most common ngrams' option to plot the top unigrams through trigrams. Also, because text tokenization can be time-consuming, auto-updating is turned off for this summary, so you have to press the 'Run Interact' button to update it after changing any control options.
- Check out the article_violated column for an example of a collections summary
- Check out the source column for an example of a url summary


In [None]:
# Load the Tidy Tuesday GDPR violations table (tab-separated) from GitHub.
data = pd.read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-21/gdpr_violations.tsv",
    sep="\t",
)
# Parse the date strings into datetimes.
data["date"] = pd.to_datetime(data["date"])
# Split the pipe-delimited article list into a Python list per row.
# The vectorized .str accessor leaves missing values as NaN instead of raising
# AttributeError the way calling x.split() on a float NaN would.
data["article_violated"] = data["article_violated"].str.split("|")

In [None]:
# Open the interactive univariate summary for the GDPR violations data,
# using the function's default options (no extra arguments are passed).
univariate_eda_interact(data)