# xPlore: Automatic Explorative Data Analysis

xPlore is a tool for automatically generating explorative data analysis reports. Many stages of explorative data analysis are repetitive and can be automated. This notebook was generated automatically by xPlore and can be used as a template for further explorative data analysis steps.

### Read the Data into a Dataframe

In [None]:
def read(path="resources/iris.csv"):
    """Load a CSV file into a pandas DataFrame.

    Args:
        path: Location of the CSV file to read. Defaults to the dataset
            path this notebook was generated with, so a bare ``read()``
            call behaves as before.

    Returns:
        The parsed ``pandas.DataFrame``.
    """
    import pandas as pd

    # low_memory=False makes pandas parse the file in one pass rather than
    # in chunks, which avoids mixed-type inference within a column.
    return pd.read_csv(path, low_memory=False)

In [None]:
# Load the dataset once; interact() and the plotting cells below read this
# module-level df.
df = read()

### Head of the Dataframe

In [None]:
def head(df):
    """Return the leading rows of ``df`` using pandas' default row count."""
    preview = df.head()
    return preview

In [None]:
# Preview the first rows of the loaded dataframe.
head(df)

### Shape of the Dataframe

In [None]:
def shape(df):
    """Return the dimensions of ``df`` as reported by its ``shape`` attribute."""
    dimensions = df.shape
    return dimensions

In [None]:
# Report the dimensions of the loaded dataframe.
shape(df)

### Types of Columns

In [None]:
def column_types(df):
    """Split the column labels of ``df`` into numerical and non-numerical.

    Returns:
        A ``(numerical, non_numerical)`` pair of lists of column labels.
    """
    import numpy as np

    number_dtypes = [np.number]
    numeric_part = df.select_dtypes(include=number_dtypes)
    other_part = df.select_dtypes(exclude=number_dtypes)
    return list(numeric_part.columns), list(other_part.columns)

In [None]:
# Split the column labels by dtype and print both groups.
numerical_columns, non_numerical_columns = column_types(df)
print("Numerical columns:", numerical_columns)
print("Non-numerical columns:", non_numerical_columns)

### Percentage of Missing Values in Columns

In [None]:
def missing_values_percentage(df):
    """Report, per column of ``df``, the percentage of null entries.

    Returns:
        A DataFrame with a ``column_name`` column holding the labels of
        ``df`` and a ``percent_missing`` column holding the share of null
        values in each column, expressed in percent.
    """
    import pandas as pd

    row_count = len(df)
    null_counts = df.isnull().sum()
    share_missing = null_counts * 100 / row_count
    summary = {"column_name": df.columns, "percent_missing": share_missing}
    return pd.DataFrame(summary)

In [None]:
# Compute the per-column missing-value percentages; the trailing bare
# expression is what a notebook cell renders as its output.
missing_values_percentage_df = missing_values_percentage(df)
missing_values_percentage_df

### Pairwise Correlation between Columns

In [None]:
def pairwise_correlation_between_columns(df):
    """Draw an annotated heatmap of the pairwise column correlations.

    Only numeric and boolean columns of ``df`` take part in the
    correlation; all other columns are left out of the heatmap.

    Args:
        df: Dataframe whose columns are correlated.
    """
    import seaborn as sns

    # Since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
    # by default and raises on them (e.g. a string-valued target column).
    # Selecting the numeric/bool columns explicitly keeps the older behavior
    # and works on every pandas version.
    numeric_df = df.select_dtypes(include=["number", "bool"])
    # Pearson correlations lie in [-1, 1]; pin the color scale to that range.
    sns.heatmap(numeric_df.corr(), vmin=-1, vmax=1, annot=True)

In [None]:
# Render the correlation heatmap for the loaded dataframe.
pairwise_correlation_between_columns(df)

### Pairwise Scatter Plots

In [None]:
def pairwise_scatter_plots(df, target_values=None, features=None):
    """Draw pairwise scatter plots for a selection of rows and columns.

    Args:
        df: Dataframe containing a ``"class"`` column.
        target_values: Iterable of ``"class"`` values; only rows whose
            ``"class"`` is among them are plotted. ``None`` keeps every row.
        features: Iterable of column labels to plot. ``None`` passes all
            columns on to seaborn, which then plots the numeric ones.
    """
    import seaborn as sns

    # The declared None defaults used to crash in list(None); treat None as
    # "no filtering" so the function is callable with just a dataframe.
    selection = df
    if target_values is not None:
        selection = selection[selection["class"].isin(list(target_values))]
    if features is not None:
        selection = selection[list(features)]
    sns.pairplot(selection)

def interact():
    """Display multi-select widgets that drive the pairwise scatter plots.

    Reads the module-level ``df``. One widget selects the values of the
    ``"class"`` column to include, the other selects the feature columns;
    both are wired to ``pairwise_scatter_plots`` through
    ``widgets.interactive_output``.
    """
    # Only the names actually used are imported; the unused ones were
    # dropped so the cell does not fail on them for no benefit.
    from IPython.display import display
    from ipywidgets import fixed, widgets

    target_values_options = list(df["class"].unique())

    # Every column except the target itself can be chosen as a feature.
    features_options = list(df.columns.values)
    features_options.remove("class")

    w1 = widgets.SelectMultiple(
        options=target_values_options,
        value=target_values_options,
        rows=len(target_values_options),
        description="Target Values (column: class)",
        style={'description_width': 'initial'},
        disabled=False,
    )
    w2 = widgets.SelectMultiple(
        options=features_options,
        # Preselect up to two features; slicing already clamps to the
        # list length, so no explicit min() is needed.
        value=features_options[:2],
        rows=len(features_options),
        description="Features",
        disabled=False,
    )

    ui = widgets.HBox([w1, w2])
    out = widgets.interactive_output(
        pairwise_scatter_plots,
        {"df": fixed(df), "target_values": w1, "features": w2},
    )

    display(ui, out)

In [None]:
# Build and display the interactive scatter-plot controls.
interact()

### Distribution of Target Variable

In [None]:
def distribution_of_target_variable(df):
    """Count how often each value occurs in the ``"class"`` column of ``df``."""
    target_column = df["class"]
    counts = target_column.value_counts()
    return counts

In [None]:
# Count the rows per value of the "class" column; the trailing bare
# expression is what a notebook cell renders as its output.
dist_target_variable = distribution_of_target_variable(df)
dist_target_variable

In [None]:
def plot_distribution_of_target_variable(dist_target_variable):
    """Show a bar chart of the target-variable value counts.

    Args:
        dist_target_variable: Counts indexed by target value, with a pandas
            ``.plot`` method (for example the Series returned by
            ``distribution_of_target_variable``).
    """
    import matplotlib.pyplot as plt

    # Plot the counts that were passed in. The previous version ignored the
    # argument and recomputed the counts from the module-level df.
    dist_target_variable.plot(kind="bar")
    plt.xticks(rotation="horizontal")
    plt.xlabel("Target Variable: class")
    plt.ylabel("Count")
    plt.title("Distribution of Target Variable: class")
    plt.show()

In [None]:
# Plot the class counts held in dist_target_variable as a bar chart.
plot_distribution_of_target_variable(dist_target_variable)