In [None]:
# Import python packages
from statsmodels.graphics.gofplots import qqplot
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from statsmodels.formula.api import ols
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
#import plotly.graph_objs as go
import scipy.stats as stats
import statsmodels.api as sm
import streamlit as st

# We can also use Snowpark for our analyses!
from snowflake.snowpark.context import get_active_session
# Active Snowpark session as returned by get_active_session().
# NOTE(review): `session` is not referenced by any cell visible in this
# notebook excerpt - confirm it is still needed before removing it.
session = get_active_session()

In [None]:
import os

# Import plotly's graph objects, retrying once with a private, writable
# MPLCONFIGDIR if the first attempt fails.
# NOTE(review): the retry presumably works around an environment where the
# default matplotlib config directory is not writable - confirm.
try:
    import plotly.graph_objs as go
except Exception:
    # `except Exception` (not a bare `except`) so that KeyboardInterrupt and
    # SystemExit still propagate instead of triggering the retry path.
    import atexit
    import shutil
    import tempfile

    mpldir = tempfile.mkdtemp()
    # Remove the temporary directory at interpreter exit; ignore_errors keeps
    # a failed cleanup from printing a traceback during shutdown.
    atexit.register(shutil.rmtree, mpldir, ignore_errors=True)

    os.environ['MPLCONFIGDIR'] = mpldir

    import plotly.graph_objs as go

Select the data to analyze

In [None]:
-- Source rows for the analysis: every column of WINE_PARAMS.
-- NOTE(review): the Python cell that follows reads this result via
-- `cell3.to_pandas()`, so this cell is presumably named `cell3` - confirm.
SELECT *
FROM WINE_PARAMS;


Separate numerical columns from the rest

In [None]:
# Materialise the SQL result as a pandas DataFrame.
# NOTE(review): `cell3` is presumably the name of the SQL cell above - confirm
# if cells are renamed or reordered.
df = cell3.to_pandas()

# Columns with a numeric dtype.
num_cols = list(df.select_dtypes(include=[np.number]).columns)
# Every remaining column is treated as categorical. dict.fromkeys removes
# duplicate labels like the previous set() did, but keeps the DataFrame's
# column order; set() ordering of strings varies between interpreter runs,
# which made the selectbox order (and the index-based defaults) unstable.
cat_cols = list(dict.fromkeys(df.select_dtypes(exclude=[np.number]).columns))

General Description

In [None]:
# Each entry pairs a section title with a callable that builds the table shown
# under it; the callables keep table construction lazy so every table is
# computed only after its own heading has been emitted.
overview_sections = [
    ("Head of the Dataset", lambda: df.head()),
    ("Data Types", lambda: df.dtypes),
    ("Numeric Data Description", lambda: df[num_cols].describe().T),
]

# The categorical summary is only shown when there is something to summarise.
if len(cat_cols) > 0:
    overview_sections.append(
        ("Categorical Data Description", lambda: df[cat_cols].describe().T)
    )

for section_title, build_table in overview_sections:
    st.subheader(section_title)
    st.dataframe(build_table(), use_container_width=True)

Exploration of Numerical Variables

In [None]:
num_explore_selection = st.selectbox(
    "Select column to analyze",
    num_cols)

# Work on the non-missing values only: NaNs break the KDE curve of the
# distribution plot and turn the Q-Q reference line into NaN.
selected_values = df[num_explore_selection].dropna()
value_range = selected_values.max() - selected_values.min()

st.subheader("Distribution Plot")
if value_range > 0:
    # Bin width chosen so the observed range is split into 15 bins.
    fig1 = ff.create_distplot([selected_values],
                              [num_explore_selection],
                              bin_size=value_range / 15)
    st.plotly_chart(fig1, use_container_width=True)
else:
    # Empty column (range is NaN) or constant column (range is 0): a bin size
    # of 0 and a degenerate KDE cannot be plotted.
    st.info("Not enough distinct values to draw a distribution plot")

fig2 = px.box(df, x=num_explore_selection)
st.subheader("Box Plot")
st.plotly_chart(fig2, use_container_width=True)

if not selected_values.empty:
    # Compute the Q-Q data directly with ProbPlot instead of drawing a
    # matplotlib figure via qqplot() and scraping its lines: the figure was
    # never closed, so every rerun leaked one matplotlib figure.
    prob_plot = sm.ProbPlot(selected_values)
    theoretical_q = prob_plot.theoretical_quantiles
    sample_q = prob_plot.sample_quantiles
    # Standardized reference line, i.e. what qqplot(..., line="s") draws:
    # theoretical quantiles scaled by the sample std and shifted by its mean.
    reference_line = theoretical_q * np.std(sample_q) + np.mean(sample_q)

    fig3 = go.Figure()
    fig3.add_trace({
        "type": "scatter",
        "x": theoretical_q,
        "y": sample_q,
        "mode": "markers",
        "marker": {
            "color": "#19d3f3"
        }
    })
    fig3.add_trace({
        "type": "scatter",
        "x": theoretical_q,
        "y": reference_line,
        "mode": "lines",
        "line": {
            "color": "#636efa"
        }
    })
    fig3["layout"].update({
        "title": "Quantile-Quantile Plot",
        "xaxis": {
            # Axis titles previously read "Theoritical Quantities" /
            # "Sample Quantities"; the plotted values are quantiles.
            "title": "Theoretical Quantiles",
            "zeroline": False
        },
        "yaxis": {
            "title": "Sample Quantiles"
        },
        "showlegend": False,
        "width": 800,
        "height": 700,
    })
    st.plotly_chart(fig3, use_container_width=True)

Exploration of Categorical Variables

In [None]:
# Check for categorical columns before rendering anything, so that a dataset
# without them shows an explanation instead of an empty, unusable selector.
if len(cat_cols) > 0:
    cat_explore_selection = st.selectbox(
        "Select column to analyze",
        cat_cols)

    st.subheader("Histogram")
    fig4 = px.histogram(df, x=cat_explore_selection)
    st.plotly_chart(fig4, use_container_width=True)

    st.subheader("Pie Chart")
    fig5 = px.pie(df, names=cat_explore_selection)
    st.plotly_chart(fig5, use_container_width=True)
else:
    # Keep the name defined; an empty selectbox would have yielded None too.
    cat_explore_selection = None
    st.info("No categorical columns to explore")

Explore Interactions Between Numeric Variables

In [None]:
if len(num_cols) > 0:
    num_num_interaction1 = st.selectbox(
        "Select first column to analyze",
        num_cols)
    # Pre-select the second numeric column when there is one. A hard-coded
    # index=1 makes st.selectbox raise when only one numeric column exists.
    num_num_interaction2 = st.selectbox(
        "Select second column to analyze",
        num_cols,index=1 if len(num_cols) > 1 else 0)

    # Scatter plot of the two selected columns with an OLS trend line.
    fig6 = px.scatter(df, x=num_num_interaction1, y=num_num_interaction2, trendline="ols")
    st.plotly_chart(fig6, use_container_width=True)
else:
    # Keep the names defined for any later cell that reads them.
    num_num_interaction1 = num_num_interaction2 = None
    st.info("No numeric columns to compare")

Explore Interaction Between Categorical Variables

In [None]:
# The guard comes first: previously the selectors were created before it, and
# the hard-coded index=1 made st.selectbox raise whenever the dataset had
# exactly one categorical column.
if len(cat_cols) > 0:
    cat_cat_interaction1 = st.selectbox(
        "Select first column to analyze",
        cat_cols)
    # Pre-select the second categorical column only when one exists.
    cat_cat_interaction2 = st.selectbox(
        "Select second column to analyze",
        cat_cols,index=1 if len(cat_cols) > 1 else 0)

    # Counts of the first column, coloured by the second.
    fig7 = px.histogram(df, x=cat_cat_interaction1, color=cat_cat_interaction2)
    st.plotly_chart(fig7, use_container_width=True)
else:
    # Keep the names defined for any later cell that reads them.
    cat_cat_interaction1 = cat_cat_interaction2 = None
    st.info("No categorical columns to compare")

Explore Interactions Between a Numerical and a Categorical Variable

In [None]:
# Pick one numeric and one categorical column to compare.
num_cat_interaction1 = st.selectbox(
    label="Select numeric column to analyze",
    options=num_cols,
)
num_cat_interaction2 = st.selectbox(
    label="Select categorical column to analyze",
    options=cat_cols,
)

# One box per category: categories along x, the numeric values along y.
box_axes = {"x": num_cat_interaction2, "y": num_cat_interaction1}
fig8 = px.box(df, **box_axes)
st.plotly_chart(fig8, use_container_width=True)

Correlation Heatmap

In [None]:
# Correlation estimators offered to the user; the first one is the default.
correlation_methods = ["pearson","kendall","spearman"]
corr_method = st.selectbox(
    label="Select method for calculating correlation",
    options=correlation_methods,
)

# Pairwise correlation matrix over the numeric columns, drawn as a heatmap.
df_num = df[num_cols]
corr = df_num.corr(method=corr_method)
fig9 = px.imshow(corr)
st.plotly_chart(fig9, use_container_width=True)

Explore Interaction Between 2 Numeric and 1 Categorical Variable

In [None]:
if len(num_cols) > 0:
    tri_interaction1 = st.selectbox(
        "Select first numeric column to analyze",
        num_cols)
    # Pre-select the second numeric column when there is one. A hard-coded
    # index=1 makes st.selectbox raise when only one numeric column exists.
    tri_interaction2 = st.selectbox(
        "Select second numeric column to analyze",
        num_cols,index=1 if len(num_cols) > 1 else 0)
    # With no categorical columns this selector yields None, which plotly
    # express treats as "no colour grouping".
    tri_interaction_cat = st.selectbox(
        "Select categorical column as legend",
        cat_cols)

    # Scatter of the two numeric columns, coloured by the categorical one.
    fig12 = px.scatter(df, x=tri_interaction1, y=tri_interaction2, color=tri_interaction_cat)
    st.plotly_chart(fig12, use_container_width=True)
else:
    # Keep the names defined for any later cell that reads them.
    tri_interaction1 = tri_interaction2 = tri_interaction_cat = None
    st.info("No numeric columns to compare")

Rank Associations Between Variables

In [None]:
reference_var = st.selectbox(
    "Select reference variable to compare to",
    df.columns)


def _quoted(column):
    """Wrap a column name in the formula helper ``Q()``.

    Quoting lets column names that are not valid Python identifiers (spaces,
    dots, leading digits) be used inside a model formula.
    """
    return "Q({!r})".format(str(column))


def _eta_squared(frame, numeric_col, categorical_col):
    """Return eta squared of ``numeric_col`` explained by ``categorical_col``.

    Fits a one-way OLS model ``numeric ~ C(categorical)`` (rows with missing
    values dropped) and divides the first ANOVA sum of squares by the total
    of the first two rows of the type-1 ANOVA table.
    """
    formula = "{} ~ C({})".format(_quoted(numeric_col), _quoted(categorical_col))
    model = ols(formula, data=frame[[numeric_col, categorical_col]],
                missing="drop").fit()
    sum_sq = sm.stats.anova_lm(model, typ=1)["sum_sq"]
    return sum_sq.iloc[0] / (sum_sq.iloc[0] + sum_sq.iloc[1])


def _cramers_v(frame, first_col, second_col):
    """Return Cramer's V between two categorical columns (NaN if undefined).

    V = sqrt(chi2 / (n * (min(rows, cols) - 1))). The previous code divided by
    ``sum(tbl)``, which adds up the crosstab's column *labels* and only
    reached the real observation count through an exception handler; it also
    omitted the ``min(rows, cols) - 1`` normalisation.
    """
    table = pd.crosstab(frame[first_col], frame[second_col])
    n_obs = table.values.sum()
    smaller_dim = min(table.shape) - 1
    if n_obs == 0 or smaller_dim < 1:
        # No observations, or one of the variables has a single level.
        return np.nan
    chi_sq = stats.chi2_contingency(table, correction=False)[0]
    return np.sqrt(chi_sq / (n_obs * smaller_dim))


def _top_scores(scores, value_name, k=5):
    """Return the ``k`` largest non-NaN entries of a ``{column: score}`` dict.

    The result has the columns ``level0`` (constant 0), ``level1`` (the
    column name) and ``value_name``, matching the layout this cell has always
    produced for ``topk_esq`` / ``topk_cramer``.
    """
    ranked = (pd.Series(scores, dtype="float64")
              .dropna()
              .sort_values(ascending=False)
              .head(k))
    table = ranked.rename_axis("level1").reset_index(name=value_name)
    table.insert(0, "level0", 0)
    return table


def _correlation_table(correlations):
    """Turn a Series of correlations into a ``level0``/``level1``/``rsq`` frame.

    ``rsq`` holds the signed correlation coefficient (not its square); the
    column name is kept unchanged for compatibility.
    """
    table = correlations.rename_axis("level0").reset_index(name="rsq")
    table.insert(1, "level1", 0)
    return table


if reference_var in num_cols:
    # Correlation of the reference variable with every OTHER numeric column.
    # The reference is removed by exact label: the previous substring test
    # (`reference_var not in each`) also discarded unrelated columns whose
    # name merely contains the reference name. NaN correlations are dropped
    # so they do not occupy the "most negative" slots at the tail.
    # NOTE: `corr` is the matrix computed by the correlation heatmap cell.
    reference_corr = (corr.loc[reference_var]
                      .drop(labels=reference_var)
                      .dropna()
                      .sort_values(ascending=False))

    corrdf2 = _correlation_table(reference_corr.head())  # strongest positive
    corrdf3 = _correlation_table(reference_corr.tail())  # strongest negative

    if reference_corr.empty:
        st.info("No other numeric columns to correlate with")
    else:
        fig13 = px.histogram(corrdf2,x="level0",y="rsq")
        st.plotly_chart(fig13, use_container_width=True)

        fig15 = px.histogram(corrdf3,x="level0",y="rsq")
        st.plotly_chart(fig15, use_container_width=True)

    # How much of the numeric reference each categorical column explains.
    etasquared_dict = {each: _eta_squared(df, reference_var, each)
                       for each in cat_cols}
    topk_esq = _top_scores(etasquared_dict, "EtaSquared")
    if topk_esq.empty:
        st.info("No categorical columns to compare against")
    else:
        fig14 = px.histogram(topk_esq,x="level1",y="EtaSquared")
        st.plotly_chart(fig14, use_container_width=True)
else:
    # Categorical reference: how much of each numeric column it explains.
    etasquared_dict = {each: _eta_squared(df, each, reference_var)
                       for each in num_cols}
    topk_esq = _top_scores(etasquared_dict, "EtaSquared")
    if topk_esq.empty:
        st.info("No numeric columns to compare against")
    else:
        fig13 = px.histogram(topk_esq,x="level1",y="EtaSquared")
        st.plotly_chart(fig13, use_container_width=True)

    # Association with every other categorical column.
    cramer_dict = {each: _cramers_v(df, reference_var, each)
                   for each in cat_cols if each != reference_var}
    topk_cramer = _top_scores(cramer_dict, "CramersV")
    if topk_cramer.empty:
        st.info("No other categorical columns to compare against")
    else:
        fig14 = px.histogram(topk_cramer,x="level1",y="CramersV")
        st.plotly_chart(fig14, use_container_width=True)

Visualize Numerical Data by Projecting to Principal Component Spaces

Project data to 2D PC space

In [None]:
comp_num = st.slider("Insert the desired explained variance from the components",
                          value=0.95, min_value=0.1, max_value=0.99)

# Fit on complete rows only: a NaN in the numeric columns makes the
# scaler/PCA pipeline raise. The original row index is kept so the projected
# points can be matched back to their rows in `df`.
num_df = df[num_cols].dropna()
dropped_rows = len(df) - len(num_df)
if dropped_rows > 0:
    st.info("{} row(s) with missing numeric values were excluded from the PCA".format(dropped_rows))

# Standardise so every column contributes on the same scale.
X = StandardScaler().fit_transform(num_df.values)
# A float n_components asks PCA to keep as many components as are needed to
# reach that fraction of explained variance.
pca = PCA(n_components=comp_num)
# Fit once and keep the projection; the model used to be fitted a second
# time (fit() followed later by fit_transform()) on the same data.
projected = pca.fit_transform(X)

st.write("The number of components is", pca.n_components_)

# Explained variance ratio per retained component.
variance = pd.DataFrame([range(1,pca.n_components_+1),pca.explained_variance_ratio_]).T
fig10 = px.histogram(variance, x=0, y=1, nbins=int(pca.n_components_),
                    labels={
                     "0": "Principal Component",
                     "1": "Explained Variance Ratio"
                 })
fig10.update_layout(bargap=0.2)
st.plotly_chart(fig10, use_container_width=True)

legend_pca1 = st.selectbox(
    "Select legend variable",
    cat_cols)

# Projected coordinates, one "PC<i>" column per component, indexed like the
# rows they came from. Also read by the 3D projection cell.
Y_pca_labels = ['PC'+str(i) for i in range(1,pca.n_components_+1)]
Y_pca = pd.DataFrame(projected, columns=Y_pca_labels, index=num_df.index)

if pca.n_components_>1:
    pca_xaxis = st.selectbox(
        "Select PC at X axis",
        range(1,pca.n_components_+1))
    pca_yaxis = st.selectbox(
        "Select PC at Y axis",
        range(1,pca.n_components_+1),index=1)

    # Selectors are 1-based, column positions are 0-based.
    x_pca_index = pca_xaxis - 1
    y_pca_index = pca_yaxis - 1

    if len(cat_cols) > 0:
        # Attach the legend column for exactly the rows that were projected.
        scatter_2d_data = pd.concat([Y_pca, df.loc[num_df.index, legend_pca1]], axis=1)

        fig11 = px.scatter(scatter_2d_data,x=scatter_2d_data.columns[x_pca_index],
                           y=scatter_2d_data.columns[y_pca_index],
                           color=legend_pca1)
    else:
        fig11 = px.scatter(Y_pca,x=Y_pca.columns[x_pca_index],
                           y=Y_pca.columns[y_pca_index])
    st.plotly_chart(fig11, use_container_width=True)
else:
    st.subheader("Not enough components to display 2D graph")

Project Data to 3D PC Space

In [None]:
# Column used to colour the points (None when there are no categorical columns).
legend_pca3d = st.selectbox(
    "Select legend variable for 3d visualization",
    cat_cols)

# A 3D scatter needs at least three retained components; `pca` and `Y_pca`
# come from the 2D projection cell.
if not pca.n_components_>2:
    st.subheader("Not enough components to display 3D graph")
else:
    pca3d_xaxis = st.selectbox(
        "Select PC at X axis for 3d",
        range(1,pca.n_components_+1))
    pca3d_yaxis = st.selectbox(
        "Select PC at Y axis for 3d",
        range(1,pca.n_components_+1),index=1)
    pca3d_zaxis = st.selectbox(
        "Select PC at Z axis for 3d",
        range(1,pca.n_components_+1),index=2)

    # Selectors are 1-based, column positions are 0-based.
    x_pca3d_index, y_pca3d_index, z_pca3d_index = (
        pca3d_xaxis - 1,
        pca3d_yaxis - 1,
        pca3d_zaxis - 1,
    )

    # Choose the frame to plot and, when a legend column exists, colour by it.
    if len(cat_cols) > 0:
        scatter_3d_data = pd.concat([Y_pca, df[legend_pca3d]], axis=1)
        plot_frame = scatter_3d_data
        colour_args = {"color": legend_pca3d}
    else:
        plot_frame = Y_pca
        colour_args = {}

    fig12 = px.scatter_3d(
        plot_frame,
        x=plot_frame.columns[x_pca3d_index],
        y=plot_frame.columns[y_pca3d_index],
        z=plot_frame.columns[z_pca3d_index],
        **colour_args
    )
    st.plotly_chart(fig12, use_container_width=True)