# Presets

Any work based wholly or in part on these data must acknowledge the original data
creators, depositors or copyright holders, the funders of the Data Collections (if
different) and the UK Data Service/UK Data Archive, and must acknowledge Crown Copyright
where appropriate. The original data creators, depositors or copyright holders, the
funders of the Data Collections (if different) and the UK Data Service/UK Data Archive
bear no responsibility for their further analysis or interpretation.

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import matplotlib.pyplot as plt

In [None]:
# Load the dataset from a CSV file; the path is relative to the working directory.
data = pd.read_csv('data/data_p2.csv')

# Basic stats

In [None]:
# Preview the first five rows of the dataset.
data.head(n=5)

In [None]:
# Report the size of the dataset and how many rows there are per column.
print('Nrows:', len(data))
print('Ncols:', len(data.columns))
print('rows/cols ratio:', len(data)/len(data.columns))

In [None]:
# Print the dtype of every column; to_string() avoids the truncated default repr.
print(data.dtypes.to_string())

In [None]:
# Count the columns of each dtype by summing the boolean dtype mask.
print('Number of integer columns:', int((data.dtypes == 'int64').sum()))
print('Number of float columns:', int((data.dtypes == 'float64').sum()))
print('Number of string columns:', int((data.dtypes == 'object').sum()))

In [None]:
# Number of missing values in every column, printed without truncation.
print(data.isnull().sum().to_string())

- TODO: decide how missing values (NA) in date columns should be replaced.

In [None]:
# Summary statistics produced by pandas' describe() with its default settings.
data.describe()

In [None]:
# Distinct values found in each object-dtype column.
data.select_dtypes(include=['object']).apply(pd.Series.unique)

In [None]:
# Distinct values found in each int64 column.
data.select_dtypes(include=['int64']).apply(pd.Series.unique)

# EDA

In [None]:
def show_basic_plots(df, vars_subset=None):
    """Build a set of exploratory Plotly figures for the columns of ``df``.

    The selected columns are split by dtype into numerical columns
    (int16/32/64, float16/32/64) and categorical columns (object, category).
    Columns of any other dtype are ignored. No figure is displayed; the
    figures are only created and returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to plot.
    vars_subset : list, optional
        Column labels to include. Defaults to every column of ``df``.

    Returns
    -------
    dict
        Maps a plot family to a list of figures:

        - ``'dist_cat'``: one histogram per categorical column.
        - ``'dist_mlt_cat'``: one histogram per ordered pair of distinct
          categorical columns (x axis = first, colour = second).
        - ``'dist_num'``: one density curve per numerical column
          (missing values dropped).
        - ``'violin_num'``: one violin plot per numerical column.
        - ``'scatter_num'``: one scatter plot with an OLS trendline per
          ordered pair of distinct numerical columns.
        - ``'dist_mlt_num'``: one 2D histogram contour per ordered pair of
          distinct numerical columns.
        - ``'violin_mix'``: one violin plot per (categorical, numerical)
          pair, coloured by the categorical column.
    """
    # Presets:
    if vars_subset is None:
        vars_subset = df.columns.tolist()

    color_background = '#F5F5F5'
    color_gridlines = '#DCDCDC'
    colors_in_use = ['#2C3E50', '#537EA2', '#858F84', '#42A593',
                     '#873E23', '#CFD1A1', '#6A744F', '#BDBDC5',
                     '#7EA253', '#EDB676', '#C26D40']+px.colors.qualitative.Safe

    dtypes_num = ['int64', 'int32', 'int16', 'float64', 'float32', 'float16']
    dtypes_str = ['object', 'category']
    # Select the requested columns once and classify them by dtype.
    df_subset = df.loc[:, vars_subset]
    vars_num = df_subset.select_dtypes(include=dtypes_num).columns
    vars_str = df_subset.select_dtypes(include=dtypes_str).columns

    # For categorical variables:
    fig_str_lst = []
    for var in vars_str:
        fig = go.Figure()
        fig.add_trace(go.Histogram(x=df.loc[:, var],
                                   name=var,
                                   showlegend=True))
        fig.update_traces(marker_color=colors_in_use[0],
                          marker_line_color='rgb(8,48,107)',
                          marker_line_width=1.5,
                          opacity=0.8)
        fig.update_layout(xaxis_type='category',
                          xaxis_title=var,
                          paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines)
        fig.update_xaxes(linecolor=color_gridlines)
        fig_str_lst.append(fig)

    # For 2 categorical variables (every ordered pair of distinct columns):
    fig_str_lst_mix = []
    for i, x_var in enumerate(vars_str):
        for j, color_var in enumerate(vars_str):
            if i == j:
                continue
            fig = px.histogram(df, x=x_var, color=color_var,
                               color_discrete_sequence=colors_in_use[1:])
            fig.update_traces(marker_line_color='rgb(8,48,107)',
                              marker_line_width=1.5,
                              opacity=0.8)
            fig.update_layout(xaxis_type='category',
                              xaxis_title=x_var,
                              paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines, title='')
            fig.update_xaxes(linecolor=color_gridlines)
            fig_str_lst_mix.append(fig)

    # For numerical variables:
    fig_num_lst_dist = []
    for var in vars_num:
        fig = ff.create_distplot(hist_data=[df[var].dropna()],
                                 group_labels=[var],
                                 show_hist=False,
                                 show_rug=False,
                                 colors=colors_in_use)
        fig.update_layout(paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines,
                         zerolinecolor=color_gridlines,
                         title='')
        fig.update_xaxes(gridcolor=color_gridlines,
                         title=var)
        fig_num_lst_dist.append(fig)

    fig_num_lst_violin = []
    for var in vars_num:
        fig = px.violin(df, y=var, box=True, points='outliers')
        fig.update_traces(marker_color=colors_in_use[0],
                          opacity=0.8,
                          name=var,
                          showlegend=True)
        fig.update_layout(paper_bgcolor=color_background,
                          plot_bgcolor=color_background)
        fig.update_yaxes(gridcolor=color_gridlines,
                         zerolinecolor=color_gridlines,
                         title='')
        fig.update_xaxes(title=var)
        fig_num_lst_violin.append(fig)

    # For 2 numerical variables (every ordered pair of distinct columns):
    fig_num_lst_mix = []
    fig_num_lst_scat = []
    for i, x_var in enumerate(vars_num):
        for j, y_var in enumerate(vars_num):
            if i == j:
                continue
            fig = go.Figure()
            fig.add_trace(go.Histogram2dContour(x=df[x_var],
                                                y=df[y_var],
                                                colorscale='deep'))
            fig.update_layout(paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(title=y_var)
            fig.update_xaxes(title=x_var)
            fig_num_lst_mix.append(fig)

            # Plot from the `df` argument, not from a notebook-level global,
            # so the function works for any frame passed in.
            fig = px.scatter(df, x=x_var, y=y_var, trendline='ols')
            fig.update_layout(paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_traces(marker_color=colors_in_use[0],
                              opacity=0.8)
            fig.update_yaxes(gridcolor=color_gridlines,
                             zerolinecolor=color_gridlines,
                             title=y_var)
            fig.update_xaxes(gridcolor=color_gridlines,
                             zerolinecolor=color_gridlines,
                             title=x_var)
            fig_num_lst_scat.append(fig)

    # 1 categorical, 1 numeric:
    fig_all_violin_mix = []
    for cat_var in vars_str:
        for num_var in vars_num:
            fig = px.violin(df, y=num_var, color=cat_var,
                            color_discrete_sequence=colors_in_use,
                            box=True, points='outliers')
            fig.update_traces(opacity=0.8)
            fig.update_layout(xaxis_title=num_var,
                              showlegend=True,
                              paper_bgcolor=color_background,
                              plot_bgcolor=color_background)
            fig.update_yaxes(gridcolor=color_gridlines, title='')
            fig.update_xaxes(linecolor=color_gridlines)
            fig_all_violin_mix.append(fig)

    final_dict = {
        # categorical
        'dist_cat': fig_str_lst,
        'dist_mlt_cat': fig_str_lst_mix,
        # numeric
        'dist_num': fig_num_lst_dist,
        'violin_num': fig_num_lst_violin,
        'scatter_num': fig_num_lst_scat,
        'dist_mlt_num': fig_num_lst_mix,
        # all
        'violin_mix': fig_all_violin_mix,
    }
    return final_dict

In [None]:
def corr_heatmap(df):
    """Return a Plotly heatmap of the pairwise correlations of ``df``.

    Correlations are computed with ``DataFrame.corr`` using its default
    method, rounded to three decimals and written into the cells.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. Non-numeric columns are left out of the correlation.

    Returns
    -------
    plotly.graph_objects.Figure
        The heatmap figure; it is not displayed here.
    """
    color_background = '#F5F5F5'
    color_gridlines = '#DCDCDC'

    # numeric_only=True keeps text columns from raising in pandas >= 2.0,
    # where corr() no longer drops non-numeric columns silently.
    corr_matrix = df.corr(numeric_only=True).round(3)

    fig = px.imshow(corr_matrix, text_auto=True, color_continuous_scale='deep')
    fig.update_traces(opacity=0.8)
    fig.update_layout(
        coloraxis_showscale=False,
        paper_bgcolor=color_background,
        plot_bgcolor=color_background)
    fig.update_yaxes(gridcolor=color_gridlines, title='')
    fig.update_xaxes(linecolor=color_gridlines)
    return fig

def show_violin(df):
    """Display every figure in the 'violin_mix' family built for ``df``."""
    for figure in show_basic_plots(df)['violin_mix']:
        figure.show()
        
def show_dist_cat(df):
    """Display every figure in the 'dist_cat' family built for ``df``."""
    for figure in show_basic_plots(df)['dist_cat']:
        figure.show()
        
def show_dummy_corr(df):
    """Dummy-encode selected columns of ``df`` and display the correlation heatmap.

    The columns to encode come from the module-level name
    ``tmp_cols_to_dummy``, which must be defined before this is called; the
    same list is used as the prefix of the generated dummy columns.
    """
    encoded = pd.get_dummies(df,
                             columns=tmp_cols_to_dummy,
                             prefix=tmp_cols_to_dummy)
    heatmap = corr_heatmap(encoded)
    heatmap.show()

In [None]:
def show_outliers(X, n_std=3):
    """Summarise the upper-tail outliers of every column of ``X``.

    A value is an outlier when it is strictly greater than
    ``mean + n_std * std`` of its column. Only the upper tail is checked.
    Missing values are never counted as outliers.

    Parameters
    ----------
    X : pandas.DataFrame
        Data to inspect; each column must support ``mean`` and ``std``.
    n_std : int or float, default 3
        Number of standard deviations above the mean used as the threshold.

    Returns
    -------
    pandas.DataFrame
        One column per column of ``X`` and three rows:
        ``'Count'`` (number of outliers), ``'Percentage'`` (outliers as a
        fraction of all rows, rounded to 3 decimals; NaN when ``X`` has no
        rows) and ``'Min'`` (smallest outlier; NaN when there is none).
    """
    n_rows = X.shape[0]
    outliers_dict = {}

    for col in X.columns.to_list():
        values = X[col]
        threshold = values.mean() + n_std * values.std()
        # Count the outliers directly. Counting "rows not <= threshold"
        # would also count NaN rows, and every row when the threshold is NaN.
        outlier_values = values[values > threshold]
        n_outliers = outlier_values.shape[0]
        share = round(n_outliers / n_rows, 3) if n_rows else float('nan')
        outliers_dict[col] = [n_outliers, share, outlier_values.min()]

    return pd.DataFrame(outliers_dict, index=['Count', 'Percentage', 'Min'])

In [None]:
# Build every exploratory figure once; the cells below only display them.
# Keys of the returned dict:
#   'dist_cat', 'dist_mlt_cat',
#   'dist_num', 'violin_num', 'scatter_num', 'dist_mlt_num',
#   'violin_mix'
tmp = show_basic_plots(data, vars_subset=None)

## Numerical

In [None]:
# Density curve of each numerical column.
for fig in tmp['dist_num']:
    fig.show()

In [None]:
# Violin plot of each numerical column.
for fig in tmp['violin_num']:
    fig.show()

In [None]:
# Scatter plot for each ordered pair of numerical columns.
for fig in tmp['scatter_num']:
    fig.show()

In [None]:
# Matplotlib histogram grid (6 rows x 7 columns, 40 bins per histogram).
data.hist(bins=40, figsize=(20, 15), layout=(6, 7))
plt.show()

In [None]:
# Pairwise plots of the dataset with KDE curves on the diagonal.
# NOTE(review): `sns` is not imported in the imports cell of this notebook —
# presumably `import seaborn as sns` is missing; confirm and add it.
sns.pairplot(data, diag_kind="kde")

## Categorical

In [None]:
# Histogram of each categorical column.
for fig in tmp['dist_cat']:
    fig.show()

In [None]:
# Histogram for each ordered pair of categorical columns (x axis / colour).
for fig in tmp['dist_mlt_cat']:
    fig.show()

## Mix

In [None]:
# Violin plot of each numerical column, coloured by each categorical column.
for fig in tmp['violin_mix']:
    fig.show()

## Conclusion

## Correlation analysis

In [None]:
# Correlation heatmap of the feature matrix.
# NOTE(review): `X` is not defined anywhere in the visible part of this
# notebook — confirm where it is created (or whether `data` was intended).
corr_heatmap(X)