# Correlation Coefficients
<hr style="border:2px solid black">

**load packages**

In [None]:
# data analysis stack
import numpy as np
import pandas as pd

# data visualization stack
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')

# statistics stack
from scipy import stats

# machine learning stack
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import silhouette_score

# miscellaneous
import warnings
warnings.simplefilter('ignore')

**load data**

In [None]:
# Load the penguins sample dataset, drop every row that has a missing
# value and renumber the remaining rows from zero.
df = sns.load_dataset('penguins').dropna(ignore_index=True)

In [None]:
# display the leading rows as a quick sanity check of the cleaned dataframe
df.head()

**feature variables**

In [None]:
# columns treated as numerical features in the analyses below
numerical_features = ['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']

# columns treated as categorical features in the analyses below
categorical_features = ['species', 'island', 'sex']

## correlation among numerical features

In [None]:
# Annotated heatmap of the pairwise correlation matrix of the numerical
# columns; the colour scale is pinned to [-1, 1] so that zero correlation
# sits at the centre of the diverging colormap.
plt.figure(dpi=100, figsize=(4, 4))
sns.heatmap(
    data=df[numerical_features].corr(),
    vmin=-1,
    vmax=1,
    annot=True,
    cmap='coolwarm',
    linewidth=1,
    linecolor='white',
);

**scatter plot example**

In [None]:
# flipper length against body mass, one point per row of df
sns.scatterplot(data=df, x='flipper_length_mm', y='body_mass_g');

## correlation among categorical features

In [None]:
def cat_cat_correlation(df_,cat_col1,cat_col2):
    """
    Return the bias-corrected Cramér's V between two categorical columns.

    The statistic is Cramér's V with the bias correction proposed by
    Bergsma (2013), computed from the contingency table of the two columns
    and rounded to three decimals.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Dataframe holding both columns.
    cat_col1, cat_col2 : str
        Labels of the two categorical columns to compare.

    Returns
    -------
    float
        Association strength in [0, 1]. ``0.0`` is returned for degenerate
        inputs (fewer than two observations, or a column with fewer than two
        distinct levels), for which the statistic is undefined.
    """
    crosstab = pd.crosstab(df_[cat_col1], df_[cat_col2])
    r, k = crosstab.shape
    n = int(crosstab.to_numpy().sum())

    # A constant column (or an empty / single-row table) cannot be associated
    # with anything; bail out before dividing by zero further down.
    if min(r, k) < 2 or n < 2:
        return 0.0

    # Cramér's V is defined on the plain Pearson chi-squared statistic.
    # SciPy applies Yates' continuity correction to 2x2 tables by default,
    # which would make a binary column's V with itself fall below 1.
    chi_sqr = stats.chi2_contingency(crosstab, correction=False)[0]

    # Bias-corrected phi^2 and bias-corrected table dimensions.
    phi_sqr_corr = max(0.0, chi_sqr/n - ((k-1)*(r-1))/(n-1))
    r_corr = r - ((r-1)**2)/(n-1)
    k_corr = k - ((k-1)**2)/(n-1)

    # The corrected dimension collapses to 1 when a column has as many levels
    # as there are observations; treat that as "no measurable association".
    denominator = min(k_corr-1, r_corr-1)
    if denominator <= 0:
        return 0.0

    return round(float(np.sqrt(phi_sqr_corr / denominator)), 3)

In [None]:
# For every categorical feature, the list of its association scores with
# each categorical feature (itself included), keyed by feature name.
cramer_v_corr = {
    f1: [cat_cat_correlation(df, f1, f2) for f2 in categorical_features]
    for f1 in categorical_features
}

# Annotated heatmap of the score matrix; scores are non-negative, so the
# colour scale runs from 0 to 1.
plt.figure(dpi=100, figsize=(4, 4))
sns.heatmap(
    data=pd.DataFrame(data=cramer_v_corr, index=categorical_features),
    vmin=0,
    vmax=1,
    annot=True,
    cmap='magma',
    linewidth=1,
    linecolor='white',
);

**bar plot example**

In [None]:
# number of rows per island, split into one bar per species
sns.countplot(
    data=df,
    x='island',
    hue='species',
    fill=True,
);

## correlation between categorical and numerical features

In [None]:
def cat_num_correlation(df_,cat_col,num_col):
    """
    Return a correlation-like score between a categorical and a numerical column.

    The score is the silhouette score of the numerical data points, with the
    values of the categorical column serving as cluster labels. Negative
    silhouette scores are clamped to zero and the result is rounded to three
    decimals.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Dataframe holding both columns.
    cat_col : str
        Label of the categorical column (used as cluster labels).
    num_col : str
        Label of the numerical column (used as one-dimensional data points).

    Returns
    -------
    float
        Clamped, rounded silhouette score.
    """
    # silhouette_score expects a 2-D array of shape (n_samples, n_features)
    data_points = df_[num_col].values.reshape(-1,1)
    labels = LabelEncoder().fit_transform(df_[cat_col])
    score = float(silhouette_score(data_points,labels))

    # Clamp negatives with a comparison instead of multiplying by a step
    # function: `negative * 0` evaluates to -0.0, which is then rendered as
    # "-0" in annotated heatmaps.
    clamped = 0.0 if score <= 0 else score
    return round(clamped,3)

In [None]:
# For every numerical feature, the list of its scores against each
# categorical feature, keyed by the numerical feature name.
cat_num_corr_values = {
    f2: [cat_num_correlation(df, f1, f2) for f1 in categorical_features]
    for f2 in numerical_features
}

# Annotated heatmap: rows are categorical features, columns are numerical
# features; the colour scale runs from 0 to 1.
plt.figure(dpi=100, figsize=(5, 4))
sns.heatmap(
    data=pd.DataFrame(data=cat_num_corr_values, index=categorical_features),
    vmin=0,
    vmax=1,
    annot=True,
    cmap='viridis',
    linewidth=1,
    linecolor='white',
);

**kde plot example**

In [None]:
# filled density estimate of bill length, one curve per species
sns.kdeplot(
    data=df,
    x='bill_length_mm',
    hue='species',
    fill=True,
);

**ANOVA p-value test**

In [None]:
def anova_pvalue(df_,cat_col,num_col):
    """
    Return the one-way ANOVA p-value of a numerical column grouped by a
    categorical column.

    The p-value is the probability of observing an F statistic at least as
    large as the one measured if all groups shared the same mean. It is NOT
    the probability that the two columns are unrelated. A small value is
    evidence that the group means differ.

    Missing values in the numerical column are ignored, and groups without
    any remaining observation are skipped, so that they do not turn the
    result into NaN.

    Parameters
    ----------
    df_ : pandas.DataFrame
        Dataframe holding both columns.
    cat_col : str
        Label of the categorical column that defines the groups.
    num_col : str
        Label of the numerical column whose group means are compared.

    Returns
    -------
    float
        The p-value rounded to three decimals (very small p-values therefore
        come back as 0.0).
    """
    # observed=True keeps unobserved levels of a categorical dtype from
    # producing empty groups.
    samples = [
        values.dropna().to_numpy()
        for _, values in df_.groupby(cat_col, observed=True)[num_col]
    ]
    samples = [sample for sample in samples if len(sample) > 0]

    anova_results = stats.f_oneway(*samples)
    return round(float(anova_results[1]), 3)

In [None]:
# example: ANOVA p-value of body mass grouped by sex
anova_pvalue(df,'sex','body_mass_g')