In [None]:
# <-- Import libraries, custom functions, and load configuration & datasets <-- #

import yaml
import datetime as dt
import re
import pandas as pd
import scipy.stats as st
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels.api as sm
import seaborn as sns

from statsmodels.multivariate.manova import MANOVA
from scipy import stats
from scipy.stats import pearsonr, boxcox, chi2_contingency
from scipy.stats.contingency import association

# <-- Imports custom preprocessing functions from 'functions.py' <-- #

# from functions import ()

# <-- Loads YAML configuration to dynamically reference CSV output files. <-- #

# Load the YAML configuration; its 'input_data' section holds the CSV paths.
# Every read_csv call below needs it, so a missing or unusable file raises
# immediately instead of printing a message and then failing later with an
# unrelated "'NoneType' object is not subscriptable" error. YAML parse errors
# are allowed to propagate as-is rather than being reported as "not found".
CONFIG_PATH = "../config.yaml"
try:
    with open(CONFIG_PATH, "r") as config_file:
        config = yaml.safe_load(config_file)
except FileNotFoundError as exc:
    raise FileNotFoundError(f"Yaml configuration file not found: {CONFIG_PATH}") from exc

# safe_load returns None for an empty file, so validate before indexing.
if not isinstance(config, dict) or 'input_data' not in config:
    raise KeyError(f"'input_data' section missing from {CONFIG_PATH}")


df_demo = pd.read_csv(config['input_data']['file1'])
df_exp_clients = pd.read_csv(config['input_data']['file2'])
df_web_data_pt_1 = pd.read_csv(config['input_data']['file3'])
# NOTE(review): part 2 is read from the same 'file3' entry as part 1, so both
# frames are identical and the later concat + drop_duplicates collapses back to
# part 1. Confirm whether a separate config key was intended for part 2.
df_web_data_pt_2 = pd.read_csv(config['input_data']['file3'])

: 

In [None]:
# Display df_demo (read from config['input_data']['file1']) for a first look.
df_demo

In [None]:
# Display df_exp_clients (read from config['input_data']['file2']) for a first look.
df_exp_clients

In [None]:
# Per-column completeness of df_exp_clients: how many values are present,
# how many are missing, and the overall row count (same value on every row).
display(pd.DataFrame({
    "non_nulls": df_exp_clients.notna().sum(),
    "nulls": df_exp_clients.isna().sum(),
    "total": df_exp_clients.shape[0],
}))

In [None]:
# Remove every row that has at least one missing value. The frame is modified
# in place, so re-running the cells above this one will show the already
# reduced data unless the CSV is reloaded first.
df_exp_clients.dropna(inplace=True)

In [None]:
# Sanity check after dropping the rows of clients not included in the
# experiment: show the remaining shape and the per-column null counts.
df_exp_clients.shape, df_exp_clients.isnull().sum()

In [None]:
# Look up client_id 4666211, noted as the only client with no age.
# NOTE(review): the lookup runs on df_exp_clients although the note is about
# age — confirm this is the intended table.
df_exp_clients[df_exp_clients['client_id'] == 4666211]

In [None]:
# Display the first web-data part (read from config['input_data']['file3']).
df_web_data_pt_1

In [None]:
# Display the second web-data part. It is read from the same
# config['input_data']['file3'] entry as part 1 in the loading cell.
df_web_data_pt_2

In [None]:
# Stack both web-data parts into a single frame, discard exact duplicate rows,
# and renumber the index so it is contiguous again.
df_web_data = (
    pd.concat([df_web_data_pt_1, df_web_data_pt_2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

# Report the de-duplicated size next to the sizes of the two inputs.
print(f"Combined shape: {df_web_data.shape}")
print(f"Original shapes: {df_web_data_pt_1.shape} + {df_web_data_pt_2.shape}")
df_web_data

In [None]:
# List the column labels of df_demo.
df_demo.columns

In [None]:
# List the column labels of df_exp_clients.
df_exp_clients.columns

In [None]:
# Number of distinct values in each numeric column of df_demo, largest first.
(
    df_demo
    .select_dtypes(include="number")
    .nunique()
    .sort_values(ascending=False)
)

In [None]:
# Per-column completeness of df_demo: how many values are present, how many
# are missing, and the overall row count (same value on every row).
display(pd.DataFrame({
    "non_nulls": df_demo.notna().sum(),
    "nulls": df_demo.isna().sum(),
    "total": df_demo.shape[0],
}))

In [None]:
# Drop the rows that are missing too many values: thresh=7 keeps only rows
# with at least 7 non-null entries. The frame is modified in place.
df_demo.dropna(thresh=7, inplace=True)

In [None]:
# Show the current shape of df_demo together with its per-column null counts.
df_demo.shape, df_demo.isnull().sum()

In [None]:
# Non-null count per column after keeping one row per client_id
# (drop_duplicates keeps the first occurrence by default). The result is only
# displayed; df_demo itself is not changed.
df_demo.drop_duplicates(subset='client_id').count()

In [None]:
# Per-column completeness of df_exp_clients: how many values are present,
# how many are missing, and the overall row count (same value on every row).
display(pd.DataFrame({
    "non_nulls": df_exp_clients.notna().sum(),
    "nulls": df_exp_clients.isna().sum(),
    "total": df_exp_clients.shape[0],
}))

In [None]:
# Calculate the mean, median, mode, variance, and standard deviation of
# 'clnt_age' and show them as a one-row table. The variance column was
# announced here but missing from the table, so it is now included.
data = {
    "mean": [df_demo['clnt_age'].mean()],
    "median": [df_demo['clnt_age'].median()],
    # mode() returns a Series because ties are possible; keep the first value.
    "mode": [df_demo['clnt_age'].mode()[0]],
    "variance": [df_demo['clnt_age'].var()],
    "sd": [df_demo['clnt_age'].std()],
}
display(pd.DataFrame(data))

In [None]:
# Bin 'clnt_age' into four equal-frequency groups (quartiles) labelled A1-A4,
# then show how many clients fall into each bin.
df_demo['clnt_age_quantile'] = pd.qcut(
    df_demo['clnt_age'],
    q=4,
    labels=['A1', 'A2', 'A3', 'A4'],
)
df_demo['clnt_age_quantile'].value_counts()

In [None]:
# Select the rows of df_demo in which at least one column is missing.
df_demo.loc[df_demo.isna().any(axis=1)]

In [None]:
#df_demo.groupby('tenure_quantile')['clnt_age'].agg('median')

# sns.boxplot(x=new_df['clnt_age'], color="lightblue")
# plt.show()

In [None]:
# Summary statistics of df_demo. describe has to be *called*: without the
# parentheses the cell only shows the repr of the bound method instead of the
# statistics table.
df_demo.describe()

In [None]:
# Mean, median and standard deviation of age inside each age quartile.
# The grouping key is categorical (it was created with pd.qcut), so
# observed=False is passed explicitly: it keeps the existing behaviour of
# listing every category and avoids the FutureWarning that recent pandas
# versions emit when the argument is left out.
df_demo.groupby("clnt_age_quantile", observed=False)['clnt_age'].agg(['mean','median','std'])

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of 'clnt_age'.
df_demo['clnt_age'].describe()

In [None]:
# Measures of dispersion for 'clnt_age'.
desc = df_demo['clnt_age'].describe()

# Spread around the mean, rounded to two decimals for display.
variance = round(df_demo['clnt_age'].var(), 2)
std_dev = round(df_demo['clnt_age'].std(), 2)

# Extremes and interquartile range are taken from the describe() summary.
# The name `price_range` is kept as-is; it holds the (min, max) of age.
price_range = (desc.loc['min'], desc.loc['max'])
iqr = desc.loc['75%'] - desc.loc['25%']

print(f'Age Variance: {variance}')
print(f"Age Std Deviation: {std_dev}")
print(f'Age Range: {price_range}')
print(f"Age Interquartile Range: {iqr}")

In [None]:
# Shape of the age distribution. Both statistics are typically small
# fractional numbers, so they are rounded to two decimals (matching the
# variance / standard-deviation cell); round() without a digit count would
# collapse them to whole integers and hide the actual values.
# Note: pandas' kurtosis() reports excess kurtosis (a normal distribution is 0).
skewness_age = round(df_demo['clnt_age'].skew(), 2)
kurtosis_age = round(df_demo['clnt_age'].kurtosis(), 2)

skewness_age, kurtosis_age

In [None]:
# Age distribution: box plot on the left, 10-bin histogram with a KDE curve
# on the right.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
sns.boxplot(y='clnt_age', data=df_demo, ax=axes[0])
sns.histplot(df_demo['clnt_age'], bins=10, kde=True, ax=axes[1])
plt.show()

In [None]:
# Distinct values that occur in the 'gendr' column.
df_demo['gendr'].unique()

In [None]:
# Frequency of each 'gendr' value; dropna=False also counts missing entries.
df_demo['gendr'].value_counts(dropna=False)

In [None]:
# Keep 'M' and 'F' as they are and map every other value (including missing
# ones) to 'U', then plot how many clients fall into each cleaned category.
df_demo['gendr_clean'] = df_demo['gendr'].where(df_demo['gendr'].isin(['M', 'F']), 'U')
sns.countplot(
    data=df_demo,
    y='gendr_clean',
    hue='gendr_clean',
    palette="Set3",
    legend=False,
)

In [None]:
# Tenure distribution ('clnt_tenure_yr'): box plot on the left, 10-bin
# histogram with a KDE curve on the right.
fig, axes = plt.subplots(1,2,figsize=(12,6))
sns.boxplot(x = df_demo['clnt_tenure_yr'], color="lightblue", ax=axes[0])
sns.histplot(df_demo['clnt_tenure_yr'], bins=10, kde=True, ax=axes[1]);
plt.show()

In [None]:
# Cross-tabulate age quartiles against tenure quartiles
# (are younger clients newer? are older clients more tenured?).
# Tenure is first binned into four equal-frequency groups labelled T1-T4.
df_demo['tenure_quantile'] = pd.qcut(
    df_demo['clnt_tenure_yr'],
    q=4,
    labels=['T1', 'T2', 'T3', 'T4'],
)
crosstab_result = pd.crosstab(
    index=df_demo['clnt_age_quantile'],
    columns=df_demo['tenure_quantile'],
)
crosstab_result

In [None]:
# Correlation matrix of age and tenure (corr() uses Pearson by default).
df_demo[['clnt_age', 'clnt_tenure_yr']].corr()

In [None]:
# Calculate the Pearson correlation between 'clnt_age' and 'clnt_tenure_yr'
# (Series.corr defaults to method='pearson').
# The variable name is kept as spelled; it holds the Pearson coefficient.
person_correlation = df_demo['clnt_age'].corr(df_demo['clnt_tenure_yr'])
person_correlation

In [None]:
# Calculate the Spearman rank correlation between 'clnt_age' and
# 'clnt_tenure_yr'.
spearman_correlation = df_demo['clnt_age'].corr(df_demo['clnt_tenure_yr'], method='spearman')
spearman_correlation

In [None]:
# Heatmap of the age-quartile x tenure-quartile counts held in crosstab_result;
# annot=True writes each count into its cell and fmt="d" formats it as an integer.
sns.heatmap(crosstab_result, annot=True, cmap="YlGnBu", fmt="d");

In [None]:
# Pairwise plots for age and tenure (seaborn pairplot of the two columns).
sns.pairplot(df_demo[["clnt_age", "clnt_tenure_yr"]]);