In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [None]:
# Raw experiment inputs, read in this order: client assignments, demographics,
# and the two parts of the web-interaction log.
df_clients, df_demo, df_web1, df_web2 = (
    pd.read_csv(raw_path)
    for raw_path in (
        "../Data/Raw/df_final_experiment_clients.txt",
        "../Data/Raw/df_final_demo.txt",
        "../Data/Raw/df_final_web_data_pt_1.txt",
        "../Data/Raw/df_final_web_data_pt_2.txt",
    )
)

# Demographic table stored under Data/Clean; this is the frame the cells below analyse.
df_demo_control = pd.read_csv("../Data/Clean/demo_control.csv")

## **Data Clean - Demo Control** ##

In [None]:
# Number of distinct values per column (highest first) and the frame's (rows, columns) shape.
# A notebook cell only renders its final expression, so both results are returned
# together as one tuple to make sure the cardinality table is actually displayed.
df_demo_control.nunique().sort_values(ascending=False), df_demo_control.shape

- Numerical: possibly 'balance', although it could also be treated as categorical
- Categorical: almost all of the columns should be treated as categorical, which makes sense because this is demographic data

In [None]:
# Count the missing entries in each column, listing the most affected columns first.
df_demo_control.isna().sum().sort_values(ascending=False)

In [None]:
# Show every row that has a missing value in at least one column.
df_demo_control.loc[df_demo_control.isna().any(axis="columns")]

In [None]:
# Keep only the rows that are complete in every column; the source frame is left untouched.
df_cleaned = df_demo_control.dropna(axis="index", how="any")

## **Demographic Data - Client Tenure** ##

In [None]:
# How long have the control clients held their accounts?
# For each tenure column: summary statistics plus the most frequent value.

# Tenure expressed in months
disp_tenure_months = df_cleaned["clnt_tenure_mnth"].describe()
mode_tenure_months = df_cleaned["clnt_tenure_mnth"].mode()[0]

# Tenure expressed in years
disp_tenure_years = df_cleaned["clnt_tenure_yr"].describe()
mode_tenure_years = df_cleaned["clnt_tenure_yr"].mode()[0]

# Render all four results from this cell as a single tuple.
disp_tenure_months, disp_tenure_years, mode_tenure_years, mode_tenure_months

In [None]:
# Sanity check: select the rows whose tenure in years is greater than the client's age.
df_invalid_tenure = df_cleaned.loc[
    df_cleaned["clnt_tenure_yr"].gt(df_cleaned["clnt_age"])
]

df_invalid_tenure

**Analysis**: In 145 cases the client's tenure (in years) is longer than their age. This is potentially a data error, unless the client inherited the account from a family member or spouse.

In [None]:
fig, axes = plt.subplots(nrows=2, ncols=1, dpi=100)

# One stacked panel per tenure column: (column, bar colour, panel title).
tenure_panels = (
    ("clnt_tenure_mnth", "green", "Histogram for Tenure Months, Control"),
    ("clnt_tenure_yr", "blue", "Histogram for Tenure Years, Control"),
)
for panel, (column, colour, title) in zip(axes, tenure_panels):
    # 30-bin histogram with a KDE curve overlaid.
    sns.histplot(df_cleaned[column], kde=True, bins=30, color=colour, ax=panel)
    panel.set_title(title)

plt.tight_layout()
plt.show()

**Analysis:** 75% of Vanguard's account holders in this control group are long-term clients (6+ years of tenure), and 25% have held their accounts for 16+ years. The dataset contains few new clients; it is unclear whether that was intentional for the control group.

The distribution is multimodal, suggesting that the control group might have been selected from different groups of customers, e.g. newer customers with a tenure of 5–10 years, mid-tenure customers with 15–20 years, and long-term customers with 25+ years.

## **Demographic Data - Client Age** ##

In [None]:
# How old are the control clients?
# Summary statistics of the age column plus its most frequent value.
disp_age = df_cleaned["clnt_age"].describe()
mode_age = df_cleaned["clnt_age"].mode()[0]

# Render both results from this cell as a single tuple.
disp_age, mode_age

In [None]:
# Age distribution of the control clients: 30-bin histogram with a KDE curve overlaid.
# A single panel is used because there is only one age column to show; a second
# panel plotting the same column would repeat the identical histogram.
fig, ax = plt.subplots(dpi=100)
sns.histplot(df_cleaned["clnt_age"], kde=True, bins=30, color="green", ax=ax)
ax.set_title("Histogram for Client Age, Control")
plt.tight_layout()
plt.show()

**Analysis:** The median age of the control group is 58 years. The distribution is multimodal, which may indicate that the control group was selected from two age groups: 18–40 and 40+.

## **Demographic Data - Gender** ##

In [None]:
fig, ax = plt.subplots(nrows=2, ncols=2, dpi=100, figsize=(10, 8))

# (column, panel title) pairs, listed in the row-major order in which the
# 2x2 grid of axes is traversed below.
count_panels = [
    ("gendr", "Gender Distribution"),
    ("num_accts", "Number of Accounts"),
    ("calls_6_mnth", "Calls in Last 6 Months"),
    ("logons_6_mnth", "Logins in Last 6 Months"),
]

for panel, (column, heading) in zip(ax.flat, count_panels):
    # hue is set to the same column as x, and the resulting legend is switched off.
    sns.countplot(
        data=df_cleaned,
        x=column,
        hue=column,
        palette="Set3",
        legend=False,
        ax=panel,
    )
    panel.set_title(heading)

plt.tight_layout()
plt.show()



**Analysis:**
- Gender is evenly distributed across M, F, and Unknown
- The majority of clients have 2 accounts
- The largest share of clients made 6 calls in the last 6 months
- The largest share of clients had 9 log-ons in the last 6 months; the minimum is 3 log-ons