In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

%matplotlib inline

In [13]:
# Loaded the dataset, and concatenated the 2-part digital_footprint file.
df_client_profiles = pd.read_csv("/Users/milenko/My Drive (1307mile@gmail.com)/bootcamp/w5/w5w6_project2/data/df_final_demo.txt")

df_digital_footprint_1 = pd.read_csv("/Users/milenko/My Drive (1307mile@gmail.com)/bootcamp/w5/w5w6_project2/data/df_final_web_data_pt_1.txt")
df_digital_footprint_2 = pd.read_csv("/Users/milenko/My Drive (1307mile@gmail.com)/bootcamp/w5/w5w6_project2/data/df_final_web_data_pt_2.txt")
df_digital_footprint = pd.concat([df_digital_footprint_1, df_digital_footprint_2], axis=0, ignore_index=True)

df_experiment_roster = pd.read_csv("/Users/milenko/My Drive (1307mile@gmail.com)/bootcamp/w5/w5w6_project2/data/df_final_experiment_clients.txt")

# Understanding the data

# Cleaning Data

In [14]:
df_client_profiles

Unnamed: 0,client_id,clnt_tenure_yr,clnt_tenure_mnth,clnt_age,gendr,num_accts,bal,calls_6_mnth,logons_6_mnth
0,836976,6.0,73.0,60.5,U,2.0,45105.30,6.0,9.0
1,2304905,7.0,94.0,58.0,U,2.0,110860.30,6.0,9.0
2,1439522,5.0,64.0,32.0,U,2.0,52467.79,6.0,9.0
3,1562045,16.0,198.0,49.0,M,2.0,67454.65,3.0,6.0
4,5126305,12.0,145.0,33.0,F,2.0,103671.75,0.0,3.0
...,...,...,...,...,...,...,...,...,...
70604,7993686,4.0,56.0,38.5,U,3.0,1411062.68,5.0,5.0
70605,8981690,12.0,148.0,31.0,M,2.0,101867.07,6.0,6.0
70606,333913,16.0,198.0,61.5,F,2.0,40745.00,3.0,3.0
70607,1573142,21.0,255.0,68.0,M,3.0,475114.69,4.0,4.0


In [4]:
df_client_profiles.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70609 entries, 0 to 70608
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   client_id         70609 non-null  int64  
 1   clnt_tenure_yr    70595 non-null  float64
 2   clnt_tenure_mnth  70595 non-null  float64
 3   clnt_age          70594 non-null  float64
 4   gendr             70595 non-null  object 
 5   num_accts         70595 non-null  float64
 6   bal               70595 non-null  float64
 7   calls_6_mnth      70595 non-null  float64
 8   logons_6_mnth     70595 non-null  float64
dtypes: float64(7), int64(1), object(1)
memory usage: 4.8+ MB


Considering the nature of data in the columns ['clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age', 'num_accts', 'calls_6_mnth', 'logons_6_mnth'], 
they should be converted to int64 Dtype.\
To avoid conversion issues, the dataframe should be first cleaned of null values.

In [5]:
df_client_profiles = df_client_profiles.dropna(thresh=9)

print("\nThe shape of the dataframe after dropping null values:", df_client_profiles.shape, "\n")

df_client_profiles.isnull().sum()


The shape of the dataframe after dropping null values: (70594, 9) 



client_id           0
clnt_tenure_yr      0
clnt_tenure_mnth    0
clnt_age            0
gendr               0
num_accts           0
bal                 0
calls_6_mnth        0
logons_6_mnth       0
dtype: int64

In [6]:
# Cleaned from null values. Columns to convert to integer type
int_columns = ['clnt_tenure_yr', 'clnt_tenure_mnth', 'clnt_age', 'num_accts', 'calls_6_mnth', 'logons_6_mnth']

for col in int_columns:
    df_client_profiles.loc[:, col] = df_client_profiles[col].astype(int)

In [7]:
# Checked for duplicated rows. There are none.
df_client_profiles.duplicated().sum()

0

In [8]:
# Renamed columns for clarity.
df_client_profiles = df_client_profiles.rename(columns= {
    'clnt_tenure_yr': 'client_tenure_in_years', 
    'clnt_tenure_mnth': 'client_tenure_in_months', 
    'clnt_age': 'client_age', 
    'gendr': 'gender', 
    'num_accts': 'number_of_accounts', 
    'bal': 'balance',
    'calls_6_mnth': 'calls_6_months',
    'logons_6_mnth': 'logons_6_months'})

In [9]:
# Transformed 2 columns from giving 6-monthly to annual values.
df_client_profiles[['calls_6_months', 'logons_6_months']] = df_client_profiles[['calls_6_months', 'logons_6_months']].apply(lambda x: x * 2)

df_client_profiles.rename(columns= {'calls_6_months': 'calls_per_year', 'logons_6_months': 'logons_per_year'}, inplace=True)

# Client behaviour analysis
- Who are the primary clients using this online process?
- Are the primary clients younger or older, new or long-standing?

In [10]:
df_client_profiles.describe()

Unnamed: 0,client_id,client_tenure_in_years,client_tenure_in_months,client_age,number_of_accounts,balance,calls_per_year,logons_per_year
count,70594.0,70594.0,70594.0,70594.0,70594.0,70594.0,70594.0,70594.0
mean,5005026.0,12.053007,150.659999,46.180426,2.255532,147446.7,6.764938,11.133439
std,2877289.0,6.87185,82.090264,15.60039,0.535,301510.6,4.47319,4.706592
min,169.0,2.0,33.0,13.0,1.0,13789.42,0.0,2.0
25%,2519543.0,6.0,82.0,32.0,2.0,37346.6,2.0,8.0
50%,5016974.0,11.0,136.0,47.0,2.0,63334.59,6.0,10.0
75%,7483074.0,16.0,192.0,59.0,2.0,137546.1,12.0,14.0
max,9999839.0,62.0,749.0,96.0,8.0,16320040.0,14.0,18.0


In [11]:
df_client_profiles.describe(include='object')

Unnamed: 0,gender
count,70594
unique,4
top,U
freq,24122


Our primary clients on average is:
- middle-aged: 46-years-old,
- loyal: with us for 12 years,
- which means they join us in their 30s,
- unknown or undisclosed gender,
- tech-oriented: 11 logons per year vs 7 calls,
- holds 2 accounts,
- with 147,446.7 on their balance.

# Who are the primary clients using this online process?

In [12]:
groupby['logins_per_year']

NameError: name 'groupby' is not defined

# Visualisations