# BASIC EDA - {"CUSTOMER CHURN" DATASET} 

## 1. Import Modules and Configuration Settings

In [1]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# pandas/numpy; consider restricting the filter to specific categories.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [2]:
# Pandas display settings, applied in a single call as (option, value) pairs:
#   - display.min_rows  : rows shown when a frame's repr is truncated
#   - display.max_rows  : row count above which the repr is truncated
#   - display.precision : decimal places shown for floats
pd.set_option(
    'display.min_rows', 5,
    'display.max_rows', 25,
    'display.precision', 4,
)

## 2. Import Dataset

In [3]:
# Load the CSV into `data`, then take a deep copy as `df` so that later
# operations on `df` do not modify the frame as it was read from disk.
data = pd.read_csv(filepath_or_buffer='cc_train.csv')
df = data.copy(deep=True)

## 3. Basic EDA

### 3.1 Sample of the Dataset

In [4]:
# Show five randomly chosen rows; no random_state is set, so the rows
# displayed differ from run to run.
df.sample(n=5)

Unnamed: 0,state,account_length,area_code,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls,churn
866,DE,131,area_code_408,no,no,0,94.4,80,16.05,215.1,101,18.28,179.7,108,8.09,13.1,9,3.54,2,no
3203,NY,77,area_code_510,no,no,0,124.1,92,21.1,214.9,131,18.27,241.3,132,10.86,13.7,3,3.7,0,no
1742,VT,101,area_code_415,no,no,0,136.2,92,23.15,220.9,110,18.78,196.9,116,8.86,13.3,7,3.59,3,no
3330,VA,68,area_code_415,no,yes,41,226.0,113,38.42,149.8,115,12.73,184.9,88,8.32,11.5,2,3.11,2,no
1651,WI,73,area_code_415,no,no,0,157.1,109,26.71,268.8,83,22.85,181.5,91,8.17,10.0,8,2.7,0,no


### 3.2 Shape of the Dataset

In [5]:
# df.shape is (rows, columns); report each dimension on its own line.
print(
    f'No. of Observations (rows) : {df.shape[0]}',
    f'No. of Features (columns)  : {df.shape[1]}',
    sep='\n',
)

No. of Observations (rows) : 4250
No. of Features (columns)  : 20


### 3.3 Dataset Information

In [6]:
# Print the frame summary: index range, per-column non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4250 entries, 0 to 4249
Data columns (total 20 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   state                          4250 non-null   object 
 1   account_length                 4250 non-null   int64  
 2   area_code                      4250 non-null   object 
 3   international_plan             4250 non-null   object 
 4   voice_mail_plan                4250 non-null   object 
 5   number_vmail_messages          4250 non-null   int64  
 6   total_day_minutes              4250 non-null   float64
 7   total_day_calls                4250 non-null   int64  
 8   total_day_charge               4250 non-null   float64
 9   total_eve_minutes              4250 non-null   float64
 10  total_eve_calls                4250 non-null   int64  
 11  total_eve_charge               4250 non-null   float64
 12  total_night_minutes            4250 non-null   f

### 3.4 Checking for Missing Values in the Dataset

In [7]:
# Per-feature missing-value report.
# The index of the counts Series is named before reset_index(), so the
# resulting columns are labelled 'Features' / 'Missing Count' directly instead
# of renaming the auto-generated 'index' and 0 labels afterwards (that rename
# silently does nothing when the columns axis carries its own name).
miss_cnt = (
    df.isna()
    .sum()
    .rename_axis('Features')
    .reset_index(name='Missing Count')
)
# Share of rows with a missing value, in percent; computed column-wise rather
# than with a per-element lambda.
miss_cnt['Missing PCT'] = miss_cnt['Missing Count'] / df.shape[0] * 100
miss_cnt

Unnamed: 0,Features,Missing Count,Missing PCT
0,state,0,0.0
1,account_length,0,0.0
2,area_code,0,0.0
3,international_plan,0,0.0
4,voice_mail_plan,0,0.0
5,number_vmail_messages,0,0.0
6,total_day_minutes,0,0.0
7,total_day_calls,0,0.0
8,total_day_charge,0,0.0
9,total_eve_minutes,0,0.0


### 3.5 Checking Unique Values for Individual Features

In [8]:
# Per-feature cardinality report.
# The index of the counts Series is named before reset_index(), so the
# resulting columns are labelled 'Features' / 'Unique Values' directly instead
# of renaming the auto-generated 'index' and 0 labels afterwards (that rename
# silently does nothing when the columns axis carries its own name).
unq_cnt = (
    df.nunique()
    .rename_axis('Features')
    .reset_index(name='Unique Values')
)
# Distinct values as a percentage of the row count; computed column-wise
# rather than with a per-element lambda.
unq_cnt['Unique Values PCT'] = unq_cnt['Unique Values'] / df.shape[0] * 100
unq_cnt

Unnamed: 0,Features,Unique Values,Unique Values PCT
0,state,51,1.2
1,account_length,215,5.0588
2,area_code,3,0.0706
3,international_plan,2,0.0471
4,voice_mail_plan,2,0.0471
5,number_vmail_messages,46,1.0824
6,total_day_minutes,1843,43.3647
7,total_day_calls,120,2.8235
8,total_day_charge,1843,43.3647
9,total_eve_minutes,1773,41.7176


### 3.6 Checking for Duplicate Records

In [9]:
# duplicated() flags each row that repeats an earlier row across all columns;
# summing the boolean flags gives the number of duplicate rows.
print(f'Total No. of Duplicated Observations in the Dataset : {df.duplicated().sum()}')

Total No. of Duplicated Observations in the Dataset : 0


### 3.7 Generating Statistical Summary on Numerical Columns

In [10]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns; object-typed columns are left out by default.
df.describe()

Unnamed: 0,account_length,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
count,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0,4250.0
mean,100.2362,7.6318,180.2596,99.9073,30.6447,200.1739,100.1765,17.015,200.5279,99.8395,9.0239,10.2561,4.4264,2.7697,1.5591
std,39.6984,13.4399,54.0124,19.8508,9.1821,50.2495,19.9086,4.2712,50.3535,20.0932,2.2659,2.7601,2.4631,0.7452,1.3114
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,73.0,0.0,143.325,87.0,24.365,165.925,87.0,14.1025,167.225,86.0,7.5225,8.5,3.0,2.3,1.0
50%,100.0,0.0,180.45,100.0,30.68,200.7,100.0,17.06,200.45,100.0,9.02,10.3,4.0,2.78,1.0
75%,127.0,16.0,216.2,113.0,36.75,233.775,114.0,19.8675,234.7,113.0,10.56,12.0,6.0,3.24,2.0
max,243.0,52.0,351.5,165.0,59.76,359.3,170.0,30.54,395.0,175.0,17.77,20.0,20.0,5.4,9.0


### 3.8 Features Specific to a Datatype

In [11]:
# Report how many distinct dtypes the columns use, followed by the dtypes.
print(
    f'Number of Unique Data Types : {df.dtypes.nunique()}',
    f'Unique Data Types : {df.dtypes.unique().tolist()}',
    sep='\n',
)

Number of Unique Data Types : 3
Unique Data Types : [dtype('O'), dtype('int64'), dtype('float64')]


In [12]:
# Collect the column names belonging to each of the three dtypes in the frame.
fea_flo = df.select_dtypes(include='float64').columns.tolist()
fea_int = df.select_dtypes(include='int64').columns.tolist()
fea_obj = df.select_dtypes(include='object').columns.tolist()

# For each dtype group, print the number of features and then their names.
print(f'Float Type Features    : {len(fea_flo)} \n{fea_flo} \n\n')
print(f'Integer Type Features  : {len(fea_int)} \n{fea_int} \n\n')
print(f'Object Type Features   : {len(fea_obj)} \n{fea_obj}')

Float Type Features    : 8 
['total_day_minutes', 'total_day_charge', 'total_eve_minutes', 'total_eve_charge', 'total_night_minutes', 'total_night_charge', 'total_intl_minutes', 'total_intl_charge'] 


Integer Type Features  : 7 
['account_length', 'number_vmail_messages', 'total_day_calls', 'total_eve_calls', 'total_night_calls', 'total_intl_calls', 'number_customer_service_calls'] 


Object Type Features   : 5 
['state', 'area_code', 'international_plan', 'voice_mail_plan', 'churn']
