# Project: Predicting Churning Customers

Goal: Predict churning customers.

In [43]:
import os

import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)

## Acquire

In [22]:
# Read the raw Kaggle export, then discard its last two columns before previewing.
df = pd.read_csv('customer_data.csv')
df = df.iloc[:, :-2]
df.head()

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
0,768805383,Existing Customer,45,M,3,High School,Married,$60K - $80K,Blue,39,5,1,3,12691.0,777,11914.0,1.335,1144,42,1.625,0.061
1,818770008,Existing Customer,49,F,5,Graduate,Single,Less than $40K,Blue,44,6,1,2,8256.0,864,7392.0,1.541,1291,33,3.714,0.105
2,713982108,Existing Customer,51,M,3,Graduate,Married,$80K - $120K,Blue,36,4,1,0,3418.0,0,3418.0,2.594,1887,20,2.333,0.0
3,769911858,Existing Customer,40,F,4,High School,Unknown,Less than $40K,Blue,34,3,4,1,3313.0,2517,796.0,1.405,1171,20,2.333,0.76
4,709106358,Existing Customer,40,M,3,Uneducated,Married,$60K - $80K,Blue,21,5,1,0,4716.0,0,4716.0,2.175,816,28,2.5,0.0


Acquire data function:

In [None]:
def acquire_customer_data(path='customer_data.csv'):
    '''
    Load the customer data from a local CSV file.

    The data is not downloaded automatically; it must already be on disk.

    Parameters
    ----------
    path : str, default 'customer_data.csv'
        Location of the CSV file downloaded from Kaggle.

    Returns
    -------
    pandas.DataFrame
        Contents of the CSV with its last two columns dropped.

    Raises
    ------
    FileNotFoundError
        If `path` does not exist. The message says where to download
        the data and what to name the file.
    '''
    if not os.path.isfile(path):
        # Raise instead of printing and falling through, so the caller
        # never reaches a return statement with nothing to return.
        raise FileNotFoundError(
            f'{path} not found. Download the data via Kaggle:'
            '\nhttps://www.kaggle.com/datasets/sakshigoyal7/credit-card-customers'
            f'\nRename to {path} and run program again'
        )
    # Same slice the notebook applies when it loads `df` directly.
    # NOTE(review): presumably the two trailing columns are not needed for
    # the analysis -- confirm against the raw file.
    return pd.read_csv(path).iloc[:, :-2]

## Wrangle

Rename columns

In [26]:
# Normalise every column label to lower case so later cells can use one spelling.
df.columns = df.columns.map(str.lower)
df.columns

Index(['clientnum', 'attrition_flag', 'customer_age', 'gender',
       'dependent_count', 'education_level', 'marital_status',
       'income_category', 'card_category', 'months_on_book',
       'total_relationship_count', 'months_inactive_12_mon',
       'contacts_count_12_mon', 'credit_limit', 'total_revolving_bal',
       'avg_open_to_buy', 'total_amt_chng_q4_q1', 'total_trans_amt',
       'total_trans_ct', 'total_ct_chng_q4_q1', 'avg_utilization_ratio'],
      dtype='object')

In [32]:
# Old label -> new label. Columns that are not listed keep their current name.
columns = dict(
    clientnum='id',
    attrition_flag='account_closed',
    customer_age='age',
    months_on_book='months_with_bank',
    total_relationship_count='products_held_count',
    months_inactive_12_mon='months_inactive_last12',
    contacts_count_12_mon='contacts_count_last12',
    avg_open_to_buy='open_to_buy_credit_line_avg_last12',
    total_trans_amt='total_trans_amt_last12',
    total_trans_ct='total_trans_ct_last12',
)
df = df.rename(columns=columns)

Check missing values

In [34]:
# Summary statistics for the numeric columns; the `count` row is the number
# of non-null values in each column.
df.describe()

Unnamed: 0,id,age,dependent_count,months_with_bank,products_held_count,months_inactive_last12,contacts_count_last12,credit_limit,total_revolving_bal,open_to_buy_credit_line_avg_last12,total_amt_chng_q4_q1,total_trans_amt_last12,total_trans_ct_last12,total_ct_chng_q4_q1,avg_utilization_ratio
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,739177600.0,46.32596,2.346203,35.928409,3.81258,2.341167,2.455317,8631.953698,1162.814061,7469.139637,0.759941,4404.086304,64.858695,0.712222,0.274894
std,36903780.0,8.016814,1.298908,7.986416,1.554408,1.010622,1.106225,9088.77665,814.987335,9090.685324,0.219207,3397.129254,23.47257,0.238086,0.275691
min,708082100.0,26.0,0.0,13.0,1.0,0.0,0.0,1438.3,0.0,3.0,0.0,510.0,10.0,0.0,0.0
25%,713036800.0,41.0,1.0,31.0,3.0,2.0,2.0,2555.0,359.0,1324.5,0.631,2155.5,45.0,0.582,0.023
50%,717926400.0,46.0,2.0,36.0,4.0,2.0,2.0,4549.0,1276.0,3474.0,0.736,3899.0,67.0,0.702,0.176
75%,773143500.0,52.0,3.0,40.0,5.0,3.0,3.0,11067.5,1784.0,9859.0,0.859,4741.0,81.0,0.818,0.503
max,828343100.0,73.0,5.0,56.0,6.0,6.0,6.0,34516.0,2517.0,34516.0,3.397,18484.0,139.0,3.714,0.999


In [35]:
# Dtype and non-null count for every column, including the non-numeric ones.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  10127 non-null  int64  
 1   account_closed                      10127 non-null  object 
 2   age                                 10127 non-null  int64  
 3   gender                              10127 non-null  object 
 4   dependent_count                     10127 non-null  int64  
 5   education_level                     10127 non-null  object 
 6   marital_status                      10127 non-null  object 
 7   income_category                     10127 non-null  object 
 8   card_category                       10127 non-null  object 
 9   months_with_bank                    10127 non-null  int64  
 10  products_held_count                 10127 non-null  int64  
 11  months_inactive_last12              10127

No null values, but placeholder strings such as "Unknown" or "Missing" may still stand in for missing data.

In [42]:
# List the distinct values of every column that has fewer than 30 of them,
# to spot placeholder categories by eye.
unique_counts = df.nunique()
for column in unique_counts[unique_counts < 30].index:
    print(f'{column}: ')
    print(df[column].unique())

account_closed: 
['Existing Customer' 'Attrited Customer']
gender: 
['M' 'F']
dependent_count: 
[3 5 4 2 0 1]
education_level: 
['High School' 'Graduate' 'Uneducated' 'Unknown' 'College' 'Post-Graduate'
 'Doctorate']
marital_status: 
['Married' 'Single' 'Unknown' 'Divorced']
income_category: 
['$60K - $80K' 'Less than $40K' '$80K - $120K' '$40K - $60K' '$120K +'
 'Unknown']
card_category: 
['Blue' 'Gold' 'Silver' 'Platinum']
products_held_count: 
[5 6 4 3 2 1]
months_inactive_last12: 
[1 4 2 3 6 0 5]
contacts_count_last12: 
[3 2 0 1 4 5 6]


Replace every `"Unknown"` value with a true missing value (`NaN`) so that pandas counts it as null.

In [44]:
# Turn the 'Unknown' placeholder into NaN, then re-check the non-null counts.
placeholder_to_missing = {'Unknown': np.nan}
df = df.replace(placeholder_to_missing)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10127 entries, 0 to 10126
Data columns (total 21 columns):
 #   Column                              Non-Null Count  Dtype  
---  ------                              --------------  -----  
 0   id                                  10127 non-null  int64  
 1   account_closed                      10127 non-null  object 
 2   age                                 10127 non-null  int64  
 3   gender                              10127 non-null  object 
 4   dependent_count                     10127 non-null  int64  
 5   education_level                     8608 non-null   object 
 6   marital_status                      9378 non-null   object 
 7   income_category                     9015 non-null   object 
 8   card_category                       10127 non-null  object 
 9   months_with_bank                    10127 non-null  int64  
 10  products_held_count                 10127 non-null  int64  
 11  months_inactive_last12              10127

Wrangle function

## EDA

In [45]:
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_format = 'retina'

## Statistical Testing

Potential questions to answer:
- One mean:
- Difference in means for paired data:
- Difference in means for two independent groups:
- One proportion:
- Difference in proportions for paired data:
- Difference in proportions for two independent groups:

## Modeling

## README


Business Goals:
- Find drivers for customer churn among the bank's credit card customers. Why are customers churning?
- Construct a ML classification model that accurately predicts customer churn
- Present your process and findings to the lead data scientist

Deliverables:
- `README.md`
- `final_report.ipynb`
- `acquire.py` and `explore.py` 
- `predictions.csv`
-  test/work `.ipynb`(s)
- 