## Introduction

### Libraries

In [1]:
import pandas as pd
import numpy as np

### Modeling W/ sklearn

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.utils import shuffle

## Data Preperation

In [75]:
data = pd.read_csv('/datasets/Churn.csv')

In [76]:
data.head(5)

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,1,15634602,Hargrave,619,France,Female,42,2.0,0.0,1,1,1,101348.88,1
1,2,15647311,Hill,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,3,15619304,Onio,502,France,Female,42,8.0,159660.8,3,1,0,113931.57,1
3,4,15701354,Boni,699,France,Female,39,1.0,0.0,2,0,0,93826.63,0
4,5,15737888,Mitchell,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.1,0


In [77]:
data.columns = data.columns.str.lower()

In [78]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   rownumber        10000 non-null  int64  
 1   customerid       10000 non-null  int64  
 2   surname          10000 non-null  object 
 3   creditscore      10000 non-null  int64  
 4   geography        10000 non-null  object 
 5   gender           10000 non-null  object 
 6   age              10000 non-null  int64  
 7   tenure           9091 non-null   float64
 8   balance          10000 non-null  float64
 9   numofproducts    10000 non-null  int64  
 10  hascrcard        10000 non-null  int64  
 11  isactivemember   10000 non-null  int64  
 12  estimatedsalary  10000 non-null  float64
 13  exited           10000 non-null  int64  
dtypes: float64(3), int64(8), object(3)
memory usage: 1.1+ MB


### missing values 

In [79]:
ten_total = data.shape[0]

ten_missing_total =  data.tenure.isna().sum()

ten_mssing = ten_total /  ten_missing_total

print('Total Values:', ten_total)
print('Total Missing Values:', ten_missing_total)
print('Share of missing values:', ten_mssing,'%')

Total Values: 10000
Total Missing Values: 909
Share of missing values: 11.001100110011 %


In [80]:
clean_data = data.dropna()


clean_data.isna().sum()

rownumber          0
customerid         0
surname            0
creditscore        0
geography          0
gender             0
age                0
tenure             0
balance            0
numofproducts      0
hascrcard          0
isactivemember     0
estimatedsalary    0
exited             0
dtype: int64

In [81]:
clean_data.shape

(9091, 14)

Tenure Missing Data

- there are 909 missing values out of 10,000 (~11%) 

- I dropped rows containing missing values from the data frame. 

### Fixing Categorical Values

In [82]:
data_ohe = clean_data[['creditscore', 'geography', 'gender', 'age', 'tenure', 'balance', 'numofproducts', 'hascrcard', 'isactivemember', 'estimatedsalary', 'exited']]
data_ohe

Unnamed: 0,creditscore,geography,gender,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited
0,619,France,Female,42,2.0,0.00,1,1,1,101348.88,1
1,608,Spain,Female,41,1.0,83807.86,1,0,1,112542.58,0
2,502,France,Female,42,8.0,159660.80,3,1,0,113931.57,1
3,699,France,Female,39,1.0,0.00,2,0,0,93826.63,0
4,850,Spain,Female,43,2.0,125510.82,1,1,1,79084.10,0
...,...,...,...,...,...,...,...,...,...,...,...
9994,800,France,Female,29,2.0,0.00,2,0,0,167773.55,0
9995,771,France,Male,39,5.0,0.00,2,1,0,96270.64,0
9996,516,France,Male,35,10.0,57369.61,1,1,1,101699.77,0
9997,709,France,Female,36,7.0,0.00,1,0,1,42085.58,1


In [83]:
pd.get_dummies(clean_data['gender']).head()

Unnamed: 0,Female,Male
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [84]:
data_ohe = pd.get_dummies(data_ohe, drop_first=True)
    
data_ohe.head()    

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,exited,geography_Germany,geography_Spain,gender_Male
0,619,42,2.0,0.0,1,1,1,101348.88,1,0,0,0
1,608,41,1.0,83807.86,1,0,1,112542.58,0,0,1,0
2,502,42,8.0,159660.8,3,1,0,113931.57,1,0,0,0
3,699,39,1.0,0.0,2,0,0,93826.63,0,0,0,0
4,850,43,2.0,125510.82,1,1,1,79084.1,0,0,1,0


### Splitting The Data

# Seperating Test Data

In [85]:
train, test = train_test_split(data_ohe, test_size=0.25, random_state=22244, stratify=data_ohe['exited'])

In [86]:
target_test = test['exited']
features_test = test.drop('exited', axis=1)

# Splitting data for validation

In [87]:
train, valid = train_test_split(train, test_size=0.25, random_state=22244, stratify=train['exited'])

In [88]:
target_train = train['exited']
features_train = train.drop('exited', axis=1)

In [89]:
target_valid = valid['exited']
features_valid = valid.drop('exited', axis=1)

-- **Options that I didnt figure out** --

features_train, features_valid, target_train, target_valid  = train_test_split(features_train, train_target, test_size=0.2, random_state=25701, stratify=target)

## Class Balance

In [105]:
model = RandomForestClassifier(random_state=22244)
model.fit(features_train, target_train)
rf_predicted_valid = model.predict(features_valid)



In [106]:
probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

In [107]:

rf_accuracy = model.score(features_valid, target_valid)

rf_recall = recall_score(target_valid, rf_predicted_valid)

rf_precision = precision_score(target_valid, rf_predicted_valid)

rf_f1 = f1_score(target_valid, rf_predicted_valid)

rf_roc_auc = roc_auc_score(target_valid, probabilities_one_valid)

In [108]:
print('Accuracy :', rf_accuracy)

Accuracy : 0.8621700879765396


In [109]:
print('Recall: ', rf_recall)

Recall:  0.45689655172413796


In [110]:

print('Precision: ',rf_precision)

Precision:  0.775609756097561


In [111]:
print('F1 : ',rf_f1)

F1 :  0.5750452079566004


In [112]:
print('AUC-ROC :', rf_roc_auc)

AUC-ROC : 0.8406210030577932


In [113]:
confusion_matrix(target_valid, rf_predicted_valid)

array([[1311,   46],
       [ 189,  159]])

In [114]:
ones = target_valid.sum()
zeros = target_valid.count() - ones
print(ones,zeros)
print(ones/target_valid.count())

348 1357
0.20410557184750733


### Conclusion
**Confusion Matrix:**

- [1311,   46],
- [ 189,  159]


* True Negative = 1311 - Correct, 0
* False Positive = 46  - Incorrect, 1
* True Positive =  193 - Correct, 1
* False negative = 155 - Incorrct, 0


**Class Weight:**
 - 0- 1315 (~80%)-{~0.7958}
 - 1-  366 (~20%)-{~0.2041}
    - total: 1725 

After looking at the despersion of the models predictions it is clear that the model is rewarded for guessing Negative as there is roughly a 5:1 imbalance toward negative. 

**Observations:**
             
- F1 Score: :0.5750  (~57%)
- ROC-AUC:  :0.8406  (~84%)
- Recall:   :0.4568  (~45%)
- Precision::0.7948  (~79%)
- Accuracy : 0.8621  (~86%)


- the ROC-AUC is very high ~ 84% meaning the model predictions mostly accurate when considering the overall correct prediction. 
- the f1 score(~57%) being low would indicate that most correct guesses are the negative and not the target positive value. 

- I beleive this Imbalance is causing the model favor predictions of negative outcome. (True Negative = 1317 of 1705) 
    - 86% accuracy overall,  which could contain a large skew from the negative imbalance gussing negative. 
    
    
- An F1 Score of 0.5750452079566004 or ~57% 
    - needs to be improved by changing the balanceof classes and adjusting hyperparameters.
    
**Conclusion** 
- the Class Weights imbalance seems be skewing the predictions toward negative. This issue must be fixed before adjusting the model and testing further. 

## Class Weight Adjustments 

Method 1: upsampling and downsampling

Mehtod 2: class_weight parameter

I will compare the 2 methods and select whichever option gives the best F1

### Method 1

- UpSampling the data that is more rare case.Making the "1" more closely representing a equally balance.

- DownSampling the "0" making it as rare as the opposite case.

I will test both resulting features and use the method that produces the best results for comparision.  

### Method 1-1 Upsampling


In [115]:
def upsample(features, target, repeat):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_upsampled = pd.concat([features_zeros] + [features_ones] * repeat)
    target_upsampled = pd.concat([target_zeros] + [target_ones] * repeat)

    features_upsampled, target_upsampled = shuffle(
        features_upsampled, target_upsampled, random_state=22244
    )

    return features_upsampled, target_upsampled




In [116]:
upsampled_features_train, upsampled_target_train = upsample(features_train, target_train, 5)

In [117]:
model = RandomForestClassifier(random_state=22244)
model.fit(upsampled_features_train, upsampled_target_train)

upsampled_predicted_valid = model.predict(features_valid)

upsampled_prob_valid = model.predict_proba(features_valid)
upsampled_prob_one_valid = upsampled_prob_valid[:, 1]

upsampled_f1 = f1_score(target_valid, upsampled_predicted_valid)
upsampled_roc_auc = roc_auc_score(target_valid, upsampled_prob_one_valid)

In [118]:
confusion_matrix(target_valid, upsampled_predicted_valid)

array([[1261,   96],
       [ 160,  188]])

In [119]:
upsampled_f1, rf_f1

(0.5949367088607594, 0.5750452079566004)

In [120]:
upsampled_roc_auc, rf_roc_auc

(0.8348145842333071, 0.8406210030577932)

#### Conclusion - Upsampling

F1_accuary - 0.5639810426540285 (~56%)
ROC-AUC - seems to have a slight inverse relationship.

### Method 1-2 : Downsampling

In [121]:
def downsample(features, target, fraction):
    features_zeros = features[target == 0]
    features_ones = features[target == 1]
    target_zeros = target[target == 0]
    target_ones = target[target == 1]

    features_downsampled = pd.concat(
        [features_zeros.sample(frac=fraction, random_state=22244)]
        + [features_ones]
    )
    target_downsampled = pd.concat(
        [target_zeros.sample(frac=fraction, random_state=22244)]
        + [target_ones]
    )

    features_downsampled, target_downsampled = shuffle(
        features_downsampled, target_downsampled, random_state=22244
    )

    return features_downsampled, target_downsampled


In [122]:
downsampled_features_train, downsampled_target_train = downsample(features_train, target_train, 0.20410557184750733)

In [123]:
model = RandomForestClassifier(random_state=22244)
model.fit(downsampled_features_train, downsampled_target_train)

downsampled_predicted_valid = model.predict(features_valid)

downsampled_prob_valid = model.predict_proba(features_valid)
downsampled_prob_one_valid = downsampled_prob_valid[:, 1]

downsampled_f1 = f1_score(target_valid, downsampled_predicted_valid)
downsampled_roc_auc = roc_auc_score(target_valid, downsampled_prob_one_valid)


In [124]:
confusion_matrix(target_valid, downsampled_predicted_valid)

array([[959, 398],
       [ 81, 267]])

In [125]:
downsampled_f1, rf_f1

(0.5271470878578479, 0.5750452079566004)

In [126]:
downsampled_roc_auc, rf_roc_auc

(0.8332295716548506, 0.8406210030577932)

#### Conscluision - Down Sampling 

the resulting effect was a reduction in the F1_score.   

### method 2 : Class_weight = 'balanced'

In [127]:
model = RandomForestClassifier(class_weight='balanced', random_state=22244)
model.fit(features_train, target_train)
balanced_predicted_valid = model.predict(features_valid)

probabilities_valid = model.predict_proba(features_valid)
probabilities_one_valid = probabilities_valid[:, 1]

balanced_f1_score = f1_score(target_valid, balanced_predicted_valid)

balanced_roc_auc = roc_auc_score(target_valid, probabilities_one_valid)

In [128]:
balanced_roc_auc, rf_roc_auc

(0.8418682607848618, 0.8406210030577932)

In [129]:
balanced_f1_score, rf_f1

(0.5447761194029851, 0.5750452079566004)

In [130]:
confusion_matrix(target_valid, balanced_predicted_valid)

array([[1315,   42],
       [ 202,  146]])

#### Conclusion - Classweight= "balanced" 

lower f1 score than upsampling. 


## Conclusion -  Fixing Class Imbalance
Unbalanced_f1  = 0.5507745266781411

- [1226,  224]
- [ 178,  191]

upsampled_f1   = 0.5949367088607594

- [1667,  142],
- [ 136,  328]

Downsampled_f1   = 0.5271470878578479

- [959, 398],
- [ 81, 267]

Balanced_f1    = 0.5447761194029851

- [1315,   42],
- [ 202,  146]

Conclusion

- Downsampling shows an improvement to the dispersion predictions of the confusion matrix. 
- Upsampling gives the best f1 score improvment.
- Classweight= "balanced", was not effective.

## Model Adjustments 

In [131]:
features_train.head()

Unnamed: 0,creditscore,age,tenure,balance,numofproducts,hascrcard,isactivemember,estimatedsalary,geography_Germany,geography_Spain,gender_Male
1436,576,28,1.0,119336.29,2,0,1,58976.85,1,0,1
1944,783,39,9.0,0.0,2,1,0,143752.77,0,1,1
3230,679,33,6.0,0.0,2,1,1,98015.85,0,0,0
5226,481,39,1.0,111233.09,1,1,1,123995.15,0,1,1
4928,513,39,7.0,89039.9,2,1,1,146738.83,0,1,1


## Logistic Regression Model - Adjustment

# Downsampled data

In [146]:
model =  LogisticRegression(random_state=22244, solver="liblinear")
model.fit(downsampled_features_train, downsampled_target_train)
predicted_valid = model.predict(features_valid)
f1_score(target_valid, predicted_valid)

0.4069952305246423

# Upsampled data

In [145]:
model =  LogisticRegression(random_state=22244, solver="liblinear")
model.fit(upsampled_features_train, upsampled_target_train)
predicted_valid = model.predict(features_valid)
f1_score(target_valid, predicted_valid)

0.41904761904761906

## Random Forest Model - Adjustments

# Upsampled data

In [163]:
best_score = 0
best_est = 0

for est in range(25, 80): 
    model = RandomForestClassifier(random_state=22244, n_estimators=est, max_depth=20) 
    model.fit(upsampled_features_train, upsampled_target_train) 
    predicted_valid = model.predict(features_valid)
    score = f1_score(target_valid, predicted_valid)
    if score > best_score:
        best_score = score
        up_best_est = est
        print("max_estimators =", up_best_est, ": ", end='')
        print(best_score)

max_estimators = 25 : 0.592823712948518
max_estimators = 60 : 0.5945945945945946
max_estimators = 61 : 0.595879556259905
max_estimators = 64 : 0.5961538461538461
max_estimators = 66 : 0.597444089456869
max_estimators = 71 : 0.5981012658227848
max_estimators = 72 : 0.6022187004754358
max_estimators = 75 : 0.6031746031746031
max_estimators = 76 : 0.6067415730337078


#  Downsampled data

In [162]:
best_score = 0
best_est = 0

for est in range(1, 50): 
    model = RandomForestClassifier(random_state=22244, n_estimators=est, max_depth=20 ) 
    model.fit(downsampled_features_train, downsampled_target_train) 
    predicted_valid = model.predict(features_valid)
    score = f1_score(target_valid, predicted_valid)
    if score > best_score:
        best_score = score
        down_best_est = est
        print("max_estimators =", down_best_est, ": ", end='')
        print(best_score)

max_estimators = 1 : 0.46886446886446886
max_estimators = 2 : 0.5264483627204031
max_estimators = 4 : 0.531322505800464
max_estimators = 6 : 0.534054054054054
max_estimators = 10 : 0.5343035343035343
max_estimators = 34 : 0.5353535353535354


### Conclusion - model adjustment

For Downsampled data
- The Random Forest At **n_estimators = 30** 
    - The **f1 score(0.5902439024390244)has reached the desired accuary. 

For Upsampled data
- The Random Forest At **n_estimators = 6**
    - The **f1 score(0.592503022974607)has reached the desired accuary. 
    
Conclusion:
- using Upsampled data the desired accuracy is achieved in a significantly shortened period of time. 
    - N_estimators= 11 
    - F1 Accuracy = 0.6327433628318584

## Final Model Testing

In [166]:
model = RandomForestClassifier(random_state=22244, n_estimators = 76, max_depth= 20)
model.fit(features_train, target_train)

predicted = model.predict(features_test)

probabilities_test = model.predict_proba(features_test)
probabilities_one_test = probabilities_test[:, 1]


final_f1_score = f1_score(target_test, predicted)

final_roc_auc = roc_auc_score(target_test, probabilities_one_test)

In [167]:
final_f1_score,final_roc_auc

(0.5997425997425998, 0.8514426192790835)

### Final Model Conclusion
F1 score = 0.5997425997425998
ROC_AUC  = 0.8514426192790835

model meet the F1 metric.  


## Overall Conclusion

**MODEL USED**

The Model that was used in this project was a "Random Forest Model"
- highest accuracy of the 3 models for classification. 

**CLASS IMBALANCE**

The ratio of 0:1 was 4:1 and caused the model to guess 0 very well which is not the target.
- upsampling was the method that acheived the highest accuracy after fixing the imbalance. 

**HYPERPARAMETERS**

adjusted hyperparameters to acheive a F1 > 0.59

- max_depth - 20
    - Manually set and found an acceptable setting
    
- n_estimater - 76
    - tested with a function found the best parameter
**F1 = 0.5997425997425998**

**Final Model**.
    - F1 score = 0.5997425997425998
    - ROC_AUC  = 0.8417253237787885