In [80]:
import sys
sys.path.append('../_util')
sys.path.append('../data')
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
import warnings
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import OneHotEncoder
from imblearn.over_sampling import SMOTE
warnings.filterwarnings('ignore')

In [46]:
data = pd.read_csv('../data/DATA_Customer-Churn.csv')
data[data == ' '] = float("NaN")
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='ignore')
data['MonthlyCharges'] = pd.to_numeric(data['MonthlyCharges'], errors='ignore')
data.fillna((data.mean()), inplace=True )
data['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

In [47]:
category_0_base = data[data['Churn'] == 0]
category_1_base = data[data['Churn'] == 1]

category_0 = category_0_base.select_dtypes(include=np.number)
category_1 = category_1_base.select_dtypes(include=np.number)

In [48]:
y = data['Churn']
X = data.drop(['Churn'], axis = 1)

In [49]:
numericalX = X.select_dtypes(np.number)
categoricalX = X.select_dtypes(np.object)

In [50]:
encoder = OneHotEncoder(drop='first').fit(categoricalX)
encoded_categorical = encoder.transform(categoricalX).toarray()
encoded_categorical = pd.DataFrame(encoded_categorical)
X = pd.concat([numericalX, encoded_categorical], axis = 1)

In [51]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [54]:
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

y_test

2200    No
4627    No
3225    No
2828    No
3768    No
        ..
3184    No
2551    No
5735    No
5096    No
450     No
Name: Churn, Length: 1761, dtype: object

In [55]:
clf = RandomForestClassifier(max_depth=6,min_samples_leaf=20,max_features=None,n_estimators=100,
                             bootstrap=True,oob_score=True, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.812570995834911
0.7893242475865985


In [66]:
clf = RandomForestClassifier(max_depth=4,min_samples_leaf=20,max_features=None,n_estimators=100,
                             bootstrap=True,oob_score=True, random_state=0)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=5)
cross_val_scores

array([0.79754021, 0.79470199, 0.7907197 , 0.78693182, 0.79924242])

In [63]:
np.std(cross_val_scores)

0.008611228221346256

# SMOTE

In [69]:
data = pd.concat([numericalX, encoded_categorical], axis = 1)
data

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0,1,29.85,29.85,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0,34,56.95,1889.50,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0,2,53.85,108.15,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,45,42.30,1840.75,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,2,70.70,151.65,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0
7039,0,72,103.20,7362.90,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
7040,0,11,29.60,346.45,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7041,1,4,74.40,306.60,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [71]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [72]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.25, random_state=0)
X_train = pd.DataFrame(X_train)
X_test = pd.DataFrame(X_test)

In [73]:
clf = RandomForestClassifier(max_depth=3, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.7978353304986471
0.8024739080015462


In [79]:
clf.predict_proba(X_test)

array([[0.25497639, 0.74502361],
       [0.75592186, 0.24407814],
       [0.54465378, 0.45534622],
       ...,
       [0.26523264, 0.73476736],
       [0.32690023, 0.67309977],
       [0.54187566, 0.45812434]])

In [75]:
clf.predict(X_test)

array(['Yes', 'No', 'No', ..., 'Yes', 'Yes', 'No'], dtype=object)