In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
!pip install imbalanced-learn



In [7]:
from imblearn.under_sampling import RandomUnderSampler

In [4]:
train_data = pd.read_csv("data/cleaned_train.csv")

In [5]:
target = train_data['targets'].values
features = train_data.iloc[:, :-1]

In [6]:
features.head()

Unnamed: 0,account_length,international_plan,voice_mail_plan,number_vmail_messages,total_day_minutes,total_day_calls,total_day_charge,total_eve_minutes,total_eve_calls,total_eve_charge,total_night_minutes,total_night_calls,total_night_charge,total_intl_minutes,total_intl_calls,total_intl_charge,number_customer_service_calls
0,107,0,1,26,161.6,123,27.47,195.5,103,16.62,254.4,103,11.45,13.7,3,3.7,1
1,137,0,0,0,243.4,114,41.38,121.2,110,10.3,162.6,104,7.32,12.2,5,3.29,0
2,84,1,0,0,299.4,71,50.9,61.9,88,5.26,196.9,89,8.86,6.6,7,1.78,2
3,75,1,0,0,166.7,113,28.34,148.3,122,12.61,186.9,121,8.41,10.1,3,2.73,3
4,121,0,1,24,218.2,88,37.09,348.5,108,29.62,212.6,118,9.57,7.5,7,2.03,3


In [8]:
under_sampler = RandomUnderSampler(sampling_strategy='majority')

In [9]:
sampled_features, sampled_targets = under_sampler.fit_sample(features, target)

In [10]:
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(sampled_features)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(scaled_features, sampled_targets, stratify = sampled_targets, test_size = 0.20)

In [12]:
from sklearn.svm import SVC

In [13]:
clf = SVC(C=50.0, )

In [14]:
clf.fit(X_train, y_train)

SVC(C=50.0)

In [15]:
# Score the SVC on the held-out split.
clf.score(X=X_test, y=y_test)

0.8833333333333333

In [16]:
from sklearn.ensemble import RandomForestClassifier

In [17]:
clf2 = RandomForestClassifier(n_estimators=100, max_depth=500)

In [18]:
clf2.fit(X_train, y_train)

RandomForestClassifier(max_depth=500)

In [19]:
# Score the random forest on the held-out split.
clf2.score(X=X_test, y=y_test)

0.9375

In [20]:
from sklearn.metrics import classification_report

In [21]:
# Per-class precision / recall / F1 of the random forest on the held-out split.
# The report is a pre-formatted string, so it is printed rather than displayed.
y_pred = clf2.predict(X_test)
rf_report = classification_report(y_test, y_pred)
print(rf_report)

              precision    recall  f1-score   support

           0       0.92      0.96      0.94       120
           1       0.96      0.92      0.94       120

    accuracy                           0.94       240
   macro avg       0.94      0.94      0.94       240
weighted avg       0.94      0.94      0.94       240



In [53]:
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
param_grid = {
    'max_depth': [80, 90, 100, 110, 300],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}
# Create a based model
rf = RandomForestClassifier()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [54]:
grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   16.0s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:  4.7min finished


GridSearchCV(cv=3, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'max_depth': [80, 90, 100, 110, 300],
                         'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             verbose=2)

In [56]:
best_clf = grid_search.best_estimator_

In [57]:
best_clf.score(X_test, y_test)

0.9529411764705882

In [22]:
import pickle

In [23]:
# Destination of the serialized random-forest classifier.
filename = 'models/churn_prediction.pickle'

# NOTE(review): this persists clf2, not the grid-search result best_clf —
# confirm that is the model meant to be shipped.
with open(filename, mode="wb") as model_file:
    pickle.dump(clf2, model_file)

In [24]:
# Destination of the serialized MinMaxScaler; whatever loads the model needs it
# to apply the same feature scaling before calling predict.
filename_scaler = 'models/scaler.pickle'

with open(filename_scaler, mode="wb") as scaler_file:
    pickle.dump(scaler, scaler_file)