## Problem with splitting 

In every of the previous notebook we did the follow steps. We splitted our sample into train and test with stratification across target feature and then used oversmpling algorithms like SMOTE or Random duplicating.

The problem that could occur is that every time we used the same random state during splitting so after it we had the same train and test in every notebook.

What if train sample that has been created has not enough information about the statistical population of our sample. And if we use some other splitting we will have other result on our test set.

Here we will try to use some other random states for splitting and to get some better metrics' values

In [46]:
from imblearn.over_sampling import SMOTE
from tqdm import tqdm
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report
import copy

In [47]:
data=pd.read_csv("data.csv")
enc_data=pd.read_csv("encoded_data.csv")

X = enc_data.drop(columns = ['Churn', "customerID"])
y = enc_data['Churn'].values

In [80]:
def train_on_random_splits(model, X, y, K):

    reports = []
    trained_models = []
    
    for i in range(K):

        model = copy.deepcopy(model)
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, stratify=y)

        num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']

        scaler= StandardScaler()

        X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
        X_test[num_cols] = scaler.transform(X_test[num_cols])

        oversample = SMOTE(sampling_strategy='minority')

        X_resampled, y_resampled = oversample.fit_resample(X_train, y_train)

        model.fit(X_resampled, y_resampled)
        
        predicted_y = model.predict(X_test)

        report = classification_report(y_test, predicted_y, output_dict=True)["1.0"]

        reports.append(report)
        trained_models.append(model)

    return (trained_models, reports)

<a id="knn"></a>
## KNN 

In [81]:
knn_model = KNeighborsClassifier(n_neighbors = 10)
knn_models, knn_reports = train_on_random_splits(knn_model, X, y, 10)

for report in knn_reports:
    print(report)

{'precision': 0.5105386416861827, 'recall': 0.7785714285714286, 'f1-score': 0.6166902404526167, 'support': 280.0}
{'precision': 0.461038961038961, 'recall': 0.7607142857142857, 'f1-score': 0.5741239892183289, 'support': 280.0}
{'precision': 0.481651376146789, 'recall': 0.75, 'f1-score': 0.5865921787709497, 'support': 280.0}
{'precision': 0.45622119815668205, 'recall': 0.7071428571428572, 'f1-score': 0.5546218487394958, 'support': 280.0}
{'precision': 0.4444444444444444, 'recall': 0.7285714285714285, 'f1-score': 0.5520974289580515, 'support': 280.0}
{'precision': 0.508641975308642, 'recall': 0.7357142857142858, 'f1-score': 0.6014598540145986, 'support': 280.0}
{'precision': 0.4618834080717489, 'recall': 0.7357142857142858, 'f1-score': 0.5674931129476584, 'support': 280.0}
{'precision': 0.49776785714285715, 'recall': 0.7964285714285714, 'f1-score': 0.6126373626373627, 'support': 280.0}
{'precision': 0.4598698481561822, 'recall': 0.7571428571428571, 'f1-score': 0.5721997300944669, 'suppor

## Logistic Regression

In [82]:
lr_model = LogisticRegression()
lr_models, lr_reports = train_on_random_splits(lr_model, X, y, 10)

for report in lr_reports:
    print(report)

{'precision': 0.5259433962264151, 'recall': 0.7964285714285714, 'f1-score': 0.6335227272727273, 'support': 280.0}
{'precision': 0.4892703862660944, 'recall': 0.8142857142857143, 'f1-score': 0.6112600536193029, 'support': 280.0}
{'precision': 0.5177777777777778, 'recall': 0.8321428571428572, 'f1-score': 0.6383561643835617, 'support': 280.0}
{'precision': 0.5321100917431193, 'recall': 0.8285714285714286, 'f1-score': 0.6480446927374302, 'support': 280.0}
{'precision': 0.5199063231850117, 'recall': 0.7928571428571428, 'f1-score': 0.628005657708628, 'support': 280.0}
{'precision': 0.49563318777292575, 'recall': 0.8107142857142857, 'f1-score': 0.6151761517615176, 'support': 280.0}
{'precision': 0.5117370892018779, 'recall': 0.7785714285714286, 'f1-score': 0.6175637393767706, 'support': 280.0}
{'precision': 0.49776785714285715, 'recall': 0.7964285714285714, 'f1-score': 0.6126373626373627, 'support': 280.0}
{'precision': 0.511727078891258, 'recall': 0.8571428571428571, 'f1-score': 0.6408544726

## SVM

In [83]:
svc_model = LogisticRegression()
svc_models, svc_reports = train_on_random_splits(svc_model, X, y, 10)

for report in svc_reports:
    print(report)

{'precision': 0.5277777777777778, 'recall': 0.8142857142857143, 'f1-score': 0.6404494382022472, 'support': 280.0}
{'precision': 0.5195402298850574, 'recall': 0.8071428571428572, 'f1-score': 0.6321678321678321, 'support': 280.0}
{'precision': 0.5067264573991032, 'recall': 0.8071428571428572, 'f1-score': 0.6225895316804407, 'support': 280.0}
{'precision': 0.5046511627906977, 'recall': 0.775, 'f1-score': 0.6112676056338028, 'support': 280.0}
{'precision': 0.49665924276169265, 'recall': 0.7964285714285714, 'f1-score': 0.6117969821673526, 'support': 280.0}
{'precision': 0.5333333333333333, 'recall': 0.7714285714285715, 'f1-score': 0.6306569343065693, 'support': 280.0}
{'precision': 0.5111111111111111, 'recall': 0.8214285714285714, 'f1-score': 0.6301369863013698, 'support': 280.0}
{'precision': 0.5077262693156733, 'recall': 0.8214285714285714, 'f1-score': 0.6275579809004093, 'support': 280.0}
{'precision': 0.5155131264916468, 'recall': 0.7714285714285715, 'f1-score': 0.6180257510729614, 'sup

In [84]:
rf_model = RandomForestClassifier(n_estimators=150, n_jobs = -1, max_leaf_nodes = 30)
rf_models, rf_reports = train_on_random_splits(rf_model, X, y, 10)

for report in rf_reports:
    print(report)

{'precision': 0.5347721822541966, 'recall': 0.7964285714285714, 'f1-score': 0.6398852223816356, 'support': 280.0}
{'precision': 0.5373493975903615, 'recall': 0.7964285714285714, 'f1-score': 0.641726618705036, 'support': 280.0}
{'precision': 0.5278450363196125, 'recall': 0.7785714285714286, 'f1-score': 0.6291486291486291, 'support': 280.0}
{'precision': 0.5241545893719807, 'recall': 0.775, 'f1-score': 0.6253602305475504, 'support': 280.0}
{'precision': 0.5371287128712872, 'recall': 0.775, 'f1-score': 0.6345029239766082, 'support': 280.0}
{'precision': 0.5099502487562189, 'recall': 0.7321428571428571, 'f1-score': 0.6011730205278593, 'support': 280.0}
{'precision': 0.5169902912621359, 'recall': 0.7607142857142857, 'f1-score': 0.615606936416185, 'support': 280.0}
{'precision': 0.5142857142857142, 'recall': 0.7714285714285715, 'f1-score': 0.6171428571428571, 'support': 280.0}
{'precision': 0.5546875, 'recall': 0.7607142857142857, 'f1-score': 0.641566265060241, 'support': 280.0}
{'precision'

## AdaBoost

In [85]:
ada_model = AdaBoostClassifier(algorithm="SAMME")
ada_models, ada_reports = train_on_random_splits(ada_model, X, y, 10)

for report in ada_reports:
    print(report)

{'precision': 0.5688775510204082, 'recall': 0.7964285714285714, 'f1-score': 0.6636904761904762, 'support': 280.0}
{'precision': 0.5571030640668524, 'recall': 0.7142857142857143, 'f1-score': 0.6259780907668232, 'support': 280.0}
{'precision': 0.548051948051948, 'recall': 0.7535714285714286, 'f1-score': 0.6345864661654136, 'support': 280.0}
{'precision': 0.548469387755102, 'recall': 0.7678571428571429, 'f1-score': 0.6398809523809523, 'support': 280.0}
{'precision': 0.514218009478673, 'recall': 0.775, 'f1-score': 0.6182336182336182, 'support': 280.0}
{'precision': 0.5298165137614679, 'recall': 0.825, 'f1-score': 0.6452513966480447, 'support': 280.0}
{'precision': 0.5118483412322274, 'recall': 0.7714285714285715, 'f1-score': 0.6153846153846154, 'support': 280.0}
{'precision': 0.5295508274231678, 'recall': 0.8, 'f1-score': 0.6372688477951636, 'support': 280.0}
{'precision': 0.5204081632653061, 'recall': 0.7285714285714285, 'f1-score': 0.6071428571428571, 'support': 280.0}
{'precision': 0.55