In [142]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

In [143]:
df = pd.read_csv('../00_Data/06_CorrelationDrop.csv')
df.head()

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,59.0
1,1,1,1,2,100411,5,3,4.0
2,0,1,1,2,102304,5,5,26.0
3,0,1,1,4,102324,5,7,126.0
4,0,11,1,3,100619,5,7,126.0


In [144]:
def classification(df:pd.DataFrame, iteration=1):

    result = []
    for num in range(1, iteration+1):
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

        scaler = StandardScaler()
        XS_train = scaler.fit_transform(X_train)
        XS_test = scaler.transform(X_test)

        model = KNeighborsClassifier()
        model.fit(XS_train, y_train)

        train_score = round(model.score(XS_train, y_train) * 100, 2)
        test_score = round(model.score(XS_test, y_test) * 100, 2)

        stability = abs(train_score - test_score) 

        result.append([model, train_score, test_score, stability])
    
    df_result = pd.DataFrame(result)
    df_result.columns = ['Model', 'Train Score', 'Test Score', 'Stability']
    df_result = df_result.sort_values(by='Stability')

    return df_result

In [145]:
classification(df, 100).head(10)

Unnamed: 0,Model,Train Score,Test Score,Stability
7,KNeighborsClassifier(),18.44,1.2,17.24
57,KNeighborsClassifier(),19.48,1.81,17.67
70,KNeighborsClassifier(),21.56,3.61,17.95
30,KNeighborsClassifier(),20.26,1.81,18.45
2,KNeighborsClassifier(),19.48,0.6,18.88
59,KNeighborsClassifier(),21.3,2.41,18.89
3,KNeighborsClassifier(),20.78,1.81,18.97
39,KNeighborsClassifier(),20.26,1.2,19.06
86,KNeighborsClassifier(),20.26,1.2,19.06
49,KNeighborsClassifier(),20.52,1.2,19.32


In [146]:
def classify_active_months(num):

    if num < 48:
        value = 'ST'
    elif num < 96:
        value = 'MT'
    else:
        value = 'LT'

    return value

df['active_months'] = df['active_months'].apply(lambda x:classify_active_months(x))
df.head(3)

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,MT
1,1,1,1,2,100411,5,3,ST
2,0,1,1,2,102304,5,5,ST


In [147]:
classification(df, 100).head(10)

Unnamed: 0,Model,Train Score,Test Score,Stability
17,KNeighborsClassifier(),65.19,64.46,0.73
37,KNeighborsClassifier(),65.45,58.43,7.02
60,KNeighborsClassifier(),67.53,60.24,7.29
56,KNeighborsClassifier(),65.97,57.23,8.74
55,KNeighborsClassifier(),67.79,58.43,9.36
62,KNeighborsClassifier(),67.01,56.63,10.38
64,KNeighborsClassifier(),64.42,53.61,10.81
19,KNeighborsClassifier(),65.97,54.82,11.15
76,KNeighborsClassifier(),66.75,55.42,11.33
65,KNeighborsClassifier(),66.49,54.22,12.27


1. Model - LTO

In [148]:
df = pd.read_csv('../00_Data/06_CorrelationDrop.csv')
df.head()

def classify_active_months(num):

    if num > 144:
        value = 'LT'
    else:
        value = 'OTH'

    return value

df['active_months'] = df['active_months'].apply(lambda x:classify_active_months(x))
df.head(3)

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,OTH
1,1,1,1,2,100411,5,3,OTH
2,0,1,1,2,102304,5,5,OTH


In [149]:
classification(df, 100).head(10)

Unnamed: 0,Model,Train Score,Test Score,Stability
99,KNeighborsClassifier(),99.74,99.4,0.34
81,KNeighborsClassifier(),99.74,99.4,0.34
63,KNeighborsClassifier(),99.74,99.4,0.34
79,KNeighborsClassifier(),99.74,99.4,0.34
31,KNeighborsClassifier(),99.74,99.4,0.34
32,KNeighborsClassifier(),99.74,99.4,0.34
78,KNeighborsClassifier(),99.74,99.4,0.34
76,KNeighborsClassifier(),99.74,99.4,0.34
36,KNeighborsClassifier(),99.74,99.4,0.34
37,KNeighborsClassifier(),99.74,99.4,0.34


2. Model - LT2 (ABOVE 10 YEARS)

In [150]:
df = pd.read_csv('../00_Data/06_CorrelationDrop.csv')
df = df[df['active_months'] < 144]
df.head()

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,59.0
1,1,1,1,2,100411,5,3,4.0
2,0,1,1,2,102304,5,5,26.0
3,0,1,1,4,102324,5,7,126.0
4,0,11,1,3,100619,5,7,126.0


In [151]:
def classify_active_months(num):

    if num > 120:
        value = 'LT2'
    else:
        value = 'OTH'

    return value

df['active_months'] = df['active_months'].apply(lambda x:classify_active_months(x))
df.head(3)

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,OTH
1,1,1,1,2,100411,5,3,OTH
2,0,1,1,2,102304,5,5,OTH


In [152]:
classification(df, 100)

Unnamed: 0,Model,Train Score,Test Score,Stability
2,KNeighborsClassifier(),94.52,94.55,0.03
58,KNeighborsClassifier(),94.52,94.55,0.03
94,KNeighborsClassifier(),93.99,93.94,0.05
48,KNeighborsClassifier(),93.99,93.94,0.05
65,KNeighborsClassifier(),93.99,93.94,0.05
...,...,...,...,...
66,KNeighborsClassifier(),95.56,90.91,4.65
5,KNeighborsClassifier(),95.56,90.91,4.65
21,KNeighborsClassifier(),95.82,90.91,4.91
29,KNeighborsClassifier(),95.82,90.91,4.91


3. Model - LT1 (ABOVE 8 YEARS)

In [153]:
df = pd.read_csv('../00_Data/06_CorrelationDrop.csv')
df = df[df['active_months'] < 120]
df.head()

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,59.0
1,1,1,1,2,100411,5,3,4.0
2,0,1,1,2,102304,5,5,26.0
5,0,1,1,3,100846,5,7,112.0
7,0,1,1,3,100598,5,7,13.0


In [154]:
def classify_active_months(num):

    if num > 96:
        value = 'LT1'
    else:
        value = 'OTH'

    return value

df['active_months'] = df['active_months'].apply(lambda x:classify_active_months(x))
df.head(3)

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,OTH
1,1,1,1,2,100411,5,3,OTH
2,0,1,1,2,102304,5,5,OTH


In [155]:
classification(df, 100).head(10)

Unnamed: 0,Model,Train Score,Test Score,Stability
45,KNeighborsClassifier(),91.69,91.67,0.02
46,KNeighborsClassifier(),91.69,91.67,0.02
24,KNeighborsClassifier(),91.69,91.67,0.02
30,KNeighborsClassifier(),91.14,91.03,0.11
28,KNeighborsClassifier(),91.14,91.03,0.11
3,KNeighborsClassifier(),91.41,91.67,0.26
83,KNeighborsClassifier(),91.97,91.67,0.3
21,KNeighborsClassifier(),91.97,91.67,0.3
84,KNeighborsClassifier(),91.41,91.03,0.38
79,KNeighborsClassifier(),92.24,91.67,0.57


4. Model - MT2 (ABOVE 6 YEARS)

In [156]:
df = pd.read_csv('../00_Data/06_CorrelationDrop.csv')
df = df[df['active_months'] < 96]
df.head()

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,59.0
1,1,1,1,2,100411,5,3,4.0
2,0,1,1,2,102304,5,5,26.0
7,0,1,1,3,100598,5,7,13.0
8,0,11,1,1,100726,5,7,42.0


In [157]:
def classify_active_months(num):

    if num > 72:
        value = 'MT2'
    else:
        value = 'OTH'

    return value

df['active_months'] = df['active_months'].apply(lambda x:classify_active_months(x))
df.head(3)

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,OTH
1,1,1,1,2,100411,5,3,OTH
2,0,1,1,2,102304,5,5,OTH


In [158]:
classification(df, 100).head(10)

Unnamed: 0,Model,Train Score,Test Score,Stability
64,KNeighborsClassifier(),85.5,85.31,0.19
41,KNeighborsClassifier(),85.5,85.31,0.19
40,KNeighborsClassifier(),85.5,85.31,0.19
68,KNeighborsClassifier(),85.5,85.31,0.19
24,KNeighborsClassifier(),85.5,85.31,0.19
33,KNeighborsClassifier(),85.8,86.01,0.21
34,KNeighborsClassifier(),84.89,84.62,0.27
3,KNeighborsClassifier(),86.4,86.01,0.39
84,KNeighborsClassifier(),85.8,85.31,0.49
87,KNeighborsClassifier(),85.8,85.31,0.49


5. Model - MT1 (ABOVE 4 YEARS)

In [159]:
df = pd.read_csv('../00_Data/06_CorrelationDrop.csv')
df = df[df['active_months'] < 72]
df.head()

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,59.0
1,1,1,1,2,100411,5,3,4.0
2,0,1,1,2,102304,5,5,26.0
7,0,1,1,3,100598,5,7,13.0
8,0,11,1,1,100726,5,7,42.0


In [160]:
def classify_active_months(num):

    if num > 48:
        value = 'MT1'
    else:
        value = 'OTH'

    return value

df['active_months'] = df['active_months'].apply(lambda x:classify_active_months(x))
df.head(3)

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
0,1,1,1,2,102292,5,2,MT1
1,1,1,1,2,100411,5,3,OTH
2,0,1,1,2,102304,5,5,OTH


In [161]:
classification(df, 100).head(10)

Unnamed: 0,Model,Train Score,Test Score,Stability
38,KNeighborsClassifier(),78.93,79.17,0.24
66,KNeighborsClassifier(),78.57,78.33,0.24
70,KNeighborsClassifier(),80.0,79.17,0.83
61,KNeighborsClassifier(),79.29,78.33,0.96
49,KNeighborsClassifier(),77.14,78.33,1.19
24,KNeighborsClassifier(),77.86,76.67,1.19
44,KNeighborsClassifier(),77.86,79.17,1.31
86,KNeighborsClassifier(),80.0,78.33,1.67
39,KNeighborsClassifier(),78.57,76.67,1.9
27,KNeighborsClassifier(),77.86,75.83,2.03


6. Model - ST2 (ABOVE 2 YEARS)

In [162]:
df = pd.read_csv('../00_Data/06_CorrelationDrop.csv')
df = df[df['active_months'] < 48]
df.head()

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
1,1,1,1,2,100411,5,3,4.0
2,0,1,1,2,102304,5,5,26.0
7,0,1,1,3,100598,5,7,13.0
8,0,11,1,1,100726,5,7,42.0
9,0,11,1,1,100667,5,7,43.0


In [163]:
def classify_active_months(num):

    if num > 12:
        value = 'ST2'
    else:
        value = 'ST1'

    return value

df['active_months'] = df['active_months'].apply(lambda x:classify_active_months(x))
df.head(3)

Unnamed: 0,Gender,Term Sub Reason,Location,Age Bucket,Manager ID,Talent,Department,active_months
1,1,1,1,2,100411,5,3,ST1
2,0,1,1,2,102304,5,5,ST2
7,0,1,1,3,100598,5,7,ST2


In [164]:
classification(df, 100).head(10)

Unnamed: 0,Model,Train Score,Test Score,Stability
4,KNeighborsClassifier(),85.44,85.39,0.05
49,KNeighborsClassifier(),84.47,84.27,0.2
22,KNeighborsClassifier(),84.47,84.27,0.2
97,KNeighborsClassifier(),84.47,84.27,0.2
9,KNeighborsClassifier(),84.47,84.27,0.2
63,KNeighborsClassifier(),83.98,84.27,0.29
51,KNeighborsClassifier(),83.98,84.27,0.29
8,KNeighborsClassifier(),84.95,85.39,0.44
55,KNeighborsClassifier(),84.95,85.39,0.44
1,KNeighborsClassifier(),84.95,85.39,0.44
