## Productivity

### 1. Import and cleaning data

xxx

In [2]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm

import matplotlib.pyplot as plt
plt.style.use('seaborn-ticks')
%matplotlib inline

pd.set_option("display.max_columns",101)

import gc
import math

In [3]:
df = pd.read_pickle(r'C:\Users\mjani\OneDrive\ML1\productivity_ready.p')
print(df.columns)
print(df.success.unique())

Index(['date', 'quarter', 'department', 'day', 'team', 'targeted_productivity',
       'smv', 'wip', 'over_time', 'incentive', 'idle_time', 'idle_men',
       'no_of_style_change', 'no_of_workers', 'actual_productivity', 'prod',
       'success', 'is_incentive', 'is_idle_time', 'month', 'is_style_change'],
      dtype='object')
[1 0]


In [4]:
# features that we will use
features = ["success","month", "department", "day", "team", "smv","over_time",
            "is_incentive", "is_idle_time", "is_style_change", "no_of_workers"]
levCols = []
numCols = []
for col in features:
    if df[col].dtype==object:
        levCols.append(col)
    else:
        numCols.append(col)

In [5]:
df["success"]

0       1
1       1
2       1
3       1
4       1
       ..
1192    0
1193    0
1194    0
1195    0
1196    0
Name: success, Length: 1197, dtype: int64

In [6]:
for var in levCols:
    display(((pd.crosstab(df["success"], df[var], margins=True)/df.shape[0])*100).round(1))

month,1,2,3,All
success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,9.9,12.5,4.4,26.9
1,35.3,24.5,13.3,73.1
All,45.3,37.0,17.7,100.0


department,finishing,sweing,All
success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,17.0,9.9,26.9
1,25.2,47.9,73.1
All,42.3,57.7,100.0


day,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,All
success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,3.8,3.8,5.3,5.4,4.1,4.3,26.9
1,12.8,11.8,11.6,11.2,12.7,13.0,73.1
All,16.6,15.6,17.0,16.6,16.8,17.4,100.0


team,1,10,11,12,2,3,4,5,6,7,8,9,All
success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1.3,2.5,2.1,1.3,2.3,0.9,1.6,2.1,2.6,3.3,4.1,2.8,26.9
1,7.5,5.8,5.3,6.9,6.9,7.0,7.2,5.7,5.3,4.7,5.0,5.8,73.1
All,8.8,8.4,7.4,8.3,9.1,7.9,8.8,7.8,7.9,8.0,9.1,8.7,100.0


is_incentive,0,1,2,All
success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,22.5,4.3,0.2,26.9
1,28.0,44.9,0.3,73.1
All,50.5,49.1,0.4,100.0


is_idle_time,0,1,All
success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,25.7,1.2,26.9
1,72.8,0.3,73.1
All,98.5,1.5,100.0


is_style_change,0,1,All
success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,23.5,3.4,26.9
1,64.2,8.9,73.1
All,87.7,12.3,100.0


In [6]:
# Lets calcualte some stats to see whichc varabiles are most valuable.
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
df2 = df.copy()
for col in levCols:
    df2[col] = le.fit_transform(df[col])
numCols.remove("success")

In [7]:
from scipy import stats
from sklearn import feature_selection

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

minfos=[]
for var in levCols:
    print("\n", var)
    print("Pearson", stats.pearsonr(df2["success"], df2[var]))
    print("Mutual info", feature_selection.mutual_info_classif(df2[var].values.reshape(-1,1),df2["success"].values,random_state = 0))
    minfos.append(feature_selection.mutual_info_classif(df2[var].values.reshape(-1,1),df2["success"].values,random_state = 0))
    print("Chi2", feature_selection.chi2(df2[var].values.reshape(-1,1),df2["success"].values))
    print("Anova", feature_selection.f_classif(df2[var].values.reshape(-1,1),df2["success"].values))

imp = list(zip(minfos, levCols))

imp.sort(reverse=True)
imp


 month
Pearson (-0.05764356044352116, 0.04616207439921663)
Mutual info [0.01927889]
Chi2 (array([3.04161998]), array([0.08115489]))
Anova (array([3.984045], dtype=float32), array([0.04615975], dtype=float32))

 department
Pearson (0.2588863753228165, 8.775348222056215e-20)
Mutual info [0.01963103]
Chi2 (array([33.9132106]), array([5.76260863e-09]))
Anova (array([85.84513], dtype=float32), array([8.774773e-20], dtype=float32))

 day
Pearson (-0.009713387050659661, 0.7370838757260759)
Mutual info [0.00737069]
Chi2 (array([0.13087163]), array([0.71753001]))
Anova (array([0.1126998], dtype=float32), array([0.7371501], dtype=float32))

 team
Pearson (-0.14742253143349415, 3.004471631459274e-07)
Mutual info [0.02188792]
Chi2 (array([57.09675386]), array([4.14889915e-14]))
Anova (array([26.548443], dtype=float32), array([3.0044117e-07], dtype=float32))

 is_incentive
Pearson (0.3923872130839211, 2.4053065712321445e-45)
Mutual info [0.07920106]
Chi2 (array([95.30859648]), array([1.62904208e-2

[(array([0.07920106]), 'is_incentive'),
 (array([0.02188792]), 'team'),
 (array([0.01963103]), 'department'),
 (array([0.01927889]), 'month'),
 (array([0.00737069]), 'day'),
 (array([0]), 'is_style_change'),
 (array([0]), 'is_idle_time')]

The most important value seems to be is_incentive, which mutual information has the highest value. The least important are: is_style_change, is_idle_time.

In [9]:
df2.head()

Unnamed: 0,date,quarter,department,day,team,targeted_productivity,smv,wip,over_time,incentive,idle_time,idle_men,no_of_style_change,no_of_workers,actual_productivity,prod,success,is_incentive,is_idle_time,month,is_style_change
0,2015-01-01,Quarter1,1,3,10,0.8,26.16,1108.0,7080,98,0.0,0,0,59.0,0.940725,1.175907,1,1,0,0,0
1,2015-01-01,Quarter1,0,3,0,0.75,3.94,,960,0,0.0,0,0,8.0,0.8865,1.182,1,0,0,0,0
2,2015-01-01,Quarter1,1,3,2,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057,1.000713,1,1,0,0,0
3,2015-01-01,Quarter1,1,3,3,0.8,11.41,968.0,3660,50,0.0,0,0,30.5,0.80057,1.000713,1,1,0,0,0
4,2015-01-01,Quarter1,1,3,8,0.8,25.9,1170.0,1920,50,0.0,0,0,56.0,0.800382,1.000477,1,1,0,0,0


Below there are values of mutual information for continuous variables. The hishest one is for smv.

In [10]:
from scipy import stats
from sklearn import feature_selection

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

minfos=[]
for var in numCols:
    print(var, " Mutual info", feature_selection.mutual_info_regression(df2[var].astype(float).values.reshape(-1,1),df2["success"].astype(float).values,random_state = 0))
    minfos.append(feature_selection.mutual_info_regression(df2[var].astype(float).values.reshape(-1,1),df2["success"].astype(float).values,random_state = 0))
# Sort recorded values
imp = list(zip(minfos, numCols))
imp.sort(reverse=True)
imp

smv  Mutual info [0.134211]
over_time  Mutual info [0.03991998]
no_of_workers  Mutual info [0.05295754]


[(array([0.134211]), 'smv'),
 (array([0.05295754]), 'no_of_workers'),
 (array([0.03991998]), 'over_time')]

##### Creating variables
Here we get dummies and 

In [10]:
# KNN
# AND now switch to dummies
features = ["success","month", "department", "day", "team", "smv","over_time",
            "is_incentive", "is_idle_time", "is_style_change", "no_of_workers"]
levCols = []
numCols = []
for col in features:
    if df[col].dtype==object:
        levCols.append(col)
    else:
        numCols.append(col)
dummLev = pd.get_dummies(df[levCols], drop_first=True)
dummLev.shape
df_dummy = pd.concat([df[numCols], dummLev], axis=1)
numCols.remove("success")
df_dummy[numCols] = df_dummy[numCols].apply(lambda x: (x-x.mean())/x.std())
features = df_dummy.columns.tolist()
features.remove("success")

In [7]:
# # duplicate code?????
# features = ["success","month", "department", "day", "team", "smv","over_time",
#             "is_incentive", "is_idle_time", "is_style_change", "no_of_workers"]
# levCols = []
# numCols = []
# for col in features:
#     if df[col].dtype==object:
#         levCols.append(col)
#     else:
#         numCols.append(col)
# dummLev = pd.get_dummies(df[levCols], drop_first=True)
# dummLev.shape
# df3_dummy = pd.concat([df[numCols], dummLev], axis=1)

In [11]:
features = df_dummy.columns.tolist()
features.remove("success")
features


['smv',
 'over_time',
 'no_of_workers',
 'month_2',
 'month_3',
 'department_sweing',
 'day_Saturday',
 'day_Sunday',
 'day_Thursday',
 'day_Tuesday',
 'day_Wednesday',
 'team_10',
 'team_11',
 'team_12',
 'team_2',
 'team_3',
 'team_4',
 'team_5',
 'team_6',
 'team_7',
 'team_8',
 'team_9',
 'is_incentive_1',
 'is_incentive_2',
 'is_idle_time_1',
 'is_style_change_1']

### 2. Feature selection

In [13]:
# Now lets try logistic regression
from sklearn import metrics
import statsmodels.api as sm
    
aucs=[]
for var in features:
    mod = sm.GLM.from_formula(formula="success ~ " + var, data=df3_dummy, family=sm.families.Binomial())
    res = mod.fit()
    probs = res.predict()
    aucs.append(metrics.roc_auc_score(df3_dummy["success"].values, probs))
#     print(var, metrics.roc_auc_score(df3_dummy["success"].values, probs))

print("\n", "--------------", "\n")
imp = list(zip(aucs, features))
imp.sort(reverse=True)
imp


 -------------- 



[(0.727664596273292, 'is_incentive_1'),
 (0.6744081632653061, 'no_of_workers'),
 (0.6441987577639751, 'department_sweing'),
 (0.6408731144631765, 'over_time'),
 (0.6348127772848269, 'smv'),
 (0.5654906832298137, 'month_2'),
 (0.5418012422360249, 'team_8'),
 (0.5309192546583852, 'team_3'),
 (0.530111801242236, 'team_7'),
 (0.524360248447205, 'day_Thursday'),
 (0.5225838509316769, 'team_12'),
 (0.5199503105590062, 'day_Sunday'),
 (0.5196397515527951, 'team_4'),
 (0.5194534161490683, 'is_idle_time_1'),
 (0.5127950310559006, 'team_9'),
 (0.5121366459627329, 'team_6'),
 (0.5107701863354037, 'day_Tuesday'),
 (0.5091428571428571, 'day_Saturday'),
 (0.5085590062111801, 'month_3'),
 (0.5083975155279503, 'day_Wednesday'),
 (0.506583850931677, 'team_10'),
 (0.5049316770186335, 'team_2'),
 (0.5030931677018634, 'is_style_change_1'),
 (0.5028198757763975, 'team_11'),
 (0.5013913043478261, 'is_incentive_2'),
 (0.5000372670807453, 'team_5')]

In [14]:
chosen_vars = []
for auc,var in imp:
    if auc>= 0.51:
        chosen_vars.append(var)
        chosen_vars
chosen_vars

['is_incentive_1',
 'no_of_workers',
 'department_sweing',
 'over_time',
 'smv',
 'month_2',
 'team_8',
 'team_3',
 'team_7',
 'day_Thursday',
 'team_12',
 'day_Sunday',
 'team_4',
 'is_idle_time_1',
 'team_9',
 'team_6',
 'day_Tuesday']

In [8]:
# # KNN
# # poszło na gore 
# features = ["success","month", "department", "day", "team", "smv","over_time",
#             "is_incentive", "is_idle_time", "is_style_change", "no_of_workers"]
# levCols = []
# numCols = []
# for col in features:
#     if df[col].dtype==object:
#         levCols.append(col)
#     else:
#         numCols.append(col)
# dummLev = pd.get_dummies(df[levCols], drop_first=True)
# dummLev.shape
# df_dummy = pd.concat([df[numCols], dummLev], axis=1)
# numCols.remove("success")
# df_dummy[numCols] = df_dummy[numCols].apply(lambda x: (x-x.mean())/x.std())
# features = df_dummy.columns.tolist()
# features.remove("success")

In [16]:
df_dummy.dtypes

success                int64
smv                  float64
over_time            float64
no_of_workers        float64
month_2                uint8
month_3                uint8
department_sweing      uint8
day_Saturday           uint8
day_Sunday             uint8
day_Thursday           uint8
day_Tuesday            uint8
day_Wednesday          uint8
team_10                uint8
team_11                uint8
team_12                uint8
team_2                 uint8
team_3                 uint8
team_4                 uint8
team_5                 uint8
team_6                 uint8
team_7                 uint8
team_8                 uint8
team_9                 uint8
is_incentive_1         uint8
is_incentive_2         uint8
is_idle_time_1         uint8
is_style_change_1      uint8
dtype: object

In [18]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import neighbors
df_dummy["success"] = df_dummy["success"].astype(int)
def runKNN(features=features):
    n_neighbors = 30
    kf = KFold(n_splits=5, random_state=0)
    probs = []
    aucs = []
    clf = neighbors.KNeighborsClassifier(n_neighbors, n_jobs=-1, p=1)
    for train, test in kf.split(df_dummy.index.values):
        X = df_dummy.iloc[train][features]
        clf.fit(X.values, df_dummy.iloc[X.index.values]["success"].values)
        prob = clf.predict_proba(df_dummy.iloc[test][features].values)
        aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1]))
    print(np.mean(aucs), aucs)
    return np.mean(aucs)

In [19]:
# Długo sie liczy! + moze nie warto dodawac?
# newFeatures = []
# tempFeatures = []

# for k in range(len(features)):
#     aucs=[]
#     featToTest = list(set(features) - set(newFeatures))
#     for feat in featToTest:
#         tempFeatures = newFeatures.copy()
#         tempFeatures.append(feat)
#         auc = runKNN(tempFeatures)
#         aucs.append(auc)
#         print(feat, tempFeatures, auc)
#     imp = list(zip(aucs, featToTest))
#     imp.sort(reverse=True)
#     print(imp[0:5])
#     print("##############")
#     print("Selecting:", imp[0][1])
#     newFeatures.append(imp[0][1])
#     print("Current selection:", newFeatures)
#     print("##############")

We decide to remove observations with auc below 0.51 to improve the model. Most of the values that were pointed above were also important in logit model. <br>
Next step is to create models: Logit, SVM and KNN.

### 3. Modeling

### 3.1. Logit model

In [22]:
all_columns = "+".join(chosen_vars)
my_formula = "success ~" + all_columns
my_formula # formula used in code for logit model

'success ~is_incentive_1+no_of_workers+department_sweing+over_time+smv+month_2+team_8+team_3+team_7+day_Thursday+team_12+day_Sunday+team_4+is_idle_time_1+team_9+team_6+day_Tuesday'

In [23]:
mod = sm.GLM.from_formula( formula= my_formula , data= df_dummy, family=sm.families.Binomial())
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,success,No. Observations:,1197.0
Model:,GLM,Df Residuals:,1179.0
Model Family:,Binomial,Df Model:,17.0
Link Function:,logit,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-530.76
Date:,"Sun, 30 May 2021",Deviance:,1061.5
Time:,23:24:58,Pearson chi2:,1360.0
No. Iterations:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.9801,0.393,2.493,0.013,0.210,1.751
is_incentive_1,2.6607,0.283,9.412,0.000,2.107,3.215
no_of_workers,1.7037,0.406,4.201,0.000,0.909,2.499
department_sweing,-0.8160,0.680,-1.199,0.230,-2.150,0.518
over_time,-0.3378,0.146,-2.316,0.021,-0.624,-0.052
smv,-1.3382,0.215,-6.219,0.000,-1.760,-0.916
month_2,-0.4472,0.170,-2.624,0.009,-0.781,-0.113
team_8,-0.6663,0.254,-2.624,0.009,-1.164,-0.169
team_3,0.8311,0.362,2.294,0.022,0.121,1.541


In [24]:
from sklearn.metrics import roc_auc_score
kf = KFold(n_splits=5, shuffle=True, random_state=0)

for train, test in kf.split(df_dummy.index.values):
    mod = sm.GLM.from_formula(formula=my_formula, data=df_dummy.iloc[train], family=sm.families.Binomial())
    res = mod.fit()
    predsTrain = res.predict()
    preds = res.predict(df_dummy.iloc[test])
    print("Train AUC:", roc_auc_score(df_dummy.iloc[train].success, predsTrain), "Valid AUC:", roc_auc_score(df_dummy.iloc[test].success, preds))


Train AUC: 0.8381575007485776 Valid AUC: 0.7713068181818182
Train AUC: 0.8192279760907212 Valid AUC: 0.8431110344232594
Train AUC: 0.8142851619246468 Valid AUC: 0.8579996392496393
Train AUC: 0.8323395681886249 Valid AUC: 0.7772797378060536
Train AUC: 0.8251576438235743 Valid AUC: 0.7983735747820254


Valid AUC are quite different so there is a need to .....

To choose the best number of KFolds, we run logit model and get a mean value od AUC.

In [41]:
folds = [4,5,6,7,8,9,10]
for f in folds:
    trainRes = []
    valRes = []
    
    kf = KFold(n_splits=f, shuffle=True)

    for train, test in kf.split(df_dummy.index.values):
        mod = sm.GLM.from_formula(formula=my_formula, data=df_dummy.iloc[train], family=sm.families.Binomial())
        res = mod.fit()
        predsTrain = res.predict()
        preds = res.predict(df_dummy.iloc[test])
        trainRes.append(roc_auc_score(df_dummy.iloc[train].success, predsTrain))
        valRes.append(roc_auc_score(df_dummy.iloc[test].success, preds))
    print("KFolds:", f, "Train AUC:", np.mean(trainRes), "Valid AUC:", np.mean(valRes))

KFolds: 4 Train AUC: 0.8261450015864831 Valid AUC: 0.8128754229403008
KFolds: 5 Train AUC: 0.825276918785337 Valid AUC: 0.8076019628846801
KFolds: 6 Train AUC: 0.8257879278280572 Valid AUC: 0.8102525608493919
KFolds: 7 Train AUC: 0.8251246692483788 Valid AUC: 0.8132993521595209
KFolds: 8 Train AUC: 0.8251416050465292 Valid AUC: 0.8050649056281969
KFolds: 9 Train AUC: 0.8251817376353974 Valid AUC: 0.8076095479343024
KFolds: 10 Train AUC: 0.8251181719625758 Valid AUC: 0.8017022929995555


It seems that for KFolds = 7, Valid AUC is the best. <br>
KFolds: 7 <br>
Train AUC: 0.8251246692483788 <br> Valid AUC: 0.8132993521595209

In [47]:
trainRes = []
valRes = []
    
kf = KFold(n_splits=7, shuffle=True)

for train, test in kf.split(df_dummy.index.values):
    mod = sm.GLM.from_formula(formula=my_formula, data=df_dummy.iloc[train], family=sm.families.Binomial())
    res = mod.fit()
    predsTrain = res.predict()
    preds = res.predict(df_dummy.iloc[test])
    trainRes.append(roc_auc_score(df_dummy.iloc[train].success, predsTrain))
    valRes.append(roc_auc_score(df_dummy.iloc[test].success, preds))
print("Train AUC:", np.mean(trainRes), "Valid AUC:", np.mean(valRes))

Train AUC: 0.825386666776593 Valid AUC: 0.8093099460039309


### 3.2. SVM

In [48]:
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.svm import SVC

First step is to choose the best kernel.


In [56]:
aucs = []
aucsT = []
clf = SVC(C = 1, cache_size=500, kernel= 'linear',degree = 1,
          max_iter=-1, probability=True,
          tol=0.001, verbose=False, random_state = 3)
for train, test in kf.split(df_dummy.index.values):
    clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
    prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars])
    aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1])) 

    prob2 = clf.predict_proba(df_dummy.iloc[train][chosen_vars])
    aucsT.append(metrics.roc_auc_score(df_dummy.iloc[train]["success"].values, prob2[:,1]))
print(c)
print("Train set: ", np.mean(aucsT))
print("Validation set: ", np.mean(aucs))
print("----------------------------")

10
Train set:  0.8136593313304242
Validation set:  0.7772868615567508
----------------------------


In [57]:
aucs = []
aucsT = []
clf = SVC(C = 1, cache_size=500, kernel= 'poly',
          max_iter=-1, probability=True,
          tol=0.001, verbose=False, random_state = 3)
for train, test in kf.split(df_dummy.index.values):
    clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
    prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars])
    aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1])) 

    prob2 = clf.predict_proba(df_dummy.iloc[train][chosen_vars])
    aucsT.append(metrics.roc_auc_score(df_dummy.iloc[train]["success"].values, prob2[:,1]))
print(c)
print("Train set: ", np.mean(aucsT))
print("Validation set: ", np.mean(aucs))
print("----------------------------")

10
Train set:  0.872347334945539
Validation set:  0.7601503684303025
----------------------------


In [66]:
aucs = []
aucsT = []
clf = SVC(C = 1, cache_size=500, kernel= 'rbf',
          max_iter=-1, probability=True,
          tol=0.001, verbose=False, random_state = 3)
for train, test in kf.split(df_dummy.index.values):
    clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
    prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars])
    aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1])) 

    prob2 = clf.predict_proba(df_dummy.iloc[train][chosen_vars])
    aucsT.append(metrics.roc_auc_score(df_dummy.iloc[train]["success"].values, prob2[:,1]))
print(c)
print("Train set: ", np.mean(aucsT))
print("Validation set: ", np.mean(aucs))
print("----------------------------")

10
Train set:  0.865473575368106
Validation set:  0.7834616595222912
----------------------------


The best one is RBF with degree 4.

Next step is to choose the best cost. As we can see below for c = 2 we get the best Validation set AUC. <br>
Train set:  0.8742633003804725 <br>
Validation set:  0.7783666481627748

In [67]:
kf = KFold(n_splits=5, random_state = 3)

C = [0.01, 0.1, 0.5, 1, 2, 3, 5, 10]

for c in C:
    aucs = []
    aucsT = []
    clf = SVC(C = c, cache_size=500, kernel= 'rbf',
              max_iter=-1, probability=True,
              tol=0.001, verbose=False, random_state = 3)
    for train, test in kf.split(df_dummy.index.values):
        clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
        prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars])
        aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1])) 

        prob2 = clf.predict_proba(df_dummy.iloc[train][chosen_vars])
        aucsT.append(metrics.roc_auc_score(df_dummy.iloc[train]["success"].values, prob2[:,1]))
    print(c)
    print("Train set: ", np.mean(aucsT))
    print("Validation set: ", np.mean(aucs))
    print("----------------------------")



0.01
Train set:  0.8527592827905375
Validation set:  0.7687428727714005
----------------------------
0.1
Train set:  0.8559087040741027
Validation set:  0.7704166532595091
----------------------------
0.5
Train set:  0.8586321395743924
Validation set:  0.7802745813762769
----------------------------
1
Train set:  0.865473575368106
Validation set:  0.7834616595222912
----------------------------
2
Train set:  0.8742633003804725
Validation set:  0.7783666481627748
----------------------------
3
Train set:  0.8798693583328427
Validation set:  0.7717168486754633
----------------------------
5
Train set:  0.888834454231262
Validation set:  0.7590496381917148
----------------------------
10
Train set:  0.9009400027955454
Validation set:  0.7575091278373584
----------------------------


In [62]:
kf = KFold(n_splits=5, random_state = 3)

aucs = []
aucsT = []
clf = SVC(C = 2, cache_size=500, kernel= 'rbf',
          max_iter=-1, probability=True,
          tol=0.001, verbose=False, random_state = 3)
for train, test in kf.split(df_dummy.index.values):
    clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
    prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars])
    aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1])) 

    prob2 = clf.predict_proba(df_dummy.iloc[train][chosen_vars])
    aucsT.append(metrics.roc_auc_score(df_dummy.iloc[train]["success"].values, prob2[:,1]))
print(c)
print("Train set: ", np.mean(aucsT))
print("Validation set: ", np.mean(aucs))
print("----------------------------")

10
Train set:  0.8742633003804725
Validation set:  0.7783666481627748
----------------------------


Next step is to chose the best value for KFolds. The 5 folds are the best for SVM.

In [70]:
K = [3, 5, 8, 10]
for k in K:
    
    import warnings
    warnings.simplefilter(action='ignore', category=FutureWarning)
    
    kf = KFold(n_splits=k, random_state = 3)

    aucs = []
    aucsT = []
    clf = SVC(C = 2, cache_size=500, kernel= 'rbf',
              max_iter=-1, probability=True,
              tol=0.001, verbose=False, random_state = 3)
    for train, test in kf.split(df_dummy.index.values):
        clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
        prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars])
        aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1])) 

        prob2 = clf.predict_proba(df_dummy.iloc[train][chosen_vars])
        aucsT.append(metrics.roc_auc_score(df_dummy.iloc[train]["success"].values, prob2[:,1]))
    print(k)
    print("Train set: ", np.mean(aucsT))
    print("Validation set: ", np.mean(aucs))
    print("----------------------------")

3
Train set:  0.8762728228545202
Validation set:  0.7686167473801676
----------------------------
5
Train set:  0.8742633003804725
Validation set:  0.7783666481627748
----------------------------
8
Train set:  0.870991246089796
Validation set:  0.770489034215821
----------------------------
10
Train set:  0.8700417542850589
Validation set:  0.7768644329709875
----------------------------


#### Place for in imbalance

#### Best SVM model


Value of AUC is much smaller than in logit model.

In [71]:
kf = KFold(n_splits=5, random_state = 3)
aucs = []
aucsT = []


clf = SVC(C = 2, cache_size=500, kernel= 'rbf',
max_iter=-1, probability=True,
tol=0.001, verbose=False, random_state = 3)

for train, test in kf.split(df_dummy.index.values):
    clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
    prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars])
    aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1]))



    prob2 = clf.predict_proba(df_dummy.iloc[train][chosen_vars])
    aucsT.append(metrics.roc_auc_score(df_dummy.iloc[train]["success"].values, prob2[:,1]))



print("Train set: ", np.mean(aucsT))
print("Validation set: ", np.mean(aucs))
print("----------------------------")

Train set:  0.8742633003804725
Validation set:  0.7783666481627748
----------------------------


## KNN

In [72]:
from sklearn.model_selection import StratifiedKFold
import math
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn import neighbors

First thing is to chose the best KFold number. As it seems the best values are 6 and 9. WHich one to choose????? <br>
folds:  6 <br>
AUC:  0.8086128044390489<br>
ACC:  0.768538525963149<br>
folds:  9<br>
AUC:  0.809097714977945<br>
ACC:  0.7677527151211361<br>

In [77]:
splits = [4,5,6, 7,8,9, 10]
for split in splits:
    kf = KFold(n_splits=split, shuffle=True,random_state=0)
    probs = []
    indicies = []
    aucs = []
    bacc = []
    accs = []
    n_neighbors = int(math.sqrt(df_dummy.shape[0]))
    clf = neighbors.KNeighborsClassifier(n_neighbors, n_jobs=-1, p=1)
    for train, test in kf.split(df_dummy.index.values):
        clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
        prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars].values)
        probs.append(prob)
        indicies.append(test)
        aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1]))
        accs.append(metrics.accuracy_score(df_dummy.iloc[test]["success"].values, (prob[:,1]>0.50).astype(int)))
    print("folds: ", split)
    print('AUC: ', np.mean(aucs))
    print('ACC: ',np.mean(accs))


folds:  4
AUC:  0.8000407312461417
ACC:  0.76273132664437
folds:  5
AUC:  0.8017439190072244
ACC:  0.7660564853556485
folds:  6
AUC:  0.8086128044390489
ACC:  0.768538525963149
folds:  7
AUC:  0.8052018579196293
ACC:  0.7677527151211362
folds:  8
AUC:  0.8022637910110286
ACC:  0.7702125279642058
folds:  9
AUC:  0.809097714977945
ACC:  0.7677527151211361
folds:  10
AUC:  0.8057009383625025
ACC:  0.7701680672268908


Best auc is for 8 folds. ACC:  0.781619779156479

In [78]:
# Manhattan
splits = [4,5,6, 7,8,9, 10]
for split in splits:
    kf = KFold(n_splits=split, random_state=0)
    probs = []
    indicies = []
    aucs = []
    bacc = []
    accs = []
    n_neighbors = int(math.sqrt(df_dummy.shape[0]))
    clf = neighbors.KNeighborsClassifier(n_neighbors, n_jobs=-1, p=2)
    for train, test in kf.split(df_dummy.index.values):
        clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
        prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars].values)
        probs.append(prob)
        indicies.append(test)
        aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1]))
        accs.append(metrics.accuracy_score(df_dummy.iloc[test]["success"].values, (prob[:,1]>0.50).astype(int)))
    print("folds: ", split)
    print('AUC: ', np.mean(aucs))
    print('ACC: ',np.mean(accs))


folds:  4
AUC:  0.7725186881409163
ACC:  0.762675585284281
folds:  5
AUC:  0.7754703681562208
ACC:  0.7551046025104602
folds:  6
AUC:  0.7826712501156131
ACC:  0.7500670016750419
folds:  7
AUC:  0.7805300224961821
ACC:  0.7577276524644946
folds:  8
AUC:  0.7811068579407554
ACC:  0.754261744966443
folds:  9
AUC:  0.7820674608299727
ACC:  0.7577276524644946
folds:  10
AUC:  0.7835929490111873
ACC:  0.7567997198879551


Best auc is for 10 folds. ACC:  0.7567997198879551 MANHATTAN


For 0.8 * sqrt of number of obserations

In [81]:
# Manhattan
neighbors_list = [0.7, 0.8, 0.9, 1, 1.1, 1.2, 1.3]
for n in neighbors_list:
    kf = KFold(n_splits=10, random_state=0)
    probs = []
    indicies = []
    aucs = []
    bacc = []
    accs = []
    n_neighbors = int(math.sqrt(df_dummy.shape[0]))
    clf = neighbors.KNeighborsClassifier(int(n_neighbors*n), n_jobs=-1, p=2)
    for train, test in kf.split(df_dummy.index.values):
        clf.fit(df_dummy.iloc[train][chosen_vars].values, df_dummy.iloc[train]["success"].values)
        prob = clf.predict_proba(df_dummy.iloc[test][chosen_vars].values)
        probs.append(prob)
        indicies.append(test)
        aucs.append(metrics.roc_auc_score(df_dummy.iloc[test]["success"].values, prob[:,1]))
        accs.append(metrics.accuracy_score(df_dummy.iloc[test]["success"].values, (prob[:,1]>0.50).astype(int)))
    print("Number of neighbours: ", n, int(n_neighbors*n))
    print('AUC: ', np.mean(aucs))
    print('ACC: ',np.mean(accs))


Number of neighbours:  0.7 23
AUC:  0.7862866025004377
ACC:  0.7693627450980391
Number of neighbours:  0.8 27
AUC:  0.7862386590498829
ACC:  0.754313725490196
Number of neighbours:  0.9 30
AUC:  0.7823645481741375
ACC:  0.7534803921568628
Number of neighbours:  1 34
AUC:  0.7835929490111873
ACC:  0.7567997198879551
Number of neighbours:  1.1 37
AUC:  0.7837439695970901
ACC:  0.7517647058823529
Number of neighbours:  1.2 40
AUC:  0.784827747406134
ACC:  0.7576050420168067
Number of neighbours:  1.3 44
AUC:  0.7794997356958996
ACC:  0.7584313725490196


For 34 neighbors the best :))
