In [17]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.model_selection import GridSearchCV

In [18]:
df = pd.read_csv('Train_Attrition_Cat.csv',sep=';')
#train = df.drop(columns=['BusinessTravel','EducationField', 'MaritalStatus','JobRole'])

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
label_encoders = dict()
column2encode = ['BusinessTravel', 'Department', 'EducationField', 'MaritalStatus','JobRole']

for col in column2encode:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [21]:
attributes = [col for col in df.columns if col != 'Attrition']
X_train = df[attributes].values
y_train = df['Attrition']

In [22]:
test_set = pd.read_csv('Test_pulito.csv')

In [23]:
Label_encoders = dict()
column2encode = ['BusinessTravel', 'Department', 'EducationField', 'MaritalStatus','JobRole']

for col in column2encode:
    le = LabelEncoder()
    test_set[col] = le.fit_transform(test_set[col])
    label_encoders[col] = le

In [24]:
attributes = [col for col in test_set.columns if col != 'Attrition']
X_test = test_set[attributes].values
y_test = test_set['Attrition']

In [30]:
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, 
                             #il minimo splitting di almeno due esempi, min_leaf = min numb di esempi che si richiede iin una foglia
                             min_samples_split=2, min_samples_leaf=1)
#passiamo il training e le labels, citraro dice meglio usare la gini, focalizzaraci su max e min split
clf.fit(X_train, y_train)

DecisionTreeClassifier()

In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [28]:
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")

In [34]:
param_list = {'min_samples_split': [2, 5, 10, 20],
              'min_samples_leaf': [1, 5, 10, 20],
             }

grid_search = GridSearchCV(clf, param_grid=param_list)
grid_search.fit(X_train, y_train)
clf = grid_search.best_estimator_

In [35]:
param_list = {'max_depth': [None] + list(np.arange(2, 20)),
              'min_samples_split': [2, 5, 10, 20, 30, 50, 100],
              'min_samples_leaf': [1, 5, 10, 20, 30, 50, 100],
             }

random_search = RandomizedSearchCV(clf, param_distributions=param_list, 
                                   n_iter=100)
random_search.fit(X_train, y_train)
clf = random_search.best_estimator_

In [36]:
report(grid_search.cv_results_, n_top=3)

Model with rank: 1
Mean validation score: 0.809 (std: 0.020)
Parameters: {'min_samples_leaf': 10, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.806 (std: 0.036)
Parameters: {'min_samples_leaf': 20, 'min_samples_split': 2}

Model with rank: 2
Mean validation score: 0.806 (std: 0.036)
Parameters: {'min_samples_leaf': 20, 'min_samples_split': 5}

Model with rank: 2
Mean validation score: 0.806 (std: 0.036)
Parameters: {'min_samples_leaf': 20, 'min_samples_split': 10}

Model with rank: 2
Mean validation score: 0.806 (std: 0.036)
Parameters: {'min_samples_leaf': 20, 'min_samples_split': 20}



In [37]:
report(random_search.cv_results_, n_top=3)

Model with rank: 1
Mean validation score: 0.832 (std: 0.013)
Parameters: {'min_samples_split': 100, 'min_samples_leaf': 1, 'max_depth': None}

Model with rank: 2
Mean validation score: 0.832 (std: 0.012)
Parameters: {'min_samples_split': 100, 'min_samples_leaf': 20, 'max_depth': 7}

Model with rank: 2
Mean validation score: 0.832 (std: 0.012)
Parameters: {'min_samples_split': 100, 'min_samples_leaf': 5, 'max_depth': 11}

Model with rank: 2
Mean validation score: 0.832 (std: 0.012)
Parameters: {'min_samples_split': 100, 'min_samples_leaf': 5, 'max_depth': 6}

Model with rank: 2
Mean validation score: 0.832 (std: 0.012)
Parameters: {'min_samples_split': 100, 'min_samples_leaf': 20, 'max_depth': 13}

Model with rank: 2
Mean validation score: 0.832 (std: 0.012)
Parameters: {'min_samples_split': 100, 'min_samples_leaf': 5, 'max_depth': 15}

Model with rank: 2
Mean validation score: 0.832 (std: 0.012)
Parameters: {'min_samples_split': 100, 'min_samples_leaf': 10, 'max_depth': None}

