In [13]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

from sklearn import tree
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score

## training set e test set

In [2]:
df = pd.read_csv('Train_Attrition_Cat.csv', sep=";")

test_set = pd.read_csv('Test_pulito.csv')

In [3]:
#portiamo i valori categorici a valori numerici 
label_encoders = dict()
column2encode = ['BusinessTravel','Department','EducationField','MaritalStatus','JobRole']

for col in column2encode:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le
    

### prima usiamo il validation test

In [4]:
attributes = [col for col in df.columns if col != 'Attrition']
X = df[attributes].values
y = df['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=100, 
                                                    stratify=y)

### best attributes: 
per recall = {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}

per F1 = {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 5}

In [30]:
#per recall = {'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}

# we build a DT
clf = DecisionTreeClassifier(criterion='gini', max_depth=None, 
                             min_samples_split=5, min_samples_leaf=2)
# we fit out model by giving the TRAINING SET + Y LABELS
clf.fit(X_train, y_train)

DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5)

### performance evaluation tramite VALIDATION SET

In [8]:
y_pred = clf.predict(X_test)

In [9]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))

Accuracy 0.805921052631579
F1-score [0.88499025 0.37894737]


In [10]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.87      0.90      0.88       253
         Yes       0.41      0.35      0.38        51

    accuracy                           0.81       304
   macro avg       0.64      0.63      0.63       304
weighted avg       0.80      0.81      0.80       304



In [11]:
confusion_matrix(y_test, y_pred)

array([[227,  26],
       [ 33,  18]])

### Cross validation

In [22]:
#con valori
#{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}

scores = cross_val_score(clf, X_train, y_train, cv=10)
print('Accuracy: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_macro')
print('F1-score: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

Accuracy: 0.7728 (+/- 0.09)
F1-score: 0.5909 (+/- 0.14)


In [None]:
#qui usiamo per F1 = {'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 5}

In [35]:
# we build a DT
clf = DecisionTreeClassifier(criterion='gini', max_depth=15, 
                             min_samples_split=5, min_samples_leaf=5)
# we fit out model by giving the TRAINING SET + Y LABELS
clf.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=15, min_samples_leaf=5, min_samples_split=5)

In [24]:
y_pred = clf.predict(X_test)

In [25]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))

Accuracy 0.7763157894736842
F1-score [0.864      0.37037037]


In [26]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.87      0.85      0.86       253
         Yes       0.35      0.39      0.37        51

    accuracy                           0.78       304
   macro avg       0.61      0.62      0.62       304
weighted avg       0.79      0.78      0.78       304



In [27]:

scores = cross_val_score(clf, X_train, y_train, cv=10)
print('Accuracy: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_macro')
print('F1-score: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

Accuracy: 0.8095 (+/- 0.08)
F1-score: 0.6337 (+/- 0.18)


### performance evaluation su TEST SET

In [28]:
for col in column2encode:
    le = LabelEncoder()
    test_set[col] = le.fit_transform(test_set[col])
    label_encoders[col] = le

In [29]:
attributes = [col for col in test_set.columns if col != 'Attrition']
x_test = test_set[attributes].values
y_test = test_set['Attrition']

In [31]:
clf

DecisionTreeClassifier(min_samples_leaf=2, min_samples_split=5)

In [32]:
y_pred = clf.predict(x_test)

In [33]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))

Accuracy 0.7687074829931972
F1-score [0.86454183 0.20930233]


In [34]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.86      0.87      0.86       249
         Yes       0.22      0.20      0.21        45

    accuracy                           0.77       294
   macro avg       0.54      0.54      0.54       294
weighted avg       0.76      0.77      0.76       294



In [36]:
clf

DecisionTreeClassifier(max_depth=15, min_samples_leaf=5, min_samples_split=5)

In [37]:
y_pred = clf.predict(x_test)

In [38]:
print('Accuracy %s' % accuracy_score(y_test, y_pred))
print('F1-score %s' % f1_score(y_test, y_pred, average=None))

Accuracy 0.8095238095238095
F1-score [0.88888889 0.33333333]


In [39]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          No       0.88      0.90      0.89       249
         Yes       0.36      0.31      0.33        45

    accuracy                           0.81       294
   macro avg       0.62      0.61      0.61       294
weighted avg       0.80      0.81      0.80       294



In [None]:
from sklearn.preprocessing import LabelBinarizer

In [None]:
lb = LabelBinarizer()
lb.fit(y_test)
lb.classes_.tolist()

In [None]:
fpr = dict()
tpr = dict()
roc_auc = dict()
by_test = lb.transform(y_test)
by_pred = lb.transform(y_pred)
for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(by_test[:, i], by_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
    
roc_auc = roc_auc_score(by_test, by_pred, average=None)
roc_auc