In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas.plotting import scatter_matrix

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV


In [3]:
df = pd.read_csv('Train_newColumns.csv')
#dfnew = df.drop(columns=['BusinessTravel', 'Department', 'EducationField', 'MaritalStatus','JobRole'])

In [4]:
label_encoders = dict()
column2encode = ['BusinessTravel', 'Department', 'EducationField', 'MaritalStatus','JobRole']

for col in column2encode:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [5]:
attributes = [col for col in df.columns if col != 'Attrition']
X = df[attributes].values
y = df['Attrition']

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=100, 
                                                    stratify=y)

In [15]:
X_train.shape, X_test.shape

((709, 24), (304, 24))

In [6]:
# Set the parameters by cross-validation
tuned_parameters = {
    'criterion' : ['gini', 'entropy'],
    'max_depth': [2, 5, 10, 15, None],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 5, 10, 20],
}

#scores = ['precision', 'recall', 'f1']
scores = ['recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for ----> %s" % score)
    print()
    
    obj = tree.DecisionTreeClassifier()
    
    if (score == "precision" or "recall"):
      clf = GridSearchCV(obj, tuned_parameters, cv=10, scoring=score)
    else:
      clf = GridSearchCV(obj, tuned_parameters, cv=10, scoring='%s_macro' % score)
                         
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()

# Tuning hyper-parameters for ----> recall

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': None, 'min_samples_leaf': 5, 'min_samples_split': 2}

Grid scores on development set:

0.128 (+/-0.291) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.128 (+/-0.291) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.128 (+/-0.291) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.128 (+/-0.291) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.128 (+/-0.291) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.128 (+/-0.291) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.128 (+/-0.291) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.128 (+/-0.291) for {'criterion': 'gini',

Best parameters set found on development set:

{'criterion': 'gini', 'max_depth': 15, 'min_samples_leaf': 5, 'min_samples_split': 5}

Grid scores on development set:

0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 2}
0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 5}
0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 10}
0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 1, 'min_samples_split': 20}
0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 2}
0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 5}
0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_samples_split': 10}
0.165 (+/-0.358) for {'criterion': 'gini', 'max_depth': 2, 'min_samples_leaf': 5, 'min_s

In [7]:
#capire l'importanza di una features
for col, imp in zip(attributes, clf.feature_importances_):
    print(col, imp)

AttributeError: 'GridSearchCV' object has no attribute 'feature_importances_'