In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, f1_score, classification_report

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
# Input files are looked up relative to the notebook's working directory.
train_path, test_path = 'Train_Attrition_Cat.csv', 'Test_pulito.csv'

# The training file is semicolon-delimited; the test file is parsed with
# pandas' default (comma) delimiter.
df = pd.read_csv(train_path, sep=";")
test_set = pd.read_csv(test_path)

In [10]:
# Print the training frame's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1013 entries, 0 to 1012
Data columns (total 25 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1013 non-null   int64 
 1   Attrition                 1013 non-null   object
 2   BusinessTravel            1013 non-null   object
 3   Department                1013 non-null   object
 4   DistanceFromHome          1013 non-null   int64 
 5   Education                 1013 non-null   int64 
 6   EducationField            1013 non-null   object
 7   EnvironmentSatisfaction   1013 non-null   int64 
 8   Gender                    1013 non-null   int64 
 9   JobInvolvement            1013 non-null   int64 
 10  JobRole                   1013 non-null   object
 11  JobSatisfaction           1013 non-null   int64 
 12  MaritalStatus             1013 non-null   object
 13  MonthlyIncome             1013 non-null   int64 
 14  NumCompaniesWorked      

In [28]:
# Label-encode the categorical columns (target included) of both frames so
# the string categories become integer codes.
label_encoders = dict()
column2encode = ['Attrition','BusinessTravel','Department','EducationField','MaritalStatus','JobRole']

for col in column2encode:
    # Learn the category -> integer mapping from the training data only.
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

    # Apply the SAME mapping to the test set. Fitting a second encoder on the
    # test data could assign different codes to the same category whenever the
    # two files do not contain exactly the same set of values, and it would
    # overwrite the train-fitted encoder stored in `label_encoders`.
    # `transform` raises ValueError on a category unseen during training,
    # which is preferable to a silent mis-encoding.
    test_set[col] = le.transform(test_set[col])

In [30]:
# Feature columns: everything except the target.
attributes = [name for name in df.columns if name != 'Attrition']

# Feature matrix as a plain NumPy array.
X_train = df.loc[:, attributes].values

# Target class.
y_train = df.loc[:, 'Attrition']

In [31]:
# Select the test features by the TRAINING frame's column order. The model is
# fed bare arrays (no column names), so deriving the list from `test_set`
# itself would silently misalign features if the two files order their columns
# differently or the test file carries an extra column. A column missing from
# the test file now fails loudly with a KeyError.
attributes = [col for col in df.columns if col != 'Attrition']
X_test = test_set[attributes].values

# Target class of the test set.
y_test = test_set['Attrition']

In [20]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

In [27]:
# Baseline: 1-nearest-neighbour classifier scored with 10-fold cross-validation
# on the training data.
# The original cell referenced `X` / `y`, which no visible cell defines (it
# only ran thanks to leftover kernel state). The recorded output is identical
# to the later run on X_train / y_train, so those are used here.
clf = KNeighborsClassifier(n_neighbors=1)

# Default scorer of the estimator (accuracy for classifiers).
scores = cross_val_score(clf, X_train, y_train, cv=10)
print('Accuracy: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

# Macro-averaged F1 weighs both classes equally.
scores = cross_val_score(clf, X_train, y_train, cv=10, scoring='f1_macro')
print('F1-score: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

Accuracy: 0.7245 (+/- 0.07)
F1-score: 0.5056 (+/- 0.07)


In [None]:
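# NOTE: error seen while 'Attrition' still held the raw strings:
#   pos_label=1 is not a valid label: array(['No', 'Yes'], dtype='<U3')
# Presumably raised by the binary 'recall' scorer, which expects the positive
# class to be encoded as 1 -- label-encode the target before tuning.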


In [32]:

# Hyper-parameter grid explored exhaustively with 5-fold cross-validation.
tuned_parameters = {
    'n_neighbors': list(range(1, 31, 3)),
    'weights': ['uniform', 'distance'],
}

# Metrics to optimise; one independent grid search is run per metric.
scores = ['recall', 'f1']

for score in scores:
    print("# Tuning hyper-parameters for ----> %s" % score)
    print()

    obj = KNeighborsClassifier()

    # 'recall' is used as the plain scorer name; every other metric is
    # switched to its macro-averaged variant.
    scoring_name = score if score == "recall" else '%s_macro' % score
    clf = GridSearchCV(obj, tuned_parameters, cv=5, scoring=scoring_name)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()

    print("Grid scores on development set:")
    print()
    cv_results = clf.cv_results_
    means = cv_results['mean_test_score']
    stds = cv_results['std_test_score']
    for idx, params in enumerate(cv_results['params']):
        # Spread is reported as two standard deviations across the folds.
        print("%0.3f (+/-%0.03f) for %r"
              % (means[idx], stds[idx] * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    # Predictions come from the search object's predict() on the test features.
    y_true = y_test
    y_pred = clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
    print()

# Tuning hyper-parameters for ----> recall

Best parameters set found on development set:

{'n_neighbors': 1, 'weights': 'uniform'}

Grid scores on development set:

0.206 (+/-0.053) for {'n_neighbors': 1, 'weights': 'uniform'}
0.206 (+/-0.053) for {'n_neighbors': 1, 'weights': 'distance'}
0.018 (+/-0.029) for {'n_neighbors': 4, 'weights': 'uniform'}
0.088 (+/-0.037) for {'n_neighbors': 4, 'weights': 'distance'}
0.018 (+/-0.029) for {'n_neighbors': 7, 'weights': 'uniform'}
0.041 (+/-0.047) for {'n_neighbors': 7, 'weights': 'distance'}
0.006 (+/-0.024) for {'n_neighbors': 10, 'weights': 'uniform'}
0.029 (+/-0.000) for {'n_neighbors': 10, 'weights': 'distance'}
0.006 (+/-0.024) for {'n_neighbors': 13, 'weights': 'uniform'}
0.012 (+/-0.029) for {'n_neighbors': 13, 'weights': 'distance'}
0.000 (+/-0.000) for {'n_neighbors': 16, 'weights': 'uniform'}
0.012 (+/-0.029) for {'n_neighbors': 16, 'weights': 'distance'}
0.000 (+/-0.000) for {'n_neighbors': 19, 'weights': 'uniform'}
0.012 (+/-0.029

In [35]:
def report(results, n_top=3):
    """Print the search candidates that hold ranks 1 through ``n_top``.

    Parameters
    ----------
    results : mapping
        Must provide the keys 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (for example the ``cv_results_`` of a
        fitted search object). 'rank_test_score' has to support element-wise
        comparison with an integer, i.e. be a NumPy array.
    n_top : int, default 3
        Highest rank to report. Every candidate sharing a rank is printed, so
        ties can yield more than ``n_top`` entries, and a rank that no
        candidate holds prints nothing.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of all candidates that were assigned this rank.
        for idx in np.flatnonzero(results['rank_test_score'] == rank):
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [36]:
# Search space for the randomized search (same grid as the exhaustive search).
param_list = {'n_neighbors': list(range(1, 31, 3)),
    'weights': ['uniform', 'distance']
}

# n_neighbors given here is only a placeholder; the search overrides it.
clf = KNeighborsClassifier(n_neighbors=1)

# Every entry of param_list is a finite list, so the number of distinct
# candidates is the product of the list lengths. Asking for more iterations
# than that makes scikit-learn emit a warning and clamp the value, so cap it
# explicitly.
n_candidates = 1
for candidate_values in param_list.values():
    n_candidates *= len(candidate_values)

# random_state pins the sampling order so the report is reproducible.
# No `scoring` is passed, so the estimator's default score is used.
random_search = RandomizedSearchCV(clf, param_distributions=param_list,
                                   n_iter=min(100, n_candidates),
                                   random_state=0)
random_search.fit(X_train, y_train)
report(random_search.cv_results_, n_top=3)



Model with rank: 1
Mean validation score: 0.832 (std: 0.000)
Parameters: {'weights': 'uniform', 'n_neighbors': 16}

Model with rank: 1
Mean validation score: 0.832 (std: 0.000)
Parameters: {'weights': 'uniform', 'n_neighbors': 19}

Model with rank: 1
Mean validation score: 0.832 (std: 0.000)
Parameters: {'weights': 'uniform', 'n_neighbors': 22}

Model with rank: 1
Mean validation score: 0.832 (std: 0.000)
Parameters: {'weights': 'uniform', 'n_neighbors': 25}

Model with rank: 1
Mean validation score: 0.832 (std: 0.000)
Parameters: {'weights': 'uniform', 'n_neighbors': 28}



In [37]:
# k-nearest-neighbours classifier with k=16 (one of the settings tied at rank 1
# in the randomized-search output), fitted on the full training set.
clf = KNeighborsClassifier(n_neighbors=16).fit(X_train, y_train)

# 10-fold cross-validated score with the estimator's default scorer.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)
print('Accuracy: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

# Same folds, macro-averaged F1.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10,
                         scoring='f1_macro')
print('F1-score: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

Accuracy: 0.8322 (+/- 0.00)
F1-score: 0.4542 (+/- 0.00)


In [38]:
# k-nearest-neighbours classifier with k=1 (the setting the recall-driven grid
# search selected), fitted on the full training set. This fitted `clf` is the
# one later used to predict on the test set.
clf = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# 10-fold cross-validated score with the estimator's default scorer.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10)
print('Accuracy: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

# Same folds, macro-averaged F1.
scores = cross_val_score(estimator=clf, X=X_train, y=y_train, cv=10,
                         scoring='f1_macro')
print('F1-score: %0.4f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

Accuracy: 0.7245 (+/- 0.07)
F1-score: 0.5056 (+/- 0.07)


In [40]:
# Predict the test-set labels with the classifier currently bound to `clf`
# (the k=1 model from the preceding cell when cells are run in order).
y_pred = clf.predict(X_test)


In [41]:
# Overall fraction of correctly classified test rows.
test_accuracy = accuracy_score(y_test, y_pred)
print('Accuracy %s' % test_accuracy)

# average=None returns one F1 value per class instead of a single aggregate.
per_class_f1 = f1_score(y_test, y_pred, average=None)
print('F1-score %s' % per_class_f1)

Accuracy 0.7585034013605442
F1-score [0.85940594 0.14457831]


In [42]:
# Per-class precision / recall / F1 and support on the test set.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       249
           1       0.16      0.13      0.14        45

    accuracy                           0.76       294
   macro avg       0.50      0.50      0.50       294
weighted avg       0.74      0.76      0.75       294

