In [174]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle 
from sklearn import tree
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Importing the data from the csv file into panda dataframe

In [175]:
data=pd.read_csv("/Users/oshanoshu/firstMachineLearning/cleaned.csv")
data

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,work_accident,left,promotion_last_5years,department,salary
0,0.38,0.53,2,157.000000,3.000000,0,1,0,sales,low
1,0.80,0.86,5,262.000000,6.000000,0,1,0,sales,medium
2,0.11,0.88,7,272.000000,4.000000,0,1,0,sales,medium
3,0.72,0.87,5,223.000000,5.000000,0,1,0,sales,low
4,0.37,0.52,2,200.511732,3.380048,0,1,0,sales,low
5,0.41,0.50,2,200.511732,3.380048,0,1,0,sales,low
6,0.10,0.77,6,247.000000,4.000000,0,1,0,sales,low
7,0.92,0.85,5,259.000000,5.000000,0,1,0,sales,low
8,0.89,1.00,5,224.000000,5.000000,0,1,0,sales,low
9,0.42,0.53,2,142.000000,3.000000,0,1,0,sales,low


Converting all the salary values to numeric

In [176]:
def changeToInt(x):
    if x=='low':
        return 1
    elif x=='medium':
        return 2
    else:
        return 3
data.salary=data['salary'].apply(changeToInt)

Converting all the department values to numeric value

In [177]:
data.department.unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [187]:
data.department.replace(('sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'),(1,2,3,4,5,6,7,8,9,10),inplace=True)

Name of columns

In [188]:
data.columns

Index(['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident', 'left',
       'promotion_last_5years', 'department', 'salary'],
      dtype='object')

Separating features and target from the data

In [189]:
x_data=data[['satisfaction_level', 'last_evaluation', 'number_project',
       'average_montly_hours', 'time_spend_company', 'work_accident',
       'promotion_last_5years', 'department', 'salary']]

In [190]:
y_data=data['left']

Function on k-Fold Validation and Classifier

In [191]:
#Passing the features, the target, and test set percentage to the function
def k_Fold_Validation_Classifier(X_data,Y_data,test_percentage):
    #Dividing the data into test set and training set 
    X_train, X_test, y_train, y_test = train_test_split(X_data, Y_data, test_size=test_percentage,random_state=1)
    
    #Shuffling the data to have uniform mix
    X_train,y_train=shuffle(X_train,y_train)
    
    #Initializing a Decision Tree Classifier
    classify=tree.DecisionTreeClassifier()
    
     #Evaluating the Decision Tree Classifier using cross_val_score() function 
    cvs=cross_val_score(classify,X_train,y_train,cv=5)
    
    print("The accuracy on each validation fold of the classifier is: ")
    for scores in cvs:
        print (scores)
    
    #Predicting the value using cross fold validation and finally fitting the classifier
    predicted_value=cross_val_predict(classify,X_test,y_test,cv=5)
    
    #Fitting the classifier using training set
    classify.fit(X_train,y_train)
    
    #Predicting the value using test set in the classifier
    y_pred=classify.predict(X_test)
    
    #The Confusin matrix
    cm=confusion_matrix(y_test,y_pred)
    print("Confusion Matrix:")
    print(cm)
    
    #The Precision Score
    
    print("Precision score %.2f"%precision_score(y_test,y_pred))
    print("Recall score %.2f"%recall_score(y_test,y_pred))
    print("F1 score %.2f"%f1_score(y_test,y_pred))

Training Set: 85% Test Set: 15%

In [192]:
k_Fold_Validation_Classifier(x_data,y_data,0.15)

The accuracy on each validation fold of the classifier is: 
0.9664268585131894
0.9702495201535508
0.9683301343570058
0.9692898272552783
0.9673547767642823
Confusion Matrix:
[[1466   42]
 [  36  295]]
Precision score 0.88
Recall score 0.89
F1 score 0.88


Training Set: 75% Test Set: 25%

In [193]:
k_Fold_Validation_Classifier(x_data,y_data,0.25)

The accuracy on each validation fold of the classifier is: 
0.9717391304347827
0.9717237629146275
0.9679173463839043
0.9651795429815017
0.9662676822633297
Confusion Matrix:
[[2449   62]
 [  50  504]]
Precision score 0.89
Recall score 0.91
F1 score 0.90


Training Set: 65% Test Set: 35%

In [194]:
k_Fold_Validation_Classifier(x_data,y_data,0.35)

The accuracy on each validation fold of the classifier is: 
0.9648902821316614
0.9636135508155583
0.9617074701820465
0.9686126804770873
0.967984934086629
Confusion Matrix:
[[3454   69]
 [  71  697]]
Precision score 0.91
Recall score 0.91
F1 score 0.91
