# 4. read the processed data

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("processed-data.csv")

In [3]:
df.head()

Unnamed: 0,pull_out,satisfaction_level,last_evaluation,project_count,average_monthly_hours,years_at_company,work_accident,promotion,department,salary
0,1,0.38,0.53,2,157,3,0,0,0,0
1,1,0.8,0.86,5,262,6,0,0,0,1
2,1,0.11,0.88,7,272,4,0,0,0,1
3,1,0.72,0.87,5,223,5,0,0,0,0
4,1,0.37,0.52,2,159,3,0,0,0,0


# 5. Modelling

Our problem is classification problem so we're going to use classification algorithms. 
More importantly, it is binary classification (target feature is binary), so we will use following algorithms for modelling:
1. Logistic Regression
2. Support Vector Machines

In [4]:
df.head()

Unnamed: 0,pull_out,satisfaction_level,last_evaluation,project_count,average_monthly_hours,years_at_company,work_accident,promotion,department,salary
0,1,0.38,0.53,2,157,3,0,0,0,0
1,1,0.8,0.86,5,262,6,0,0,0,1
2,1,0.11,0.88,7,272,4,0,0,0,1
3,1,0.72,0.87,5,223,5,0,0,0,0
4,1,0.37,0.52,2,159,3,0,0,0,0


In [5]:
# for training the data, we are currently taking all the columns except the target feature pull out.
# we will drop target feature from main df and seperate it
target_feature = df.pop('pull_out')

In [6]:
target_feature

0        1
1        1
2        1
3        1
4        1
        ..
14994    1
14995    1
14996    1
14997    1
14998    1
Name: pull_out, Length: 14999, dtype: int64

In [7]:
target_feature.unique()

array([1, 0])

In [8]:
df.head()

Unnamed: 0,satisfaction_level,last_evaluation,project_count,average_monthly_hours,years_at_company,work_accident,promotion,department,salary
0,0.38,0.53,2,157,3,0,0,0,0
1,0.8,0.86,5,262,6,0,0,0,1
2,0.11,0.88,7,272,4,0,0,0,1
3,0.72,0.87,5,223,5,0,0,0,0
4,0.37,0.52,2,159,3,0,0,0,0


In [9]:
# split data into training and testing datasets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df, target_feature, test_size=0.2, random_state=16)
# X_train and y_train are used for training the model.
# X_test and y_test will be used for model evaluation and accuracy calculation.

## 5.1 Logistic Regression

In [45]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

In [37]:
lg = LogisticRegression(max_iter=400)
lg.fit(X_train, y_train)
lg_train_score = lg.score(X_train, y_train)
lg_test_score = lg.score(X_test, y_test)

In [38]:
print("training score for logistic regression = ", lg_train_score)
print("test score for logistic regression = ", lg_test_score)

training score for logistic regression =  0.7873156096341362
test score for logistic regression =  0.7863333333333333


In [52]:
# confusion matrix
y_pred = lg.fit(X_train, y_train).predict(X_test)

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm)
print(cm_df)

      0    1
0  2112  158
1   483  247


## 5.2 Support Vector Machines 

In [41]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
svm_train_score = svm.score(X_train, y_train)
svm_test_score = svm.score(X_test, y_test)

In [42]:
print("training score for SVM = ", svm_train_score)
print("test score for SVM = ", svm_test_score)

training score for SVM =  0.7852321026752229
test score for SVM =  0.779


In [50]:
# confusion matrix
y_pred = svm.fit(X_train, y_train).predict(X_test)

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm)
print(cm_df)

      0   1
0  2269   1
1   662  68


## 5.3 K-Nearest Neighbours

In [43]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
knn_train_score = knn.score(X_train, y_train)
knn_test_score = knn.score(X_test, y_test)

In [44]:
print("training score for knn = ", knn_train_score)
print("test score for knn = ", knn_test_score)

training score for knn =  0.9526627218934911
test score for knn =  0.929


In [51]:
# confusion matrix
y_pred = knn.fit(X_train, y_train).predict(X_test)

cm = confusion_matrix(y_test, y_pred)
cm_df = pd.DataFrame(cm)
print(cm_df)

      0    1
0  2126  144
1    69  661
