In [3]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [4]:
# Load the training CSV into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('train (1).csv')

In [5]:
# Display the loaded frame; pandas truncates the rendering to the first and
# last rows/columns.
data

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,1,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,0,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,1,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,0,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,0,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1053,57,0,Travel_Rarely,405,Research & Development,1,2,Life Sciences,1,1483,...,1,80,1,13,2,2,12,9,2,8
1054,49,0,Travel_Rarely,1490,Research & Development,7,4,Life Sciences,1,1484,...,2,80,2,29,3,3,8,7,0,7
1055,34,0,Travel_Frequently,829,Research & Development,15,3,Medical,1,1485,...,4,80,2,16,3,2,14,8,6,9
1056,28,1,Travel_Frequently,1496,Sales,1,3,Technical Degree,1,1486,...,4,80,1,5,3,4,3,2,1,2


In [6]:
# List every column with its non-null count and dtype, to spot missing values
# and the object (string) columns that need encoding before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1058 entries, 0 to 1057
Data columns (total 35 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Age                       1058 non-null   int64 
 1   Attrition                 1058 non-null   int64 
 2   BusinessTravel            1058 non-null   object
 3   DailyRate                 1058 non-null   int64 
 4   Department                1058 non-null   object
 5   DistanceFromHome          1058 non-null   int64 
 6   Education                 1058 non-null   int64 
 7   EducationField            1058 non-null   object
 8   EmployeeCount             1058 non-null   int64 
 9   EmployeeNumber            1058 non-null   int64 
 10  EnvironmentSatisfaction   1058 non-null   int64 
 11  Gender                    1058 non-null   object
 12  HourlyRate                1058 non-null   int64 
 13  JobInvolvement            1058 non-null   int64 
 14  JobLevel                

### 1. Preprocessing

In [26]:
def onehot_encode(df, column):
    """Return a copy of ``df`` with ``column`` expanded into indicator columns.

    The indicator columns are named ``<column>_<value>`` and are appended
    after the remaining original columns; ``column`` itself is removed.
    The frame passed in is left untouched.
    """
    indicator_columns = pd.get_dummies(df[column], prefix=column)
    remaining_columns = df.drop(column, axis=1)
    return pd.concat([remaining_columns, indicator_columns], axis=1)

In [27]:
def preprocess_input(df, train_size=0.75, random_state=1):
    """Encode the raw frame, split it into train/test sets and standardise X.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing the ``Attrition`` target and the columns that are
        dropped or encoded below. The caller's frame is not modified.
    train_size : float, default 0.75
        Forwarded to ``train_test_split``.
    random_state : int, default 1
        Seed for the shuffled split, forwarded to ``train_test_split``.

    Returns
    -------
    X_train, X_test : pandas.DataFrame
        Standardised features, keeping the original row index and column
        names. The scaler is fitted on the training rows only.
    y_train, y_test : pandas.Series
        The matching ``Attrition`` values.
    """
    df = df.copy()

    # 1. Drop single value and ID columns
    df = df.drop(['EmployeeCount', 'EmployeeNumber', 'Over18', 'StandardHours'], axis=1)

    # 2. Binary columns and 3. the ordinal BusinessTravel column -> integer codes.
    # An explicit per-value map is used instead of Series.replace: replacing
    # strings with ints in an object column relies on silent downcasting,
    # which pandas has deprecated (FutureWarning on recent versions).
    # Values missing from a mapping are passed through unchanged, exactly as
    # replace() did.
    label_codes = {
        'Gender': {'Female': 0, 'Male': 1},
        'OverTime': {'No': 0, 'Yes': 1},
        'BusinessTravel': {'Non-Travel': 0, 'Travel_Rarely': 1, 'Travel_Frequently': 2},
    }
    for column, codes in label_codes.items():
        df[column] = df[column].map(lambda value: codes.get(value, value))

    # 4. One Hot Encoding to the remaining object type columns
    for column in ['Department', 'EducationField', 'JobRole', 'MaritalStatus']:
        df = onehot_encode(df, column=column)

    # Split data into X and y - Attrition = Target
    y = df['Attrition']
    X = df.drop('Attrition', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, train_size=train_size, shuffle=True, random_state=random_state
    )

    # Scale X: fit on the training rows only so no test-set statistics leak
    # into the transformation, then apply the same scaling to both parts.
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [30]:
# Encode, split and scale the raw frame; returns the train/test features and
# the matching Attrition targets.
X_train, X_test, y_train, y_test = preprocess_input(data)

In [31]:
# Inspect the standardised training features, including the one-hot columns.
X_train

Unnamed: 0,Age,BusinessTravel,DailyRate,DistanceFromHome,Education,EnvironmentSatisfaction,Gender,HourlyRate,JobInvolvement,JobLevel,...,JobRole_Laboratory Technician,JobRole_Manager,JobRole_Manufacturing Director,JobRole_Research Director,JobRole_Research Scientist,JobRole_Sales Executive,JobRole_Sales Representative,MaritalStatus_Divorced,MaritalStatus_Married,MaritalStatus_Single
553,0.285824,-0.186605,-0.039058,-0.861349,-1.864619,1.175703,-1.160239,1.016023,-1.040808,-0.978181,...,-0.465035,-0.286104,-0.334966,-0.248152,2.028825,-0.534089,-0.245276,-0.541884,-0.904880,1.448329
676,-0.243257,-0.186605,0.775705,1.467369,-1.864619,1.175703,-1.160239,-0.727822,0.384686,-0.090655,...,-0.465035,-0.286104,-0.334966,-0.248152,-0.492896,-0.534089,-0.245276,-0.541884,1.105119,-0.690451
403,0.391640,-0.186605,-0.428089,-0.983913,0.089965,-0.648822,0.861892,-0.827470,1.810179,0.796870,...,-0.465035,-0.286104,-0.334966,-0.248152,-0.492896,1.872347,-0.245276,-0.541884,1.105119,-0.690451
766,1.343985,-0.186605,1.575787,-0.861349,1.067257,-0.648822,0.861892,-0.179756,0.384686,2.571922,...,-0.465035,-0.286104,-0.334966,4.029780,-0.492896,-0.534089,-0.245276,-0.541884,1.105119,-0.690451
567,-0.349073,-0.186605,-1.262426,-0.861349,0.089965,1.175703,0.861892,-0.279405,0.384686,-0.090655,...,-0.465035,-0.286104,-0.334966,-0.248152,-0.492896,1.872347,-0.245276,-0.541884,-0.904880,1.448329
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
960,0.391640,1.710546,0.484543,-0.983913,0.089965,0.263440,-1.160239,0.019540,0.384686,-0.090655,...,-0.465035,-0.286104,-0.334966,-0.248152,-0.492896,1.872347,-0.245276,1.845415,-0.904880,-0.690451
847,-0.349073,1.710546,-0.369368,-0.983913,0.089965,1.175703,0.861892,1.564089,-1.040808,-0.090655,...,-0.465035,-0.286104,-0.334966,-0.248152,-0.492896,-0.534089,-0.245276,-0.541884,-0.904880,1.448329
715,-0.454889,1.710546,0.017217,-0.983913,1.067257,0.263440,-1.160239,0.916375,1.810179,-0.090655,...,-0.465035,-0.286104,-0.334966,-0.248152,-0.492896,-0.534089,-0.245276,-0.541884,1.105119,-0.690451
905,-0.878154,-0.186605,-0.308199,-0.983913,0.089965,1.175703,-1.160239,1.065847,-1.040808,1.684396,...,-0.465035,-0.286104,-0.334966,4.029780,-0.492896,-0.534089,-0.245276,1.845415,-0.904880,-0.690451


### 2. Training

In [32]:
# One seed for every estimator that draws random numbers, so that a fresh
# "Restart & Run All" reproduces the same fitted models and scores.
# The value matches the seed used for the train/test split.
RANDOM_STATE = 1

# Keys are left-padded to a common width so the printed lines align; the
# results cell prints these same keys, so they must stay unchanged.
# KNN, the RBF SVC and the default-solver logistic regression are left
# unseeded because they do not use randomness in this configuration.
# NOTE(review): the blank gaps in the recorded output after the LinearSVC and
# MLP fits look like stripped warnings (possibly convergence) - confirm and
# raise max_iter if so.
models = {
    "                   Logistic Regression": LogisticRegression(),
    "                   K-Nearest Neighbors": KNeighborsClassifier(),
    "                         Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Support Vector Machine (Linear Kernel)": LinearSVC(random_state=RANDOM_STATE),
    "   Support Vector Machine (RBF Kernel)": SVC(),
    "                        Neural Network": MLPClassifier(random_state=RANDOM_STATE),
    "                         Random Forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "                     Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE)
}

# Fit every candidate on the same scaled training split.
for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                   Logistic Regression trained.
                   K-Nearest Neighbors trained.
                         Decision Tree trained.




Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.




                        Neural Network trained.
                         Random Forest trained.
                     Gradient Boosting trained.


### 3. Results

In [33]:
# Report each fitted model's mean accuracy on the held-out test split,
# as a percentage with two decimals.
for label, estimator in models.items():
    accuracy_pct = estimator.score(X_test, y_test) * 100
    print(f"{label}: {accuracy_pct:.2f}%")

                   Logistic Regression: 87.55%
                   K-Nearest Neighbors: 80.38%
                         Decision Tree: 73.96%
Support Vector Machine (Linear Kernel): 87.17%
   Support Vector Machine (RBF Kernel): 83.40%
                        Neural Network: 87.92%
                         Random Forest: 82.64%
                     Gradient Boosting: 82.64%
