### Data Load and Exploration

In [1]:
import pandas as pd

In [2]:
train_df = pd.read_csv('data/train.csv')

In [3]:
train_df.shape

(59598, 24)

In [4]:
train_df.columns

Index(['Employee ID', 'Age', 'Gender', 'Years at Company', 'Job Role',
       'Monthly Income', 'Work-Life Balance', 'Job Satisfaction',
       'Performance Rating', 'Number of Promotions', 'Overtime',
       'Distance from Home', 'Education Level', 'Marital Status',
       'Number of Dependents', 'Job Level', 'Company Size', 'Company Tenure',
       'Remote Work', 'Leadership Opportunities', 'Innovation Opportunities',
       'Company Reputation', 'Employee Recognition', 'Attrition'],
      dtype='object')

In [5]:
# Check for null values
train_df.isna().sum()

Employee ID                 0
Age                         0
Gender                      0
Years at Company            0
Job Role                    0
Monthly Income              0
Work-Life Balance           0
Job Satisfaction            0
Performance Rating          0
Number of Promotions        0
Overtime                    0
Distance from Home          0
Education Level             0
Marital Status              0
Number of Dependents        0
Job Level                   0
Company Size                0
Company Tenure              0
Remote Work                 0
Leadership Opportunities    0
Innovation Opportunities    0
Company Reputation          0
Employee Recognition        0
Attrition                   0
dtype: int64

In [6]:
train_df.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition,Attrition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,0,Mid,Medium,89,No,No,No,Excellent,Medium,Stayed
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,3,Mid,Medium,21,No,No,No,Fair,Low,Stayed
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,3,Mid,Medium,74,No,No,No,Poor,Low,Stayed
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,2,Mid,Small,50,Yes,No,No,Good,Medium,Stayed
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,0,Senior,Medium,68,No,No,No,Fair,Medium,Stayed


### Exploratory code OneHotEncoder

In [7]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoder transforming the "Gender" column to numeric code 
ohe = OneHotEncoder(sparse_output=False)
ohe.fit_transform(train_df[["Gender"]])

array([[0., 1.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [0., 1.],
       [0., 1.]])

In [8]:
ohe.categories_

[array(['Female', 'Male'], dtype=object)]

### Data Processing Pipeline

Defining a Pipeline that separates how to process different data features (numerical and categorical).

In [9]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


numeric_features = ["Employee ID","Age", "Years at Company", "Monthly Income", "Number of Promotions", "Distance from Home", "Number of Dependents", "Company Tenure"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["Gender", "Job Role", "Work-Life Balance", "Job Satisfaction", "Performance Rating", "Overtime", "Education Level", "Marital Status", "Job Level", 
                        "Company Size", "Remote Work", "Leadership Opportunities", "Innovation Opportunities", "Company Reputation", "Employee Recognition"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore"))
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

### Model Building and Training

In [10]:
X_train = train_df.drop("Attrition", axis="columns")
X_train.head()

Unnamed: 0,Employee ID,Age,Gender,Years at Company,Job Role,Monthly Income,Work-Life Balance,Job Satisfaction,Performance Rating,Number of Promotions,...,Marital Status,Number of Dependents,Job Level,Company Size,Company Tenure,Remote Work,Leadership Opportunities,Innovation Opportunities,Company Reputation,Employee Recognition
0,8410,31,Male,19,Education,5390,Excellent,Medium,Average,2,...,Married,0,Mid,Medium,89,No,No,No,Excellent,Medium
1,64756,59,Female,4,Media,5534,Poor,High,Low,3,...,Divorced,3,Mid,Medium,21,No,No,No,Fair,Low
2,30257,24,Female,10,Healthcare,8159,Good,High,Low,0,...,Married,3,Mid,Medium,74,No,No,No,Poor,Low
3,65791,36,Female,7,Education,3989,Good,High,High,1,...,Single,2,Mid,Small,50,Yes,No,No,Good,Medium
4,65026,56,Male,41,Education,4821,Fair,Very High,Average,0,...,Divorced,0,Senior,Medium,68,No,No,No,Fair,Medium


In [11]:
y_train = train_df.Attrition
y_train.head()

0    Stayed
1    Stayed
2    Stayed
3    Stayed
4    Stayed
Name: Attrition, dtype: object

In [12]:
test_df = pd.read_csv('data/test.csv')
X_test = test_df.drop("Attrition", axis="columns")
y_test = test_df.Attrition

### K-Nearest Neighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KNeighborsClassifier(n_neighbors=30))]
)

knn.fit(X_train, y_train)

In [14]:
print("KNN score: %.3f" % knn.score(X_test, y_test))

KNN score: 0.718


### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

logreg.fit(X_train, y_train)

In [16]:
print("LogRegression score: %.3f" % logreg.score(X_test, y_test))

LogRegression score: 0.755


### Decision Trees

In [17]:
from sklearn.tree import DecisionTreeClassifier

dtree = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())]
)

dtree.fit(X_train, y_train)

In [18]:
print("Decision Trees score: %.3f" % dtree.score(X_test, y_test))

Decision Trees score: 0.668


### Introducing Feature Selection (Chi-sqrd test)

In [None]:
from sklearn.feature_selection import SelectPercentile, chi2

numeric_features = ["Employee ID","Age", "Years at Company", "Monthly Income", "Number of Promotions", "Distance from Home", "Number of Dependents", "Company Tenure"]
numeric_transformer = Pipeline(
    steps=[("imputer", SimpleImputer(strategy="median")), ("scaler", StandardScaler())]
)

categorical_features = ["Gender", "Job Role", "Work-Life Balance", "Job Satisfaction", "Performance Rating", "Overtime", "Education Level", "Marital Status", "Job Level", 
                        "Company Size", "Remote Work", "Leadership Opportunities", "Innovation Opportunities", "Company Reputation", "Employee Recognition"]
categorical_transformer = Pipeline(
    steps=[
        ("encoder", OneHotEncoder(handle_unknown="ignore")), # OneHotEncoder previously tested
        ("selector", SelectPercentile(chi2, percentile=50)), # Removes all but a user-specified highest scoring percentage of categorical features
    ]
)

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

### K-Nearest Neighbors

In [20]:
knn = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", KNeighborsClassifier(n_neighbors=30))]
)

knn.fit(X_train, y_train)

In [21]:
print("KNN score: %.3f" % knn.score(X_test, y_test))

KNN score: 0.730


### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression

logreg = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", LogisticRegression())]
)

logreg.fit(X_train, y_train)

In [23]:
print("LogRegression score: %.3f" % logreg.score(X_test, y_test))

LogRegression score: 0.754


### Decision Trees

In [24]:
from sklearn.tree import DecisionTreeClassifier

dtree = Pipeline(
    steps=[("preprocessor", preprocessor), ("classifier", DecisionTreeClassifier())]
)

dtree.fit(X_train, y_train)

In [25]:
print("Decision Trees score: %.3f" % dtree.score(X_test, y_test))

Decision Trees score: 0.670
