In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path
from collections import Counter
import psycopg2

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced

In [4]:
import db_password from config

In [5]:
url = f"postgres://postgres:{db_password}@127.0.0.1:5433/Attrition"

In [6]:
def connect(url):
    # Connect to the server
    conn = None
    try:
        print('Connecting...')
        conn = psycopg2.connect(url)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error) 
    print("Connected")
    return conn

In [7]:
#A function to conect a db and bring in Data into a panda df 
def sql_to_df(conn, select_query, column_names):
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error")
        cursor.close()
        return 0
    
    # Turn tupples into a df
    tupples = cursor.fetchall()
    cursor.close()
    
    df = pd.DataFrame(tupples, columns=column_names)
    return df

In [8]:
# Connect to the database
conn = connect(url)
column_names = ["Attrition", "Age", "Department", "EmployeeNumber", "Gender", "HourlyRate","JobLevel", "MaritalStatus", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion" ]
# SELECT * query
attrition_df = sql_to_df(conn, "select * FROM joint_table", column_names)
attrition_df.head()

Connecting...
Connected


Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,Yes,41,Sales,1,Female,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,No,49,Research & Development,2,Male,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,Yes,37,Research & Development,4,Male,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,No,33,Research & Development,5,Female,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,No,27,Research & Development,7,Male,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [9]:
# Display datatypes
attrition_df.dtypes

Attrition                  object
Age                         int64
Department                 object
EmployeeNumber              int64
Gender                     object
HourlyRate                  int64
JobLevel                    int64
MaritalStatus              object
NumCompaniesWorked          int64
PercentSalaryHike           int64
PerformanceRating           int64
StockOptionLevel            int64
TotalWorkingYears           int64
TrainingTimesLastYear       int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsInCurrentRole          int64
YearsSinceLastPromotion     int64
dtype: object

# Data Cleaning

In [10]:
# Transform String column for Gender

def change_string(gender):
    if gender == "Female":
        return 1
    else: return 0
    
attrition_df["Gender"] = attrition_df["Gender"].apply(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,Yes,41,Sales,1,1,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,No,49,Research & Development,2,0,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,Yes,37,Research & Development,4,0,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,No,33,Research & Development,5,1,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,No,27,Research & Development,7,0,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [11]:
# Transform String column for Marital Status

def change_string(marital_status):
    if marital_status == "Single":
        return 1
    elif marital_status == "Married":
        return 2
    else: return 0
    
attrition_df["MaritalStatus"] = attrition_df["MaritalStatus"].apply(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,Yes,41,Sales,1,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,No,49,Research & Development,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,Yes,37,Research & Development,4,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,No,33,Research & Development,5,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,No,27,Research & Development,7,0,40,1,2,9,12,3,1,6,3,3,2,2,2


In [12]:
# Transform String column for Deparment

def change_string(dept):
    if dept == "Human Resources":
        return 1
    elif dept == "Research & Development":
        return 2
    elif dept == "Sales":
        return 3
    else: return 0
    
attrition_df["Department"] = attrition_df["Department"].apply(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,Yes,41,3,1,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,No,49,2,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,Yes,37,2,4,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,No,33,2,5,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,No,27,2,7,0,40,1,2,9,12,3,1,6,3,3,2,2,2


In [13]:
#find null values

for column in attrition_df.columns:
    print(f"Column {column} has {attrition_df[column].isnull().sum()} null values")

Column Attrition has 0 null values
Column Age has 0 null values
Column Department has 0 null values
Column EmployeeNumber has 0 null values
Column Gender has 0 null values
Column HourlyRate has 0 null values
Column JobLevel has 0 null values
Column MaritalStatus has 0 null values
Column NumCompaniesWorked has 0 null values
Column PercentSalaryHike has 0 null values
Column PerformanceRating has 0 null values
Column StockOptionLevel has 0 null values
Column TotalWorkingYears has 0 null values
Column TrainingTimesLastYear has 0 null values
Column WorkLifeBalance has 0 null values
Column YearsAtCompany has 0 null values
Column YearsInCurrentRole has 0 null values
Column YearsSinceLastPromotion has 0 null values


In [14]:
#remove the EmployeeID Column because we have index#s

attrition_df.drop(columns=["EmployeeNumber"], inplace=True)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,Yes,41,3,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,No,49,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,Yes,37,2,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,No,33,2,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,No,27,2,0,40,1,2,9,12,3,1,6,3,3,2,2,2


# Split the Data into Training and Testing

In [15]:
# Create our features
columns = ["Age", "Department", "Gender", "HourlyRate","JobLevel", "MaritalStatus", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion" ]

# Create our target
target = ["Attrition"]

In [16]:
# Create our features
X = pd.get_dummies(attrition_df.drop(columns="Attrition"))

# Create our target
y = attrition_df["Attrition"]

In [17]:
X.describe()

Unnamed: 0,Age,Department,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,2.260544,0.4,65.891156,2.063946,1.235374,2.693197,15.209524,3.153741,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755
std,9.135373,0.527792,0.490065,20.329428,1.10694,0.790757,2.498009,3.659938,0.360824,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243
min,18.0,1.0,0.0,30.0,1.0,0.0,0.0,11.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,30.0,2.0,0.0,48.0,1.0,1.0,1.0,12.0,3.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0
50%,36.0,2.0,0.0,66.0,2.0,1.0,2.0,14.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0
75%,43.0,3.0,1.0,83.75,3.0,2.0,4.0,18.0,3.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0
max,60.0,3.0,1.0,100.0,5.0,2.0,9.0,25.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0


In [18]:
# Check the balance of our target values
y.value_counts()

No     1233
Yes     237
Name: Attrition, dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(1102, 16)

# Logistic Regression

In [20]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter({'attrition': 1})

Counter({'attrition': 1})

In [21]:
# Train the Logistic Regression model using the resampled data

model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [22]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

NameError: name 'y_pred' is not defined

In [None]:
# Display the confusion matrix

y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

In [None]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

# Naive Random Oversampling

In [23]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter({'Attrition': 1})

Counter({'Attrition': 1})

In [24]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [25]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

NameError: name 'y_pred' is not defined

In [26]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[187, 122],
       [ 15,  44]], dtype=int64)

In [27]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.93      0.61      0.75      0.73      0.67      0.44       309
        Yes       0.27      0.75      0.61      0.39      0.67      0.46        59

avg / total       0.82      0.63      0.72      0.68      0.67      0.45       368



# SMOTE Oversampling

In [28]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resampled, y_resampled = SMOTE(random_state=1,
sampling_strategy='auto').fit_resample(
   X_train, y_train)

Counter({'Attrition': 1})

Counter({'Attrition': 1})

In [29]:
# Train the Logistic Regression model using the resampled data
modelS = LogisticRegression(solver='lbfgs', random_state=1)
modelS.fit(X_resampled, y_resampled)

y_pred = modelS.predict(X_test)

In [30]:
# Calculated the balanced accuracy score

balanced_accuracy_score(y_test, y_pred)

0.6066041358126268

In [31]:
# Display the confusion matrix

cm2 = confusion_matrix(y_test, y_pred)
print(cm2)

[[223  86]
 [ 30  29]]


In [32]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.88      0.72      0.49      0.79      0.60      0.36       309
        Yes       0.25      0.49      0.72      0.33      0.60      0.35        59

avg / total       0.78      0.68      0.53      0.72      0.60      0.36       368



# Undersampling 

In [33]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
cc = ClusterCentroids(random_state=1)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)
Counter({'Attrition': 1})

Counter({'Attrition': 1})

In [34]:
# Train the Logistic Regression model using the resampled data
modelU = LogisticRegression(solver='lbfgs', random_state=1)
modelU.fit(X_resampled, y_resampled)

y_pred = modelU.predict(X_test)

In [35]:
# Calculated the balanced accuracy score

balanced_accuracy_score(y_test, y_pred)

0.6368548077450497

In [36]:
# Display the confusion matrix

cm3 = confusion_matrix(y_test, y_pred)

print(cm3)

[[116 193]
 [  6  53]]


In [37]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.95      0.38      0.90      0.54      0.58      0.32       309
        Yes       0.22      0.90      0.38      0.35      0.58      0.35        59

avg / total       0.83      0.46      0.81      0.51      0.58      0.33       368



# Combination (Over and Under) Sampling

In [38]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN

smote_enn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smote_enn.fit_resample(X, y)
Counter({'Attrition': 1})

Counter({'Attrition': 1})

In [39]:
# Train the Logistic Regression model using the resampled data
modelENN = LogisticRegression(solver='lbfgs', random_state=1)
modelENN.fit(X_resampled, y_resampled)

In [40]:
# Calculated the balanced accuracy score
y_pred = modelENN.predict(X_test)

balanced_accuracy_score(y_test, y_pred)

0.6778563984422139

In [41]:
# Display the confusion matrix
cm4 = confusion_matrix(y_test, y_pred)
print(cm4)

[[178 131]
 [ 13  46]]


In [42]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.93      0.58      0.78      0.71      0.67      0.44       309
        Yes       0.26      0.78      0.58      0.39      0.67      0.46        59

avg / total       0.82      0.61      0.75      0.66      0.67      0.44       368



# Balanced Random Forest Classifier

In [43]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier

model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
model_fit = model.fit(X_train, y_train)

In [44]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model_fit.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6852613679995612

In [45]:
# Display the confusion matrix
cMatrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cMatrix_df = pd.DataFrame(
    cMatrix, index=["Actual Attrition", "Actual Non-Attrition"], columns=["Predicted Attrition", "Predicted Non-Attrition"])
cMatrix_df

Unnamed: 0,Predicted Attrition,Predicted Non-Attrition
Actual Attrition,214,95
Actual Non-Attrition,19,40


In [46]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.92      0.69      0.68      0.79      0.69      0.47       309
        Yes       0.30      0.68      0.69      0.41      0.69      0.47        59

avg / total       0.82      0.69      0.68      0.73      0.69      0.47       368



# Easy Ensemble AdaBoost Classifier

In [47]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier

eeClass = EasyEnsembleClassifier(n_estimators=100, random_state=1)

# Fit
eeClass.fit(X_train, y_train)

In [48]:
# Calculated the balanced accuracy score

y_pred = eeClass.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6767045142888486

In [49]:
# Display the confusion matrix

# Calculating the confusion matrix
cMatrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cMatrix_df = pd.DataFrame(
    cMatrix, index=["Actual Attrition", "Actual Non-Attrition"], columns=["Predicted Attrition", "Predicted Non-Attrition"])
cMatrix_df

Unnamed: 0,Predicted Attrition,Predicted Non-Attrition
Actual Attrition,193,116
Actual Non-Attrition,16,43


In [50]:
# Print the imbalanced classification report

print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

         No       0.92      0.62      0.73      0.75      0.67      0.45       309
        Yes       0.27      0.73      0.62      0.39      0.67      0.46        59

avg / total       0.82      0.64      0.71      0.69      0.67      0.45       368



In [51]:
# List the features sorted in descending order by feature importance

features_rank = sorted(zip(model.feature_importances_, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

Age: (0.11684352428497774)
HourlyRate: (0.11636970580674827)
YearsAtCompany: (0.09117741703398999)
TotalWorkingYears: (0.09036411269491801)
PercentSalaryHike: (0.07665296332363561)
NumCompaniesWorked: (0.06989329360529674)
StockOptionLevel: (0.06501756408479857)
YearsInCurrentRole: (0.062185893571234475)
JobLevel: (0.06211060560507296)
TrainingTimesLastYear: (0.05240617184043876)
YearsSinceLastPromotion: (0.04605463800924101)
WorkLifeBalance: (0.04214013699530177)
MaritalStatus: (0.036445695162181165)
Department: (0.033732310747454364)
Gender: (0.026861922791159953)
PerformanceRating: (0.011744044443550605)
