In [20]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# # Dependencies for interaction with database:
# from sqlalchemy import create_engine
# from sqlalchemy.orm import Session


# Machine Learning dependencies:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Validation libraries
from sklearn import metrics
from sklearn.metrics import accuracy_score, mean_squared_error, precision_recall_curve
from sklearn.model_selection import cross_val_score

from collections import Counter
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from imblearn.metrics import classification_report_imbalanced
from imblearn.over_sampling import RandomOverSampler


from config import db_password
import psycopg2




In [21]:
url = f"postgres://postgres:{db_password}@127.0.0.1:543/Employee_Attrition"


In [22]:
def connect(url):
    # Connect to the server
    conn = None
    try:
        print('Connecting...')
        conn = psycopg2.connect(url)
    except (Exception, psycopg2.DatabaseError) as error:
        print(error) 
    print("Connected")
    return conn

In [23]:
#A function to conect a db and bring in Data into a panda df 
def sql_to_df(conn, select_query, column_names):
    cursor = conn.cursor()
    try:
        cursor.execute(select_query)
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error")
        cursor.close()
        return 0
    
    # Turn tupples into a df
    tupples = cursor.fetchall()
    cursor.close()
    
    df = pd.DataFrame(tupples, columns=column_names)
    return df

In [24]:
# Connect to the database
conn = connect(url)
column_names = ["Attrition", "Age", "Department", "EmployeeNumber", "Gender", "HourlyRate","JobLevel", "MaritalStatus", "NumCompaniesWorked", "PercentSalaryHike", "PerformanceRating", "StockOptionLevel", "TotalWorkingYears", "TrainingTimesLastYear", "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole", "YearsSinceLastPromotion" ]
# Execute the "SELECT *" query
attrition_df = sql_to_df(conn, "select * FROM joint_table", column_names)
attrition_df.head()

Connecting...
Connected


Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,Yes,41,Sales,1,Female,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,No,49,Research & Development,2,Male,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,Yes,37,Research & Development,4,Male,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,No,33,Research & Development,5,Female,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,No,27,Research & Development,7,Male,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [25]:
#find null values
for column in attrition_df.columns:
    print(f"Column {column} has {attrition_df[column].isnull().sum()} null values")

Column Attrition has 0 null values
Column Age has 0 null values
Column Department has 0 null values
Column EmployeeNumber has 0 null values
Column Gender has 0 null values
Column HourlyRate has 0 null values
Column JobLevel has 0 null values
Column MaritalStatus has 0 null values
Column NumCompaniesWorked has 0 null values
Column PercentSalaryHike has 0 null values
Column PerformanceRating has 0 null values
Column StockOptionLevel has 0 null values
Column TotalWorkingYears has 0 null values
Column TrainingTimesLastYear has 0 null values
Column WorkLifeBalance has 0 null values
Column YearsAtCompany has 0 null values
Column YearsInCurrentRole has 0 null values
Column YearsSinceLastPromotion has 0 null values


In [26]:
# Display datatypes
attrition_df.dtypes

Attrition                  object
Age                         int64
Department                 object
EmployeeNumber              int64
Gender                     object
HourlyRate                  int64
JobLevel                    int64
MaritalStatus              object
NumCompaniesWorked          int64
PercentSalaryHike           int64
PerformanceRating           int64
StockOptionLevel            int64
TotalWorkingYears           int64
TrainingTimesLastYear       int64
WorkLifeBalance             int64
YearsAtCompany              int64
YearsInCurrentRole          int64
YearsSinceLastPromotion     int64
dtype: object

# Data Cleaning

In [27]:
# Transform String column for Attrition
def change_string(attrition):
    if attrition == "Yes":
        return 1
    else: return 0
    
attrition_df["Attrition"] = attrition_df["Attrition"].apply(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,Sales,1,Female,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,0,49,Research & Development,2,Male,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,1,37,Research & Development,4,Male,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,0,33,Research & Development,5,Female,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,0,27,Research & Development,7,Male,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [28]:
# Transform String column for Gender
def change_string(gender):
    if gender == "Female":
        return 1
    else: return 0
    
attrition_df["Gender"] = attrition_df["Gender"].apply(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,Sales,1,1,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,0,49,Research & Development,2,0,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,1,37,Research & Development,4,0,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,0,33,Research & Development,5,1,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,0,27,Research & Development,7,0,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [29]:
# Transform String column for Marital Status
def change_string(marital_status):
    if marital_status == "Single":
        return 1
    elif marital_status == "Married":
        return 2
    else: return 0
    
attrition_df["MaritalStatus"] = attrition_df["MaritalStatus"].apply(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,Sales,1,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,0,49,Research & Development,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,1,37,Research & Development,4,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,0,33,Research & Development,5,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,0,27,Research & Development,7,0,40,1,2,9,12,3,1,6,3,3,2,2,2


In [30]:
# Transform String column for Department
def change_string(dept):
    if dept == "Human Resources":
        return 1
    elif dept == "Research & Development":
        return 2
    elif dept == "Sales":
        return 3
    else: return 0
    
attrition_df["Department"] = attrition_df["Department"].apply(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,3,1,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,0,49,2,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,1,37,2,4,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,0,33,2,5,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,0,27,2,7,0,40,1,2,9,12,3,1,6,3,3,2,2,2


In [None]:
#find null values
for column in attrition_df.columns:
    print(f"Column {column} has {attrition_df[column].isnull().sum()} null values")

In [31]:
#remove the EmployeeID Column because we have index#s
attrition_df.drop(columns=["EmployeeNumber"], inplace=True)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,3,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,0,49,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,1,37,2,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,0,33,2,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,0,27,2,0,40,1,2,9,12,3,1,6,3,3,2,2,2


# Split the Data into Training and Testing

In [33]:
# Create our features
X = pd.get_dummies(attrition_df.drop(columns="Attrition"))
X = pd.get_dummies(X)

# Create our target
y = attrition_df["Attrition"]

In [34]:
X.describe()

Unnamed: 0,Age,Department,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
count,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0,1470.0
mean,36.92381,2.260544,0.4,65.891156,2.063946,1.235374,2.693197,15.209524,3.153741,0.793878,11.279592,2.79932,2.761224,7.008163,4.229252,2.187755
std,9.135373,0.527792,0.490065,20.329428,1.10694,0.790757,2.498009,3.659938,0.360824,0.852077,7.780782,1.289271,0.706476,6.126525,3.623137,3.22243
min,18.0,1.0,0.0,30.0,1.0,0.0,0.0,11.0,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,30.0,2.0,0.0,48.0,1.0,1.0,1.0,12.0,3.0,0.0,6.0,2.0,2.0,3.0,2.0,0.0
50%,36.0,2.0,0.0,66.0,2.0,1.0,2.0,14.0,3.0,1.0,10.0,3.0,3.0,5.0,3.0,1.0
75%,43.0,3.0,1.0,83.75,3.0,2.0,4.0,18.0,3.0,1.0,15.0,3.0,3.0,9.0,7.0,3.0
max,60.0,3.0,1.0,100.0,5.0,2.0,9.0,25.0,4.0,3.0,40.0,6.0,4.0,40.0,18.0,15.0


In [35]:
# Check the balance of our target values
y.value_counts()

0    1233
1     237
Name: Attrition, dtype: int64

In [37]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train,y_test = train_test_split(X, y, random_state=1, stratify=y)
X_train.shape

(1102, 16)

# Logistic Regression

In [43]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
Counter({'attrition': 1})

Counter({'attrition': 1})

In [44]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [45]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(y_test, y_pred)

0.6774724370577587

In [46]:
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[183, 126],
       [ 14,  45]])

In [47]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.59      0.76      0.72      0.67      0.44       309
          1       0.26      0.76      0.59      0.39      0.67      0.46        59

avg / total       0.82      0.62      0.74      0.67      0.67      0.45       368



# Naive Random Oversampling

In [48]:
# Resample the training data with the RandomOversampler
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=1)
X_resample, y_resample = ros.fit_resample(X_train, y_train)
Counter({"Attrition":1})

Counter({'Attrition': 1})

In [49]:
# Train the Logistic Regression model using the resampled data
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

In [50]:
# Calculated the balanced accuracy score
from sklearn.metrics import balanced_accuracy_score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6774724370577587

In [51]:
# Display the confusion matrix
from sklearn.metrics import confusion_matrix
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[183, 126],
       [ 14,  45]])

In [52]:
# Print the imbalanced classification report
from imblearn.metrics import classification_report_imbalanced
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.59      0.76      0.72      0.67      0.44       309
          1       0.26      0.76      0.59      0.39      0.67      0.46        59

avg / total       0.82      0.62      0.74      0.67      0.67      0.45       368



# SMOTE Oversampling

In [82]:
# Resample the training data with SMOTE
from imblearn.over_sampling import SMOTE
X_resample, y_resample = SMOTE(random_state=1, sampling_strategy='auto').fit_resample(X_train, y_train)
Counter({"Attrition":1})

Counter({'Attrition': 1})

In [83]:
# Train the Logistic Regression model using the resampled data
modelS = LogisticRegression(solver='lbfgs', random_state=1)
modelS.fit(X_resample, y_resample)

In [84]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.603724425429214

In [85]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[106, 203],
       [  8,  51]])

In [86]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.34      0.86      0.50      0.54      0.28       309
          1       0.20      0.86      0.34      0.33      0.54      0.31        59

avg / total       0.81      0.43      0.78      0.47      0.54      0.29       368



# Undersampling

In [93]:
# Resample the data using the ClusterCentroids resampler
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.under_sampling import ClusterCentroids
Clusterc = ClusterCentroids(random_state=1)
X_resample, y_resample = Clusterc.fit_resample(X_train, y_train)
Counter({"Atrrition":1})

Counter({'Atrrition': 1})

In [98]:
# Train the Logistic Regression model using the resampled data
modelU.fit(X_resampled, y_resampled)

In [99]:
balanced_accuracy_score(y_test, y_pred)

0.603724425429214

In [100]:
y_pred = model.predict(X_test)
confusion_matrix(y_test, y_pred)

array([[106, 203],
       [  8,  51]])

In [101]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.93      0.34      0.86      0.50      0.54      0.28       309
          1       0.20      0.86      0.34      0.33      0.54      0.31        59

avg / total       0.81      0.43      0.78      0.47      0.54      0.29       368



# Combination (Over and Under) Sampling

In [102]:
# Resample the training data with SMOTEENN
# Warning: This is a large dataset, and this step may take some time to complete
from imblearn.combine import SMOTEENN
smote = SMOTEENN(random_state=1)
X_resample, y_resample = smote.fit_resample(X, y)
Counter({"Attrition":1})

Counter({'Attrition': 1})

In [103]:
# Train the Logistic Regression model using the resampled data
modelC = LogisticRegression(solver='lbfgs', random_state=1)
modelC.fit(X_resample, y_resample)

In [104]:
y_pred = modelC.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6576709999451484

In [105]:
# Display the confusion matrix
confusion_matrix(y_test, y_pred)

array([[176, 133],
       [ 15,  44]])

In [106]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.57      0.75      0.70      0.65      0.42       309
          1       0.25      0.75      0.57      0.37      0.65      0.43        59

avg / total       0.81      0.60      0.72      0.65      0.65      0.42       368



# Balanced Random Forest Classifier

In [107]:
# Resample the training data with the BalancedRandomForestClassifier
from imblearn.ensemble import BalancedRandomForestClassifier
rf_model = BalancedRandomForestClassifier(n_estimators=100, random_state=1)
rf_model = rf_model.fit(X_train, y_train)

In [108]:
# Calculated the balanced accuracy score
y_pred = rf_model.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6461247325983215

In [109]:
# Display the confusion matrix
cMatrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cMatrix_df = pd.DataFrame(
    cMatrix, index=["Actual Attrition", "Actual Non-Attrition"], columns=["Predicted Attrition", "Predicted Non-Attrition"])
cMatrix_df

Unnamed: 0,Predicted Attrition,Predicted Non-Attrition
Actual Attrition,216,93
Actual Non-Attrition,24,35


In [110]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.90      0.70      0.59      0.79      0.64      0.42       309
          1       0.27      0.59      0.70      0.37      0.64      0.41        59

avg / total       0.80      0.68      0.61      0.72      0.64      0.42       368



# Easy Ensemble AdaBoost Classifier

In [111]:
# Train the EasyEnsembleClassifier
from imblearn.ensemble import EasyEnsembleClassifier
eec = EasyEnsembleClassifier(n_estimators=100, random_state=1)
eec = eec.fit(X_train, y_train)
eec

In [112]:
# Calculated the balanced accuracy score
y_pred = eec.predict(X_test)
balanced_accuracy_score(y_test, y_pred)

0.6783226372661949

In [113]:
# Display the confusion matrix
cMatrix = confusion_matrix(y_test, y_pred)

# Create a DataFrame from the confusion matrix.
cMatrix_df = pd.DataFrame(
    cMatrix, index=["Actual Attrition", "Actual Non-Attrition"], columns=["Predicted Attrition", "Predicted Non-Attrition"])
cMatrix_df

Unnamed: 0,Predicted Attrition,Predicted Non-Attrition
Actual Attrition,194,115
Actual Non-Attrition,16,43


In [114]:
# Print the imbalanced classification report
print(classification_report_imbalanced(y_test, y_pred))

                   pre       rec       spe        f1       geo       iba       sup

          0       0.92      0.63      0.73      0.75      0.68      0.45       309
          1       0.27      0.73      0.63      0.40      0.68      0.46        59

avg / total       0.82      0.64      0.71      0.69      0.68      0.45       368



In [115]:
# List the features sorted in descending order by feature importance
features_rank = sorted(zip(model.feature_importances_, X.columns), reverse=True)
for feature in features_rank:
    print(f"{feature[1]}: ({feature[0]})")

AttributeError: 'LogisticRegression' object has no attribute 'feature_importances_'