In [4]:
import warnings
warnings.filterwarnings('ignore')

from collections import Counter
from pathlib import Path
from urllib.parse import quote

import numpy as np
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_password

# Connecting to the Database & Data Cleaning

In [5]:
# Build the PostgreSQL connection URI. The password is percent-encoded so that
# reserved characters in it (e.g. "@", "/", ":", "%") cannot corrupt the URI.
url = f"postgres://postgres:{quote(str(db_password), safe='')}@127.0.0.1:5433/Employee_Attrition"

In [6]:
def connect(url):
    """Open a psycopg2 connection to the PostgreSQL server at ``url``.

    Parameters
    ----------
    url : str
        Connection string handed unchanged to ``psycopg2.connect``.

    Returns
    -------
    connection or None
        The open connection on success. On failure the error is printed
        (not raised) and ``None`` is returned.
    """
    print('Connecting...')
    try:
        conn = psycopg2.connect(url)
    except Exception as error:
        # Best-effort by design: report the problem and signal failure with None.
        print(error)
        return None
    # Only announce success when a connection was actually established.
    print("Connected")
    return conn

In [7]:
# A function to run a query on an open connection and load the result into a pandas DataFrame
def sql_to_df(conn, select_query, column_names):
    """Execute ``select_query`` on ``conn`` and return the rows as a DataFrame.

    Parameters
    ----------
    conn : connection
        Open database connection providing ``cursor()`` and ``rollback()``.
    select_query : str
        SQL statement to execute.
    column_names : list of str
        Labels assigned, in order, to the columns of the fetched rows.

    Returns
    -------
    pandas.DataFrame or int
        The fetched rows as a DataFrame. If executing the query fails, the
        error is printed and the sentinel ``0`` is returned.
    """
    cursor = conn.cursor()
    try:
        try:
            cursor.execute(select_query)
        except Exception as error:
            # Surface the actual failure instead of a bare "Error".
            print(f"Error: {error}")
            # A failed statement leaves the transaction aborted; roll back so
            # the connection can be reused for further queries.
            try:
                conn.rollback()
            except Exception as rollback_error:
                print(f"Rollback failed: {rollback_error}")
            return 0

        # Turn the fetched tuples into a DataFrame
        rows = cursor.fetchall()
    finally:
        # Always release the cursor, including when fetchall() raises.
        cursor.close()

    return pd.DataFrame(rows, columns=column_names)

In [8]:
# Open the database connection
conn = connect(url)

# Labels applied, in order, to the columns returned from joint_table
column_names = [
    "Attrition",
    "Age",
    "Department",
    "EmployeeNumber",
    "Gender",
    "HourlyRate",
    "JobLevel",
    "MaritalStatus",
    "NumCompaniesWorked",
    "PercentSalaryHike",
    "PerformanceRating",
    "StockOptionLevel",
    "TotalWorkingYears",
    "TrainingTimesLastYear",
    "WorkLifeBalance",
    "YearsAtCompany",
    "YearsInCurrentRole",
    "YearsSinceLastPromotion",
]

# Run the "SELECT *" query and preview the resulting DataFrame
attrition_df = sql_to_df(
    conn,
    "select * FROM joint_table",
    column_names,
)
attrition_df.head()

Connecting...
Connected


Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,Yes,41,Sales,1,Female,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,No,49,Research & Development,2,Male,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,Yes,37,Research & Development,4,Male,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,No,33,Research & Development,5,Female,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,No,27,Research & Development,7,Male,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [10]:
# Report how many null values each column contains

for column, null_count in attrition_df.isnull().sum().items():
    print(f"Column {column} has {null_count} null values")

Column Attrition has 0 null values
Column Age has 0 null values
Column Department has 0 null values
Column EmployeeNumber has 0 null values
Column Gender has 0 null values
Column HourlyRate has 0 null values
Column JobLevel has 0 null values
Column MaritalStatus has 0 null values
Column NumCompaniesWorked has 0 null values
Column PercentSalaryHike has 0 null values
Column PerformanceRating has 0 null values
Column StockOptionLevel has 0 null values
Column TotalWorkingYears has 0 null values
Column TrainingTimesLastYear has 0 null values
Column WorkLifeBalance has 0 null values
Column YearsAtCompany has 0 null values
Column YearsInCurrentRole has 0 null values
Column YearsSinceLastPromotion has 0 null values


In [11]:
# Transform String column for Attrition

def change_string(attrition):
    """Encode an Attrition label as an integer flag.

    "Yes" maps to 1 and every other value maps to 0. An already-encoded 1 is
    passed through unchanged, so re-running the cell on the encoded column
    does not reset every row to 0.
    """
    if attrition in ("Yes", 1):
        return 1
    return 0
    
# Replace the Attrition labels with their integer codes, then preview the frame
attrition_df["Attrition"] = attrition_df["Attrition"].map(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,Sales,1,Female,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,0,49,Research & Development,2,Male,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,1,37,Research & Development,4,Male,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,0,33,Research & Development,5,Female,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,0,27,Research & Development,7,Male,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [12]:
# Transform String column for Gender

def change_string(gender):
    """Encode a Gender label as an integer flag.

    "Female" maps to 1 and every other value maps to 0. An already-encoded 1
    is passed through unchanged, so re-running the cell on the encoded column
    does not reset every row to 0.
    """
    if gender in ("Female", 1):
        return 1
    return 0
    
# Replace the Gender labels with their integer codes, then preview the frame
attrition_df["Gender"] = attrition_df["Gender"].map(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,Sales,1,1,94,2,Single,8,11,3,0,8,0,1,6,4,0
1,0,49,Research & Development,2,0,61,2,Married,1,23,4,1,10,3,3,10,7,1
2,1,37,Research & Development,4,0,92,1,Single,6,15,3,0,7,3,3,0,0,0
3,0,33,Research & Development,5,1,56,1,Married,1,11,3,0,8,3,3,8,7,3
4,0,27,Research & Development,7,0,40,1,Married,9,12,3,1,6,3,3,2,2,2


In [14]:
# Transform String column for Marital Status

def change_string(marital_status):
    """Encode a MaritalStatus label as an integer code.

    "Single" maps to 1, "Married" maps to 2 and every other value maps to 0.
    Already-encoded 1 and 2 are passed through unchanged, so re-running the
    cell on the encoded column does not reset every row to 0.
    """
    if marital_status in ("Single", 1):
        return 1
    if marital_status in ("Married", 2):
        return 2
    return 0
    
# Replace the MaritalStatus labels with their integer codes, then preview the frame
attrition_df["MaritalStatus"] = attrition_df["MaritalStatus"].map(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,Sales,1,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,0,49,Research & Development,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,1,37,Research & Development,4,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,0,33,Research & Development,5,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,0,27,Research & Development,7,0,40,1,2,9,12,3,1,6,3,3,2,2,2


In [19]:
# Transform String column for Department

def change_string(dept):
    """Encode a Department label as an integer code.

    "Human Resources" maps to 1, "Research & Development" to 2, "Sales" to 3
    and every other value to 0. Already-encoded 1, 2 and 3 are passed through
    unchanged, so re-running the cell on the encoded column does not reset
    every row to 0.
    """
    if dept in ("Human Resources", 1):
        return 1
    if dept in ("Research & Development", 2):
        return 2
    if dept in ("Sales", 3):
        return 3
    return 0
    
# Replace the Department labels with their integer codes, then preview the frame
attrition_df["Department"] = attrition_df["Department"].map(change_string)
attrition_df.head()

Unnamed: 0,Attrition,Age,Department,EmployeeNumber,Gender,HourlyRate,JobLevel,MaritalStatus,NumCompaniesWorked,PercentSalaryHike,PerformanceRating,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion
0,1,41,3,1,1,94,2,1,8,11,3,0,8,0,1,6,4,0
1,0,49,2,2,0,61,2,2,1,23,4,1,10,3,3,10,7,1
2,1,37,2,4,0,92,1,1,6,15,3,0,7,3,3,0,0,0
3,0,33,2,5,1,56,1,2,1,11,3,0,8,3,3,8,7,3
4,0,27,2,7,0,40,1,2,9,12,3,1,6,3,3,2,2,2


In [None]:
# Remove the EmployeeNumber column because we already have index numbers

#attrition_df.drop(columns=["EmployeeNumber"], inplace=True)
#attrition_df.head()

# Split the Data into Training and Testing

In [None]:
# Create our features


# Create our target



In [None]:
# NOTE(review): X is not assigned in the cells shown above (the "Create our
# features" cell holds only placeholder comments) — confirm it is defined
# before this cell runs.
X.describe()

In [1]:
# Check the balance of our target values