# Import libraries

In [542]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Import data

In [543]:
# Load the dataset
df = pd.read_csv('D:\workplace\ML\HR_Analytics\HR_analytics\general_data.csv')

# Show the dataset
df.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


# Data preprocessing

## Summary of dataset

In [544]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4410 entries, 0 to 4409
Data columns (total 24 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Age                      4410 non-null   int64  
 1   Attrition                4410 non-null   object 
 2   BusinessTravel           4410 non-null   object 
 3   Department               4410 non-null   object 
 4   DistanceFromHome         4410 non-null   int64  
 5   Education                4410 non-null   int64  
 6   EducationField           4410 non-null   object 
 7   EmployeeCount            4410 non-null   int64  
 8   EmployeeID               4410 non-null   int64  
 9   Gender                   4410 non-null   object 
 10  JobLevel                 4410 non-null   int64  
 11  JobRole                  4410 non-null   object 
 12  MaritalStatus            4410 non-null   object 
 13  MonthlyIncome            4410 non-null   int64  
 14  NumCompaniesWorked      

## Remove missing values

In [545]:
df = df.dropna()

## Summary of statistic

In [546]:
df.describe()

Unnamed: 0,Age,DistanceFromHome,Education,EmployeeCount,EmployeeID,JobLevel,MonthlyIncome,NumCompaniesWorked,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
count,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0,4382.0
mean,36.933364,9.198996,2.912369,1.0,2207.804884,2.063898,65061.702419,2.693291,15.210634,8.0,0.794614,11.290278,2.798266,7.010497,2.191693,4.126198
std,9.137272,8.105396,1.024728,0.0,1271.688783,1.106115,47142.310175,2.497832,3.663007,0.0,0.852397,7.785717,1.289402,6.129351,3.224994,3.569674
min,18.0,1.0,1.0,1.0,1.0,1.0,10090.0,0.0,11.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0,2.0,2.0,1.0,1108.25,1.0,29110.0,1.0,12.0,8.0,0.0,6.0,2.0,3.0,0.0,2.0
50%,36.0,7.0,3.0,1.0,2208.5,2.0,49190.0,2.0,14.0,8.0,1.0,10.0,3.0,5.0,1.0,3.0
75%,43.0,14.0,4.0,1.0,3308.75,3.0,83790.0,4.0,18.0,8.0,1.0,15.0,3.0,9.0,3.0,7.0
max,60.0,29.0,5.0,1.0,4409.0,5.0,199990.0,9.0,25.0,8.0,3.0,40.0,6.0,40.0,15.0,17.0


## Label encoder

In [547]:
encoder = LabelEncoder()

for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = encoder.fit_transform(df[col])

## Data splitting

In [548]:
# Split the dataframe into 3 datasets : training (70%), test (20%), valid(10%)
train, test, valid = np.split(np.array(df), [(int)(.7*len(df)), (int)(.9*len(df))])

X_train, X_test, X_valid = np.delete(train, 1, axis=1), np.delete(test, 1, axis=1), np.delete(train, 1, axis=1)
y_train, y_test, y_valid = train[:,1], test[:,1], valid[:,1]

## Standard scaler

In [549]:
sscaler = StandardScaler()
X_train = sscaler.fit_transform(X_train)

# Build the model

In [550]:
def sigmoid(X):
    return 1/(1 + np.exp(-X))


class Logistic_Regression:
    def __init__(self):
        self.weights = None
        self.bias = None
    
    def fit(self,X,y): # Override
        pass
 
    def predict(self, X): # Inherited
        y_predict = sigmoid(X.dot(self.weights) + self.bias)
        return np.array([0 if y <= 0.5 else 1 for y in y_predict])
    
class Stochastic_Gradient_Descent(Logistic_Regression):
    def __init__(self, lr=0.01, max_iter=200, random_state=21522832):
        super().__init__()
        self.lr = lr                        # Learning rate
        self.max_iter = max_iter            # Max iteration
        self.random_state = random_state
        self.cost_trend = []                # Visualize the cost/loss function
    
    def fit(self,X,y):
        rgen = np.random.RandomState(seed=self.random_state)
        weights = rgen.normal(loc=0.0, scale=0.1, size=X.shape[1] + 1)
        self.weights = weights[1:]
        self.bias = weights[0]

        for _ in range(self.max_iter):
            for i,x in enumerate(X):
                y_predict = sigmoid(x.dot(self.weights) + self.bias)
                diff = y[i] - y_predict
                self.bias += self.lr * diff
                self.weights += self.lr * x * diff 

In [551]:
LgR_model = Stochastic_Gradient_Descent(lr = 1e-2, max_iter = 100, random_state = 21522832)
LgR_model.fit(X_train,y_train)

In [552]:
def accuracy (y_predict, y_test):
    return np.sum(y_predict == y_test) / len(y_test)

y_predict_LgR = LgR_model.predict(X_test)
print('Accuracy of LgR_model: ', round(accuracy(y_test,y_predict_LgR),6))

Accuracy of LgR_model:  0.828767


  return 1/(1 + np.exp(-X))
