Load Database


In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn import preprocessing, model_selection
from sklearn.linear_model import LinearRegression

In [4]:
df = pd.read_excel('/content/sample_data/MFGEmployees.xlsx')

Data Exploration


In [5]:
df.head()

Unnamed: 0,EmployeeNumber,Surname,GivenName,Gender,City,JobTitle,DepartmentName,StoreLocation,Division,BusinessUnit,Age,LengthService,AbsentHours
0,1,Gutierrez,Molly,F,Burnaby,Baker,Bakery,Burnaby,Stores,Stores,32,6,36.577306
1,2,Hardwick,Stephen,M,Courtenay,Baker,Bakery,Nanaimo,Stores,Stores,40,5,30.165072
2,3,Delgado,Chester,M,Richmond,Baker,Bakery,Richmond,Stores,Stores,48,4,83.807798
3,4,Simon,Irene,F,Victoria,Baker,Bakery,Victoria,Stores,Stores,44,3,70.020165
4,5,Delvalle,Edward,M,New Westminster,Baker,Bakery,New Westminster,Stores,Stores,35,3,0.0


In [10]:
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8336 entries, 0 to 8335
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   EmployeeNumber  8336 non-null   int64  
 1   Surname         8336 non-null   object 
 2   GivenName       8336 non-null   object 
 3   Gender          8336 non-null   object 
 4   City            8336 non-null   object 
 5   JobTitle        8336 non-null   object 
 6   DepartmentName  8336 non-null   object 
 7   StoreLocation   8336 non-null   object 
 8   Division        8336 non-null   object 
 9   BusinessUnit    8336 non-null   object 
 10  Age             8336 non-null   int64  
 11  LengthService   8336 non-null   int64  
 12  AbsentHours     8336 non-null   float64
dtypes: float64(1), int64(3), object(9)
memory usage: 846.8+ KB


In [6]:
df.describe()

Unnamed: 0,EmployeeNumber,Age,LengthService,AbsentHours
count,8336.0,8336.0,8336.0,8336.0
mean,4168.5,41.502039,4.28131,61.283978
std,2406.540255,9.948626,2.477186,49.038365
min,1.0,3.0,0.0,0.0
25%,2084.75,35.0,3.0,19.12759
50%,4168.5,42.0,4.0,56.005808
75%,6252.25,48.0,5.0,94.284692
max,8336.0,77.0,43.0,272.530123


In [11]:
#create a mapping of unique values in a given field to integers and create new columns of mapped integers
def mapper(field, new_field):
    keys = field.unique()
    dicts = dict(zip(keys, range(len(keys))))
    print(dicts)
    df[new_field] = field.map(dicts).astype(int)

mapper(df.Gender, 'MappedGender')
mapper(df.City, 'MappedCity')
mapper(df.JobTitle, 'MappedTitle')
mapper(df.DepartmentName, 'MappedDept')
mapper(df.StoreLocation, 'MappedStoreLoc')
mapper(df.Division, 'MappedDivision')

{'F': 0, 'M': 1}
{'Burnaby': 0, 'Courtenay': 1, 'Richmond': 2, 'Victoria': 3, 'New Westminster': 4, 'Vancouver': 5, 'Sechelt': 6, 'Kamloops': 7, 'North Vancouver': 8, 'Vananda': 9, 'West Vancouver': 10, 'Nanaimo': 11, 'Aldergrove': 12, 'Kelowna': 13, 'Trail': 14, 'Penticton': 15, 'Duncan': 16, 'Crawford Bay': 17, 'Port Hardy': 18, 'Logan Lake': 19, 'Abbotsford': 20, 'Dawson Creek': 21, 'Surrey': 22, 'Squamish': 23, 'Good Hope Lake': 24, 'Sidney': 25, 'Coquitlam': 26, 'Chilliwack': 27, 'Okanagan Mission': 28, 'Ganges': 29, 'Prince George': 30, 'Atlin': 31, 'Whistler': 32, 'Spences Bridge': 33, 'New Westminister': 34, 'Gibsons': 35, 'Vernon': 36, 'Fauquier': 37, 'Mackenzie': 38, 'Gold Bridge': 39, 'Fort Fraser': 40, 'Nelson': 41, 'Kaslo': 42, 'Creston': 43, 'Wynndel': 44, 'Muncho Lake': 45, 'Kitimat': 46, 'Lac La Hache': 47, 'Armstrong': 48, 'Quesnel': 49, 'Hixon': 50, 'Ocean Falls': 51, 'Vallican': 52, 'North Pender Island': 53, 'Montney': 54, 'Burns Lake': 55, 'Midway': 56, 'Westwold':

Regression Analysis

In [12]:
print(df.columns)

Index(['EmployeeNumber', 'Surname', 'GivenName', 'Gender', 'City', 'JobTitle',
       'DepartmentName', 'StoreLocation', 'Division', 'BusinessUnit', 'Age',
       'LengthService', 'AbsentHours', 'MappedGender', 'MappedCity',
       'MappedTitle', 'MappedDept', 'MappedStoreLoc', 'MappedDivision'],
      dtype='object')


In [15]:
num_feat = list(df.drop(columns=["Surname","GivenName","Gender","City","JobTitle","DepartmentName","StoreLocation","Division","BusinessUnit"]))
num_feat

['EmployeeNumber',
 'Age',
 'LengthService',
 'AbsentHours',
 'MappedGender',
 'MappedCity',
 'MappedTitle',
 'MappedDept',
 'MappedStoreLoc',
 'MappedDivision']

Test for Normal Distribution

In [16]:
from scipy.stats import normaltest


In [17]:
for i in num_feat:
  stats, pval=normaltest(df[i])
  if pval > 0.05:
    print(i,': Normal Distributed')
  else:
    print(i,': Not Normal Distributed')

EmployeeNumber : Not Normal Distributed
Age : Normal Distributed
LengthService : Not Normal Distributed
AbsentHours : Not Normal Distributed
MappedGender : Not Normal Distributed
MappedCity : Not Normal Distributed
MappedTitle : Not Normal Distributed
MappedDept : Not Normal Distributed
MappedStoreLoc : Not Normal Distributed
MappedDivision : Not Normal Distributed


Defining Y and X

In [19]:
y = df["AbsentHours"]
y

0        36.577306
1        30.165072
2        83.807798
3        70.020165
4         0.000000
           ...    
8331     93.665111
8332      0.000000
8333    176.356940
8334     60.321917
8335    112.023389
Name: AbsentHours, Length: 8336, dtype: float64

In [25]:
x = df.drop(columns=["EmployeeNumber","Surname","GivenName","Gender","City","JobTitle","DepartmentName","StoreLocation","Division","BusinessUnit","AbsentHours"])
x

Unnamed: 0,Age,LengthService,MappedGender,MappedCity,MappedTitle,MappedDept,MappedStoreLoc,MappedDivision
0,32,6,0,0,0,0,0,0
1,40,5,1,1,0,0,1,0
2,48,4,1,2,0,0,2,0
3,44,3,0,3,0,0,3,0
4,35,3,1,4,0,0,4,0
...,...,...,...,...,...,...,...,...
8331,46,4,0,63,7,6,29,0
8332,34,2,1,54,7,6,24,0
8333,58,4,0,10,7,6,6,0
8334,43,6,0,5,13,12,5,0


Standardize the Inputs

In [27]:
# prompt: standardize the x

from sklearn.preprocessing import StandardScaler

x = StandardScaler().fit_transform(x)

Training ML model

In [29]:
from sklearn.model_selection import train_test_split

In [51]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
model = LinearRegression().fit(x_train, y_train)


In [52]:
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(6668, 8) (1668, 8) (6668,) (1668,)


Run Linear Regression


In [31]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_train, y_train)

Showing the R2 Score of the Model

In [53]:
from sklearn.metrics import r2_score

y_pred = model.predict(x_test)
r2_score = r2_score(y_test, y_pred)
print("R2 Score:", r2_score)


R2 Score: 0.702451175728555


Showing the Mean Absolute Error of the Model

In [54]:
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error:", mae)


Mean Absolute Error: 20.274307847052594


Showing the Coefficient of the Model

In [80]:
model.coef_

array([41.67445105, -2.68548638, -8.21798198,  0.12998078, -0.76348627,
        0.71981341, -0.3439691 , -2.43321743])

In [79]:
import matplotlib.pyplot as plt
import statsmodels.api as sm

Displaying the Linear Regression Equation of the Model

In [84]:
print("Linear Regression Equation:")
for i in range(len(model.coef_)):
  print(f"{model.coef_[i]:.3f} * {num_feat[i]} + ", end="")
print(f"{model.intercept_:.3f}")


Linear Regression Equation:
41.674 * EmployeeNumber + -2.685 * Age + -8.218 * LengthService + 0.130 * AbsentHours + -0.763 * MappedGender + 0.720 * MappedCity + -0.344 * MappedTitle + -2.433 * MappedDept + 61.209
