# Exercise: Multiple Linear Regression for Employee Salary Prediction

Goal: Build a complete multiple linear regression model that predicts employee salaries from several features, including a categorical variable (department), then compare the predicted salaries with the actual ones and inspect the fitted coefficients.

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [2]:
# Read the employee records from the CSV file into a DataFrame.
dataset = pd.read_csv(filepath_or_buffer='employee_dataset.csv')

In [3]:
# Show the loaded table to check its columns and values.
print(dataset)

    Years_Experience  Skills_Score  Projects_Completed   Department  Salary
0                  6            61                  42           HR   70755
1                  3            92                  28           HR   78045
2                 12            41                  16  Engineering   71272
3                 14            63                  15           HR   74250
4                 10            83                  47           HR   92921
5                  7            69                  44        Sales   80085
6                 12            77                   3  Engineering   84359
7                  4            41                  37        Sales   56958
8                  6            99                   7        Sales   80208
9                  9            60                  21        Sales   74703
10                 2            72                   9        Sales   64446
11                 6            51                  39        Sales   71863
12          

## Creating the matrix of features

In [4]:
# Feature matrix: every column except the last one (Salary), as a NumPy array.
X = dataset.iloc[:, :-1].to_numpy()

In [5]:
# Inspect the feature matrix; Department is still a string column at this point.
print(X)

[[6 61 42 'HR']
 [3 92 28 'HR']
 [12 41 16 'Engineering']
 [14 63 15 'HR']
 [10 83 47 'HR']
 [7 69 44 'Sales']
 [12 77 3 'Engineering']
 [4 41 37 'Sales']
 [6 99 7 'Sales']
 [9 60 21 'Sales']
 [2 72 9 'Sales']
 [6 51 39 'Sales']
 [10 97 18 'Sales']
 [10 61 4 'Sales']
 [7 100 25 'Engineering']
 [4 83 14 'HR']
 [3 64 50 'Sales']
 [7 88 9 'Sales']
 [7 66 26 'Sales']
 [2 98 2 'Sales']]


## Creating the dependent variable vector

In [6]:
# Target vector: the last column (Salary), as a NumPy array.
y = dataset.iloc[:, -1].to_numpy()

In [7]:
# Inspect the target vector of salaries.
print(y)

[70755 78045 71272 74250 92921 80085 84359 56958 80208 74703 64446 71863
 96582 73964 95878 66829 68475 83998 74266 72368]


## Encoding categorical data

### Encoding the independent variable

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
# One-hot encode the Department column (index 3 of the raw feature matrix) and
# pass the three numeric columns through unchanged. The encoded columns are
# placed first in the output, followed by the passthrough columns.
# sparse_threshold=0 makes ColumnTransformer always return a dense array, so
# the np.array(...) wrapper below never receives a SciPy sparse matrix.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [3])],
    remainder='passthrough',
    sparse_threshold=0,
)
# Encode from the raw features in `dataset` rather than from X itself, so that
# re-running this cell does not one-hot encode an already-encoded matrix.
X = np.array(ct.fit_transform(dataset.iloc[:, :-1].values))

In [9]:
# Inspect the encoded matrix: three one-hot Department columns followed by the numeric features.
print(X)

[[0.0 1.0 0.0 6 61 42]
 [0.0 1.0 0.0 3 92 28]
 [1.0 0.0 0.0 12 41 16]
 [0.0 1.0 0.0 14 63 15]
 [0.0 1.0 0.0 10 83 47]
 [0.0 0.0 1.0 7 69 44]
 [1.0 0.0 0.0 12 77 3]
 [0.0 0.0 1.0 4 41 37]
 [0.0 0.0 1.0 6 99 7]
 [0.0 0.0 1.0 9 60 21]
 [0.0 0.0 1.0 2 72 9]
 [0.0 0.0 1.0 6 51 39]
 [0.0 0.0 1.0 10 97 18]
 [0.0 0.0 1.0 10 61 4]
 [1.0 0.0 0.0 7 100 25]
 [0.0 1.0 0.0 4 83 14]
 [0.0 0.0 1.0 3 64 50]
 [0.0 0.0 1.0 7 88 9]
 [0.0 0.0 1.0 7 66 26]
 [0.0 0.0 1.0 2 98 2]]


## Splitting the dataset into training set and test set

In [10]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the remaining rows are used for training.
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Training features (16 of the 20 rows).
print(X_train)

[[1.0 0.0 0.0 12 41 16]
 [1.0 0.0 0.0 7 100 25]
 [0.0 1.0 0.0 10 83 47]
 [0.0 0.0 1.0 7 88 9]
 [0.0 0.0 1.0 4 41 37]
 [0.0 1.0 0.0 3 92 28]
 [0.0 0.0 1.0 10 61 4]
 [0.0 1.0 0.0 6 61 42]
 [0.0 0.0 1.0 2 98 2]
 [0.0 0.0 1.0 7 66 26]
 [0.0 0.0 1.0 9 60 21]
 [0.0 1.0 0.0 4 83 14]
 [0.0 0.0 1.0 6 99 7]
 [0.0 0.0 1.0 10 97 18]
 [0.0 0.0 1.0 6 51 39]
 [0.0 0.0 1.0 7 69 44]]


In [12]:
# Test features (the 4 held-out rows).
print(X_test)

[[0.0 1.0 0.0 14 63 15]
 [0.0 0.0 1.0 3 64 50]
 [1.0 0.0 0.0 12 77 3]
 [0.0 0.0 1.0 2 72 9]]


In [13]:
# Salaries for the training rows.
print(y_train)

[71272 95878 92921 83998 56958 78045 73964 70755 72368 74266 74703 66829
 80208 96582 71863 80085]


In [14]:
# Actual salaries for the held-out test rows.
print(y_test)

[74250 68475 84359 64446]


## Building the model

In [15]:
from sklearn.linear_model import LinearRegression
# Create the linear regression estimator with its default settings; it is fitted in the next section.
mlr = LinearRegression()

## Training the model

In [16]:
# Fit the regression on the training rows only; the test rows stay unseen.
mlr.fit(X=X_train, y=y_train)

## Predicting the test set results

In [17]:
# Predict a salary for each held-out test row.
y_pred = mlr.predict(X=X_test)

In [18]:
# Predicted salaries for the test rows.
print(y_pred)

[82005.91775938 72143.75882702 87320.3211262  60858.00564097]


## Display the predicted and actual test set results

In [19]:
# Limit NumPy's printed floats to two decimals. This is a global setting, so it
# also affects the arrays printed by later cells.
np.set_printoptions(precision=2)
# Two-column view: predicted salary on the left, actual salary on the right.
print(np.column_stack((y_pred, y_test)))

[[82005.92 74250.  ]
 [72143.76 68475.  ]
 [87320.32 84359.  ]
 [60858.01 64446.  ]]


## Making a single prediction

In [20]:
# The row follows the encoded column order: Department one-hot flags
# (Engineering, HR, Sales), then Years_Experience, Skills_Score, Projects_Completed.
# This sample is an Engineering employee with 1 year of experience,
# a skills score of 70 and 4 completed projects.
print(mlr.predict(X=[[1, 0, 0, 1, 70, 4]]))

[57910.31]


## Calculating the values of the coefficients

### Calculating b<sub>0</sub> (y-intercept)

In [21]:
# Intercept (b0) of the fitted model.
print(mlr.intercept_)

14118.342990818339


### Calculating b<sub>i</sub> (slope coefficients)

In [22]:
# Slope coefficients, in the column order of the encoded feature matrix:
# three Department dummies, then Years_Experience, Skills_Score, Projects_Completed.
print(mlr.coef_)

[ 2843.63 -3597.63   753.99  2363.8    532.83   321.58]
