In [1]:
import pandas as pd

In [2]:
dataset = pd.read_csv('hiring.csv')

In [3]:
dataset

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000
5,three,7.0,10,62000
6,ten,,7,72000
7,eleven,7.0,8,80000


In [4]:
dataset.dtypes

experience          object
test_score         float64
interview_score      int64
salary               int64
dtype: object

In [5]:
dataset.isnull().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [6]:
#convert categorical column into Numerical using Label Encoding
colname = ['experience'] # list having only categorical column

from sklearn.preprocessing import LabelEncoder # import LabelEncoder function from preprocessing sublibrary
le=LabelEncoder()                              # save LabelEncoder function in a variable le
for x in colname:                            
    dataset[x]=le.fit_transform(dataset[x]) # it assigns numbers to all values of categorical column

In [7]:
# handling null values
dataset['experience'].fillna(0, inplace=True) # fill null values with 0 
dataset['test_score'].fillna(dataset['test_score'].mean(), inplace=True) # fill null value with mean

In [8]:
dataset.isnull().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

In [9]:
dataset

Unnamed: 0,experience,test_score,interview_score,salary
0,6,8.0,9,50000
1,6,8.0,6,45000
2,1,6.0,7,60000
3,5,10.0,10,65000
4,2,9.0,6,70000
5,4,7.0,10,62000
6,3,7.857143,7,72000
7,0,7.0,8,80000


In [10]:
# define X and Y
X = dataset.iloc[:, :3]
Y = dataset.iloc[:,3]

In [11]:
X

Unnamed: 0,experience,test_score,interview_score
0,6,8.0,9
1,6,8.0,6
2,1,6.0,7
3,5,10.0,10
4,2,9.0,6
5,4,7.0,10
6,3,7.857143,7
7,0,7.0,8


In [12]:
Y

0    50000
1    45000
2    60000
3    65000
4    70000
5    62000
6    72000
7    80000
Name: salary, dtype: int64

In [13]:
#split X and Y into 2 parts to get training and testing data
from sklearn.model_selection import train_test_split # import train_test_split func from model_selection sublib.
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=10) # training and testing data (model can randomly take 10 rows at a time)

In [14]:
X_train

Unnamed: 0,experience,test_score,interview_score
6,3,7.857143,7
7,0,7.0,8
0,6,8.0,9
4,2,9.0,6
5,4,7.0,10
1,6,8.0,6


In [15]:
X_test

Unnamed: 0,experience,test_score,interview_score
2,1,6.0,7
3,5,10.0,10


In [16]:
print(X_train.shape)
print(X_test.shape)
print(Y_train.shape)
print(Y_test.shape)

(6, 3)
(2, 3)
(6,)
(2,)


In [17]:
#Build a linear regression model
from sklearn.linear_model import LinearRegression #import LinearRegression function from linear_model sublib.
lm = LinearRegression()  # save LinearRegression func in lm var.
lm.fit(X_train,Y_train)  # fit function use to train the model

LinearRegression()

In [18]:
print(lm.coef_) # it will return β1, β2,β3

[-5707.35524257  1354.7209181   1161.71100678]


In [19]:
print(lm.intercept_) # it will return β0 (intercept on Y-axis)

63656.23369848727


In [20]:
Y_pred = lm.predict(X_test) # predict y for testing data

In [21]:
Y_pred

array([74209.181012  , 60283.77673448])

In [22]:
lm.score(X_train,Y_train)  # score of the model when we train the data

0.936380419126448

In [23]:
# Saving model to disk
import pickle  #import pickle library
pickle.dump(lm, open('model.pkl','wb'))  # create the pickle file named as model.pkl

In [24]:
# Loading model to compare the results
model_linear = pickle.load(open('model.pkl','rb'))

In [25]:
model_linear

LinearRegression()

In [26]:
print(model_linear.predict([[2, 9, 6]]))  # 2D array

[71404.27751695]


