In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the hiring dataset from a CSV file in the working directory.
df = pd.read_csv('hiring.csv')

# Preview the first rows to see the available columns and spot missing values.
df.head()

Unnamed: 0,experience,test_score,interview_score,salary
0,,8.0,9,50000
1,,8.0,6,45000
2,five,6.0,7,60000
3,two,10.0,10,65000
4,seven,9.0,6,70000


In [3]:
# Inspect column dtypes and non-null counts; 'experience' is read as object (text)
# and must be converted before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   experience       6 non-null      object 
 1   test_score       7 non-null      float64
 2   interview_score  8 non-null      int64  
 3   salary           8 non-null      int64  
dtypes: float64(1), int64(2), object(1)
memory usage: 384.0+ bytes


In [4]:
# Count the missing values in each column.
df.isna().sum()

experience         2
test_score         1
interview_score    0
salary             0
dtype: int64

In [29]:
# Treat a missing 'experience' as zero years. The fill value is the integer 0
# (not the string '0') because conv() looks the value up under the key 0; a
# string fill makes the conversion cell fail with KeyError on a fresh run.
# Assigning the result back avoids fillna(inplace=True) on a column selection,
# which pandas deprecates as chained assignment.
df['experience'] = df['experience'].fillna(0)

# Impute the missing test score with the mean of the observed scores.
df['test_score'] = df['test_score'].fillna(df['test_score'].mean())

In [6]:
# Re-check after imputation: every column should now report zero missing values.
df.isna().sum()

experience         0
test_score         0
interview_score    0
salary             0
dtype: int64

No more missing values.

The values in the 'experience' column are text (object dtype), so they must be converted to numbers before modelling.

In [39]:
# Converting values in 'experience' column into numerical.

# Lookup table from spelled-out numbers to integers. It is built once, at
# definition time, rather than on every call, and it is not named `dict`,
# which would shadow the built-in type.
_WORD_TO_NUMBER = {
    'zero': 0, 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
    'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
    'eleven': 11, 'twelve': 12,
}


def conv(x):
    """Convert one 'experience' value to an integer number of years.

    Parameters
    ----------
    x : str or int
        A spelled-out number such as 'five' (case and surrounding whitespace
        are ignored), a digit string such as '0', or an integer. Integers are
        returned unchanged, so applying the conversion to an already
        converted column is harmless.

    Returns
    -------
    int
        The numeric value of `x`.

    Raises
    ------
    KeyError
        If `x` is not a recognised value (for example an unknown word or NaN).
    """
    if isinstance(x, str):
        key = x.strip().lower()
        if key in _WORD_TO_NUMBER:
            return _WORD_TO_NUMBER[key]
        # Accept placeholder fills such as '0' that are digits stored as text.
        if key.isdecimal():
            return int(key)
        raise KeyError(x)

    # Plain Python ints and NumPy integer scalars pass straight through.
    if isinstance(x, (int, np.integer)):
        return int(x)

    raise KeyError(x)

In [40]:
# Replace each 'experience' entry with its integer equivalent by passing the
# converter directly to apply().
df['experience'] = df['experience'].apply(conv)

df['experience']

0     0
1     0
2     5
3     2
4     7
5     3
6    10
7    11
Name: experience, dtype: int64

In [45]:
# Confirm that every column now has a numeric dtype.
df.dtypes

experience           int64
test_score         float64
interview_score      int64
salary               int64
dtype: object

All data is converted into numerical format.

In [41]:
# Build the feature matrix X (the independent variables): every column except
# the target 'salary'. The target is dropped by name rather than by position,
# so the selection stays correct if the column order changes and matches how
# the target itself is selected (df['salary']).
X = df.drop(columns='salary')
X

Unnamed: 0,experience,test_score,interview_score
0,0,8.0,9
1,0,8.0,6
2,5,6.0,7
3,2,10.0,10
4,7,9.0,6
5,3,7.0,10
6,10,7.857143,7
7,11,7.0,8


In [72]:
# Target vector y (the dependent variable): the salary to be predicted.
y = df['salary']
y.head()

0    50000
1    45000
2    60000
3    65000
4    70000
Name: salary, dtype: int64

In [68]:
from sklearn.model_selection import train_test_split

In [73]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [74]:
# Display the training features selected by the split.
X_train

Unnamed: 0,experience,test_score,interview_score
1,0,8.0,6
6,10,7.857143,7
0,0,8.0,9
4,7,9.0,6
3,2,10.0,10
5,3,7.0,10


In [75]:
# Display the training targets; the index matches the rows of X_train.
y_train

1    45000
6    72000
0    50000
4    70000
3    65000
5    62000
Name: salary, dtype: int64

In [76]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares linear regression with scikit-learn's default settings.
lr = LinearRegression()

In [77]:
# Fit the regression on the training split only.
lr.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [78]:
# Predict salaries for the held-out rows.
y_pred = lr.predict(X_test)

In [79]:
# Display the predicted salaries for the held-out rows.
y_pred

array([77084.63115158, 55577.80011431])

In [80]:
from sklearn.metrics import r2_score

In [81]:
# Coefficient of determination (R^2) on the held-out rows.
# NOTE(review): y_pred holds only two predictions, so this score is computed
# from two samples and should not be read as a reliable estimate of model quality.
r2_score(y_test, y_pred)

0.8597238632433393

In [82]:
# Predict the salary for one new candidate. The input is a DataFrame carrying
# the same column names the model was fitted with, so each value is explicitly
# tied to its feature instead of relying on list order, and scikit-learn does
# not warn about missing feature names.
new_candidate = pd.DataFrame(
    [[6, 10, 7.8]],
    columns=['experience', 'test_score', 'interview_score'],
)
lr.predict(new_candidate)

array([71398.59752853])

In [55]:
# Display the full cleaned dataset (small enough to show in its entirety).
df

Unnamed: 0,experience,test_score,interview_score,salary
0,0,8.0,9,50000
1,0,8.0,6,45000
2,5,6.0,7,60000
3,2,10.0,10,65000
4,7,9.0,6,70000
5,3,7.0,10,62000
6,10,7.857143,7,72000
7,11,7.0,8,80000


__Model Deployment__

In [83]:
import pickle

In [84]:
# Serialize the fitted model to disk. The context manager ensures the file
# handle is flushed and closed even if pickling raises; the original left the
# handle returned by open() unclosed.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(lr, model_file)