Multiple linear regression : one output row vs multiple input row
hypothesis : y_hat = m1x1+m2x2+m3x3+...+mnxn+b

In [1]:
import matplotlib.pyplot as plt
import pandas as pd
dataset = pd.read_csv('placement_data_MLR.csv')
dataset.salary = dataset.salary.fillna(0)
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,No,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,Yes,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,No,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,No,66.0,Mkt&HR,59.43,Not Placed,0.0
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,No,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,No,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,No,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,Yes,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,No,70.0,Mkt&HR,60.23,Placed,204000.0


Changed default string entries into integer encoding, beacause there is NAN in the salary column of the dataset for those students who are not placed. Our model will not be able to configure that NAN type of data for prediction. So, we convert such type of data into some integer encoding. Like salary == salary for those students who are placed, but salary = 0 for those students who are not placed. Similary we replaced work experience from default 'Yes' 'No' to (1,0).

In [2]:
dataset['workex'].value_counts() # returns the scenario of what kind of dataset we have in workex column (run the code separately to see the results)
encoded_workex = {'workex':{'Yes':1,'No':0}}
dataset = dataset.replace(encoded_workex)
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,0,55.0,Mkt&HR,58.80,Placed,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,1,86.5,Mkt&Fin,66.28,Placed,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,0,75.0,Mkt&Fin,57.80,Placed,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,0,66.0,Mkt&HR,59.43,Not Placed,0.0
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,0,96.8,Mkt&Fin,55.50,Placed,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,0,91.0,Mkt&Fin,74.49,Placed,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,0,74.0,Mkt&Fin,53.62,Placed,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,1,59.0,Mkt&Fin,69.72,Placed,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,0,70.0,Mkt&HR,60.23,Placed,204000.0


In [3]:
dataset['status'].value_counts()
encoded_degree_t = {'status':{'Placed':1,'Not Placed':0}}
dataset = dataset.replace(encoded_degree_t)
dataset

Unnamed: 0,sl_no,gender,ssc_p,ssc_b,hsc_p,hsc_b,hsc_s,degree_p,degree_t,workex,etest_p,specialisation,mba_p,status,salary
0,1,M,67.00,Others,91.00,Others,Commerce,58.00,Sci&Tech,0,55.0,Mkt&HR,58.80,1,270000.0
1,2,M,79.33,Central,78.33,Others,Science,77.48,Sci&Tech,1,86.5,Mkt&Fin,66.28,1,200000.0
2,3,M,65.00,Central,68.00,Central,Arts,64.00,Comm&Mgmt,0,75.0,Mkt&Fin,57.80,1,250000.0
3,4,M,56.00,Central,52.00,Central,Science,52.00,Sci&Tech,0,66.0,Mkt&HR,59.43,0,0.0
4,5,M,85.80,Central,73.60,Central,Commerce,73.30,Comm&Mgmt,0,96.8,Mkt&Fin,55.50,1,425000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
210,211,M,80.60,Others,82.00,Others,Commerce,77.60,Comm&Mgmt,0,91.0,Mkt&Fin,74.49,1,400000.0
211,212,M,58.00,Others,60.00,Others,Science,72.00,Sci&Tech,0,74.0,Mkt&Fin,53.62,1,275000.0
212,213,M,67.00,Others,67.00,Others,Commerce,73.00,Comm&Mgmt,1,59.0,Mkt&Fin,69.72,1,295000.0
213,214,F,74.00,Others,66.00,Others,Commerce,58.00,Comm&Mgmt,0,70.0,Mkt&HR,60.23,1,204000.0


In [4]:
# dropping some columns which seems redundant for our prediction
dataset = dataset.drop(['hsc_b','ssc_b','gender','hsc_s','degree_t','specialisation'],axis=1) # dropping the columns which seems irrelevant
dataset

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,workex,etest_p,mba_p,status,salary
0,1,67.00,91.00,58.00,0,55.0,58.80,1,270000.0
1,2,79.33,78.33,77.48,1,86.5,66.28,1,200000.0
2,3,65.00,68.00,64.00,0,75.0,57.80,1,250000.0
3,4,56.00,52.00,52.00,0,66.0,59.43,0,0.0
4,5,85.80,73.60,73.30,0,96.8,55.50,1,425000.0
...,...,...,...,...,...,...,...,...,...
210,211,80.60,82.00,77.60,0,91.0,74.49,1,400000.0
211,212,58.00,60.00,72.00,0,74.0,53.62,1,275000.0
212,213,67.00,67.00,73.00,1,59.0,69.72,1,295000.0
213,214,74.00,66.00,58.00,0,70.0,60.23,1,204000.0


In [5]:
x = dataset.iloc[:,:-1] # all the independent inputs
y = dataset.iloc[:,-1] # salary

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2) # 20% of the data is reserved for testing and rest 80% of the data is used to train the model.
MLR = LinearRegression()
MLR.fit(x_train,y_train) # trains the model
y_pred = MLR.predict(x_test)
y_pred

array([261106.19858296, 312763.21365664, 307198.79320989,  10861.33267885,
       323619.5769725 , 275893.61509439, 274459.31263393, 286610.74899105,
       283507.40666979, -45428.1813024 ,  -7404.63818925, 339196.22779315,
       247345.01274164, 326415.16804959, 325886.48010141, 309939.98845897,
       -11200.14582014, 306996.17412297, 277440.18061654, 340297.2604139 ,
       -12282.0816941 ,   9613.78397459, 290553.14091333, 278758.10866987,
        31246.6392408 ,  19316.67445185,  10051.88398531,  37321.6000186 ,
        -8234.13865699,  30307.78674757, 275904.10006339, -16402.53184229,
       286471.29399698, 326631.85715948, 362262.8685235 , 329122.13190642,
       358860.16349245, 244356.57615017,  28913.63961803, 288053.57909216,
         1542.73047185,   7277.68376082, -17924.27055532])

In [7]:
from sklearn.metrics import r2_score
r2_score = r2_score(y_test,y_pred)
r2_score

0.8457667966459432

In [8]:
x_sample = x_test.iloc[1:2,:]
x_sample

Unnamed: 0,sl_no,ssc_p,hsc_p,degree_p,workex,etest_p,mba_p,status
10,11,58.0,61.0,60.0,1,62.0,60.85,1


In [9]:
y_sample = y_test.iloc[1:2]
y_sample

10    260000.0
Name: salary, dtype: float64

In [10]:
MLR.predict(x_sample)

array([312763.21365664])