### Import Libraries.

In [1]:
import pandas as pd 
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Load the student salary dataset from a CSV path given relative to the current working directory.
student_df = pd.read_csv('./Datasets/student_salary.csv')

In [4]:
student_df.sample(5)

Unnamed: 0,study_hours,attendance,cgpa,projects,salary
59,10,88,9.42,6,159004
111,3,92,6.51,2,104944
189,7,75,8.07,3,126810
40,9,68,9.3,5,141785
187,10,99,9.67,5,161984


In [5]:
# Positional split of the frame: the first four columns become the feature
# matrix X, and the last column becomes the target vector Y.
X = student_df.iloc[ : , : 4]
Y = student_df.iloc[ : , -1]

In [6]:
X

Unnamed: 0,study_hours,attendance,cgpa,projects
0,7,81,8.41,4
1,4,70,7.29,3
2,8,75,8.47,5
3,5,92,7.49,2
4,7,68,7.49,4
...,...,...,...,...
195,8,71,8.83,4
196,5,85,7.37,2
197,4,75,6.90,3
198,2,96,6.27,2


In [7]:
Y

0      138285
1      106432
2      144662
3      110521
4      126959
        ...  
195    130469
196    116336
197    109790
198     99444
199    130070
Name: salary, Length: 200, dtype: int64

### Using scikit-learn's built-in `LinearRegression` class.

In [8]:
# Hold out 20% of the rows as the test set; the fixed random_state keeps the
# split identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 3)

In [9]:
mlr = LinearRegression()

In [10]:
mlr.fit(X_train, y_train)

In [11]:
y_pred = mlr.predict(X_test)

In [12]:
mae = mean_absolute_error(y_test, y_pred)
mae

3336.502075712303

In [13]:
mse = mean_squared_error(y_test, y_pred)
mse

18827969.056655265

In [14]:
rmse = np.sqrt(mse)
rmse

np.float64(4339.120770001137)

In [15]:
r2_value = r2_score(y_test, y_pred)
r2_value

0.9725092166540754

In [16]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number
# of test samples and p the number of predictors. Both are read from X_test
# instead of hard-coding p, so the cell stays correct if features change.
n_samples, n_features = X_test.shape
1 - ((1 - r2_value) * (n_samples - 1) / (n_samples - n_features - 1))

0.9693674128431126

In [17]:
# Coefficients of the features and the intercept (beta naught) of the fitted model.
print(mlr.coef_)
print(mlr.intercept_)

[2299.1654811   249.72778346 8749.95623455 5091.56255817]
4356.456150001337


### Multiple Linear Regression from Scratch.

In [18]:

# creating a class of Multiple Linear Regression.
class MyMultipleLinearRegression:
    """Ordinary least-squares multiple linear regression implemented with NumPy.

    After ``fit`` the learned parameters are available as:

    * ``cofficients`` -- one weight per feature column (spelling kept for
      compatibility with existing callers).
    * ``intercept``   -- the bias term (beta naught).

    Both are ``None`` until the model has been fitted.
    """

    def __init__(self):
        # These must live on the instance; plain local assignments here would
        # leave the attributes undefined until fit() is called.
        self.cofficients = None
        self.intercept = None

    def fit(self, X_train, y_train):
        """Estimate the intercept and coefficients by least squares.

        Parameters
        ----------
        X_train : array-like of shape (n_samples, n_features)
            Feature matrix (NumPy array, DataFrame, or nested lists).
        y_train : array-like of shape (n_samples,)
            Target values.

        Raises
        ------
        ValueError
            If ``X_train`` is not 2-D or the sample counts disagree.
        """
        print('Hey, here you train the model')

        features = np.asarray(X_train, dtype=float)
        targets = np.asarray(y_train, dtype=float)

        if features.ndim != 2:
            raise ValueError(
                f"X_train must be 2-D (n_samples, n_features); got {features.ndim}-D input"
            )
        if features.shape[0] != targets.shape[0]:
            raise ValueError(
                f"X_train has {features.shape[0]} samples but y_train has {targets.shape[0]}"
            )

        # Prepend a column of ones so the first fitted parameter is the intercept.
        design = np.column_stack([np.ones(features.shape[0]), features])

        # Solve min ||design @ beta - y|| directly instead of forming
        # inv(X^T X): lstsq is numerically more stable and still returns a
        # (minimum-norm) solution when features are collinear, where the
        # explicit inverse would fail or be dominated by round-off.
        betas, *_ = np.linalg.lstsq(design, targets, rcond=None)

        self.cofficients = betas[1:]
        self.intercept = betas[0]

    def predict(self, X_test):
        """Return predictions ``X_test @ cofficients + intercept``.

        Parameters
        ----------
        X_test : array-like of shape (n_samples, n_features)
            Samples to predict; must have the same feature order used in ``fit``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.cofficients is None or self.intercept is None:
            raise RuntimeError("Model is not fitted yet; call fit() before predict().")

        samples = np.asarray(X_test, dtype=float)
        return np.dot(samples, self.cofficients) + self.intercept

In [19]:
# here create an object of the class. 
my_lr = MyMultipleLinearRegression()

In [20]:
# training. 
my_lr.fit(X_train, y_train)

Hey, here you train the model


In [21]:
# testing.
# Store the from-scratch model's predictions in `y_pred` so that the metric
# cells that follow score this model rather than reusing the predictions
# left over from the scikit-learn model.
y_pred = my_lr.predict(X_test)
y_pred

array([148862.84052762,  81567.00503223, 113655.97502759, 107932.08752708,
        95225.61095331, 146718.31621695,  78268.09341859,  94902.54297133,
       140618.8651467 ,  86103.37637193, 125057.54708416,  80628.68483924,
        82385.75670172, 109196.76092497, 142478.10423776,  78851.25502252,
       164146.48393446, 114547.53694726,  99137.04114929, 130106.97989305,
       124567.04538224, 130359.13625483, 143617.49531092,  93174.80830865,
        74139.02604621, 140489.78929663, 132112.39107873,  93457.68846316,
        79917.28425123,  97144.9110369 ,  86150.66646089, 109098.38722172,
       162529.36206296, 127362.40285327,  95069.93153211, 111749.44771487,
       123335.52622271,  85902.83544009, 137440.07551811,  78845.54122128])

In [22]:
mae = mean_absolute_error(y_test, y_pred)
mae

3336.502075712303

In [23]:
mse = mean_squared_error(y_test, y_pred)
mse

18827969.056655265

In [24]:
rmse = np.sqrt(mse)
rmse

np.float64(4339.120770001137)

In [25]:
r2_value = r2_score(y_test, y_pred)
r2_value

0.9725092166540754

In [26]:
# Adjusted R^2 = 1 - (1 - R^2) * (n - 1) / (n - p - 1), where n is the number
# of test samples and p the number of predictors. Both are read from X_test
# instead of hard-coding p, so the cell stays correct if features change.
n_samples, n_features = X_test.shape
1 - ((1 - r2_value) * (n_samples - 1) / (n_samples - n_features - 1))

0.9693674128431126

In [27]:
print(my_lr.cofficients)
print(my_lr.intercept)

[2299.1654811   249.72778346 8749.95623455 5091.56255817]
4356.456150064287
