### Multiple Linear Regression

1) n input features<br>
2) 1 target variable/feature

3) Equation => 
 $  y = m1x1 + m2x2+ m3x3 +...+ mnxn + c $
 <br>
4) where xi = input features/ independent variables<br>
5) y = dependent variable/ target/label<br>
6) mi = slope/ coefficient<br>
7) c = bias/intercept<br>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
df = pd.read_csv('C:/Users/50_Startups.csv')
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


In [4]:
df.shape

(50, 5)

In [5]:
df.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State               object
Profit             float64
dtype: object

In [6]:
df['State'].value_counts()

California    17
New York      17
Florida       16
Name: State, dtype: int64

In [7]:
df.isnull().sum()

R&D Spend          0
Administration     0
Marketing Spend    0
State              0
Profit             0
dtype: int64

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   R&D Spend        50 non-null     float64
 1   Administration   50 non-null     float64
 2   Marketing Spend  50 non-null     float64
 3   State            50 non-null     object 
 4   Profit           50 non-null     float64
dtypes: float64(4), object(1)
memory usage: 2.1+ KB


In [9]:
# correlation
df.corr()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit
R&D Spend,1.0,0.241955,0.724248,0.9729
Administration,0.241955,1.0,-0.032154,0.200717
Marketing Spend,0.724248,-0.032154,1.0,0.747766
Profit,0.9729,0.200717,0.747766,1.0


In [10]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()

In [11]:
df['State'] = lb.fit_transform(df['State'])
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,2,192261.83
1,162597.7,151377.59,443898.53,0,191792.06
2,153441.51,101145.55,407934.54,1,191050.39
3,144372.41,118671.85,383199.62,2,182901.99
4,142107.34,91391.77,366168.42,1,166187.94


In [12]:
df.dtypes

R&D Spend          float64
Administration     float64
Marketing Spend    float64
State                int32
Profit             float64
dtype: object

In [13]:
x = df.iloc[:,:-1]  # x = df[['R&D Spend','Administration','Marketing Spend','State']]
y = df.iloc[:,-1]   # y = df['Profit']

In [14]:
print(x.shape)
print(y.shape)
print(type(x))
print(type(y))

(50, 4)
(50,)
<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


In [15]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.20)

In [16]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(40, 4)
(10, 4)
(40,)
(10,)


In [17]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression()

In [18]:
reg.fit(x_train,y_train)

LinearRegression()

In [19]:
reg.score(x_train,y_train)

0.9479557026399661

In [20]:
reg.score(x_test,y_test)

0.9577951623607377

In [21]:
y_pred = reg.predict(x_test)
y_pred

array([ 46254.25268358, 132235.09410576, 157904.00639336,  98852.93905086,
       148090.89195694, 100097.94791295, 114916.0080689 , 102207.6622386 ,
        57505.10562846, 161203.61458638])

In [22]:
from sklearn.metrics import r2_score
print(r2_score(y_test,y_pred))

0.9577951623607377


In [23]:
reg.coef_

array([ 8.03126138e-01, -2.31917193e-02,  2.98282630e-02,  8.77158294e+02])

In [24]:
reg.intercept_

49395.0357992999

In [25]:
df_res = pd.DataFrame({'y_test':y_test,'y_pred':y_pred})
df_res.head(8)

Unnamed: 0,y_test,y_pred
47,42559.73,46254.252684
17,125370.37,132235.094106
6,156122.51,157904.006393
33,96778.92,98852.939051
15,129917.04,148090.891957
30,99937.59,100097.947913
24,108552.04,114916.008069
25,107404.34,102207.662239


In [26]:
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse
mabe = mae(y_test,y_pred) 
msqe = mse(y_test,y_pred)
rmse = np.sqrt(mse(y_test,y_pred))
print(f'MSE is {msqe}')
print(f'MAE is {mabe}')
print(f'RMSE is {rmse}')

MSE is 56001283.44418235
MAE is 5777.498814858655
RMSE is 7483.400526778074


In [27]:
df.columns

Index(['R&D Spend', 'Administration', 'Marketing Spend', 'State', 'Profit'], dtype='object')

In [28]:
df['Profit_pred_eqn'] = df['R&D Spend']* reg.coef_[0] + df['Administration']*reg.coef_[1] + df['Marketing Spend']*reg.coef_[2] +  df['State']*reg.coef_[3] + reg.intercept_
df.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit,Profit_pred_eqn
0,165349.2,136897.8,471784.1,2,192261.83,194843.221756
1,162597.7,151377.59,443898.53,0,191792.06,189711.514257
2,153441.51,101145.55,407934.54,1,191050.39,183327.321052
3,144372.41,118671.85,383199.62,2,182901.99,175776.583354
4,142107.34,91391.77,366168.42,1,166187.94,173204.948982


In [29]:
df['State'].value_counts()

2    17
0    17
1    16
Name: State, dtype: int64