**MULTIPLE LINEAR REGRESSION**

Yi = B0 + B1Xi1 + B2Xi2 .... BJXiJ + BpXip + Ei

E(ei2) = E((yi - Yİ)**2)

*B = (X^T * X)^-1 * X^T*Y

In [17]:
import pandas as pd
import numpy as np

df = pd.read_csv("/content/Advertising.csv")

X = df.drop(columns= ["sales","Unnamed: 0"],axis=1)
y = df[["sales"]]

y.head()
X.head()

Unnamed: 0,TV,radio,newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [2]:
X.head()

Unnamed: 0.1,Unnamed: 0,TV,radio,newspaper
0,1,230.1,37.8,69.2
1,2,44.5,39.3,45.1
2,3,17.2,45.9,69.3
3,4,151.5,41.3,58.5
4,5,180.8,10.8,58.4


In [3]:
# Creating a model and Inspecting Using statmodels
import statsmodels.api as sm

lm = sm.OLS(y, X)
model = lm.fit()
model.summary()

0,1,2,3
Dep. Variable:,sales,R-squared (uncentered):,0.984
Model:,OLS,Adj. R-squared (uncentered):,0.984
Method:,Least Squares,F-statistic:,2986.0
Date:,"Sun, 14 May 2023",Prob (F-statistic):,2.42e-174
Time:,16:04:07,Log-Likelihood:,-412.23
No. Observations:,200,AIC:,832.5
Df Residuals:,196,BIC:,845.6
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Unnamed: 0,0.0092,0.002,4.846,0.000,0.005,0.013
TV,0.0507,0.001,36.092,0.000,0.048,0.054
radio,0.2125,0.009,23.240,0.000,0.194,0.231
newspaper,0.0137,0.006,2.153,0.033,0.001,0.026

0,1,2,3
Omnibus:,18.921,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,28.622
Skew:,-0.56,Prob(JB):,6.09e-07
Kurtosis:,4.476,Cond. No.,14.6


In [None]:
# F stat : model's significance

In [18]:
#  Scikit learns model creation

from sklearn.linear_model import LinearRegression

lm = LinearRegression()
model =lm.fit(X, y)
print(f"B0: {model.intercept_}")
print(f"B1,2,3 values: {model.coef_}")

B0: [2.93888937]
B1,2,3 values: [[ 0.04576465  0.18853002 -0.00103749]]


#Prediction

 Sales = 2.94 + TV * 0.19 - NEWSPAPAER * 0.001

By hand:

30 TV, 10 RADIO, 40 NEWSPAPER

 2.94 + 30 * 0.04 + 10 * 0.19 - 40 * 0.001 = 5.9999

In [22]:
import pandas as pd

new_data = [[30],[10],[40]]

new_df=pd.DataFrame(new_data).T
new_df

Unnamed: 0,0,1,2
0,30,10,40


In [23]:
model.predict(new_df)



array([[6.15562918]])

In [25]:
# BY HAND :  5.9999
# BY SKLEARN : 6.15562918

In [29]:
from sklearn.metrics import mean_squared_error
import numpy as np


mse=mean_squared_error(y,model.predict(X))
mse

2.784126314510936

In [30]:
rmse = np.sqrt(mse)
rmse

1.6685701407225697

**MODEL TUNING**

 - HoldOut Method

In [32]:


from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.20,random_state=42)

X_train.head()

Unnamed: 0,TV,radio,newspaper
79,116.0,7.7,23.1
197,177.0,9.3,6.4
38,43.1,26.7,35.1
24,62.3,12.6,18.3
122,224.0,2.4,15.6


In [33]:
y_train.head()

Unnamed: 0,sales
79,11.0
197,12.8
38,10.1
24,9.7
122,11.6


In [36]:
lm = LinearRegression()

lm.fit(X_train,y_train)
predicted = model.predict(X_train)  
mse = mean_squared_error(y_train,predicted) # (real,predicted)

# Train errors
print(mse)
print(np.sqrt(mse))

2.719174755223224
1.6489920421952389


In [37]:
# Test errors

predicted = model.predict(X_test)  
mse = mean_squared_error(y_test,predicted) # (real,predicted)

print(mse)
print(np.sqrt(mse))

3.0439325516617846
1.7446869494731094


- K-FOLD CROSS VALIDATION

In [39]:
from sklearn.model_selection import cross_val_score

cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error")


array([-5.57303426, -2.86235681, -2.06504684, -1.09186983, -1.66159795,
       -2.50694042, -2.92821679, -2.01207197, -7.2250041 , -1.66156243])

In [43]:
cross_validated_mse=np.mean(-cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error"))
cross_validated_mse

-2.9587701386797574

In [44]:
cross_validated_rmse=np.sqrt(np.mean(-cross_val_score(model, X_train, y_train, cv=10, scoring="neg_mean_squared_error")))
cross_validated_rmse

1.7201075950880973