## Simple Linear Regression -> High Bias Low Variance Model

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from pandas_profiling import ProfileReport

In [3]:
# Load the advertising dataset (columns: TV, Radio, Newspaper, Sales) from a hosted CSV.
# Requires network access when the notebook is re-run.
dataframe = pd.read_csv("https://raw.githubusercontent.com/naveenmnav/MachineLearning/main/Basic/Advertising/advertising.csv")

In [4]:
# Preview the first five rows to sanity-check the load.
dataframe.head()

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9


In [4]:
# Column dtypes and non-null counts: a quick check for missing values.
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   TV         200 non-null    float64
 1   Radio      200 non-null    float64
 2   Newspaper  200 non-null    float64
 3   Sales      200 non-null    float64
dtypes: float64(4)
memory usage: 6.4 KB


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
dataframe.describe()

Unnamed: 0,TV,Radio,Newspaper,Sales
count,200.0,200.0,200.0,200.0
mean,147.0425,23.264,30.554,15.1305
std,85.854236,14.846809,21.778621,5.283892
min,0.7,0.0,0.3,1.6
25%,74.375,9.975,12.75,11.0
50%,149.75,22.9,25.75,16.0
75%,218.825,36.525,45.1,19.05
max,296.4,49.6,114.0,27.0


In [6]:
# Build a pandas-profiling report object for exploratory analysis of the whole frame.
report = ProfileReport(dataframe)

In [7]:
# Render the profiling report as interactive widgets inside the notebook.
report.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## To check after Profiling

1. Correlation of features with the target
2. There should not be multicollinearity (a strong relationship between two independent features)
3. Look for the missing values

In [8]:
# Export the profiling report to an HTML file (relative path, so it lands in the working directory).
report.to_file('SimpleLinearRegression_Profiling.html')

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [9]:
# Feature matrix for simple linear regression: TV only.
# The double brackets keep X as a one-column DataFrame (2-D) rather than a Series.
X = dataframe[['TV']]

In [10]:
# Display the feature frame (pandas truncates the middle rows).
X

Unnamed: 0,TV
0,230.1
1,44.5
2,17.2
3,151.5
4,180.8
...,...
195,38.2
196,94.2
197,177.0
198,283.6


In [11]:
# Target vector: select Sales by name rather than by position, so the
# selection does not silently change if columns are reordered or appended.
y = dataframe['Sales']

In [12]:
# Display the target Series (Sales).
y

0      22.1
1      10.4
2      12.0
3      16.5
4      17.9
       ... 
195     7.6
196    14.0
197    14.8
198    25.5
199    18.4
Name: Sales, Length: 200, dtype: float64

In [13]:
from sklearn.linear_model import LinearRegression

In [14]:
# Unfitted ordinary least squares regressor with default settings.
model1 = LinearRegression()

In [15]:
# Fit the model on X (TV) and y (Sales). fit() is the standard training method;
# it estimates the best slope m and intercept c for y = mx + c.
model1.fit(X,y)

LinearRegression()

In [16]:
# Fitted line: y = mx + c, where m is the slope and c is the intercept.

# sklearn exposes c as intercept_ and m as coef_ (one entry per feature).

model1.intercept_

6.974821488229891

In [17]:
# Slope m: an array with a single entry because the model was fitted on one feature (TV).
model1.coef_

array([0.05546477])

In [18]:
# Fitted equation: Sales = 0.05546477 * TV + 6.974821488229891

In [19]:
file1 = 'lr-sample.sav'  # Output path for the pickled (binary-serialized) model

In [20]:
# Persist the fitted model to disk. The file holds only the learned parameters
# of y = mx + c, not the training data, so its size depends on the number of
# features rather than on the number of training rows.
# The context manager guarantees the file is flushed and closed even if dump() raises.
with open(file1, 'wb') as model_file:
    pickle.dump(model1, model_file)

In [21]:
# Predict by passing new feature values to model.predict().
# The input must be 2-D with one column per trained feature; this model was
# trained on a single feature, so each row holds one value. Here we pass a TV
# ad spend of 10000.
# A DataFrame with the training column name ('TV') is used instead of a bare
# list so sklearn does not warn "X does not have valid feature names".
tv_sales_1 = model1.predict(pd.DataFrame({'TV': [10000]}))

  "X does not have valid feature names, but"


In [22]:
tv_sales_1

# The prediction is ~561.62, the value of y (Sales) for x = 10000 spent on TV ads.
# It is obtained by evaluating y = 0.05546477x + 6.974821488229891 at x = 10000.
# NOTE(review): 10000 is far outside the TV range seen in training (max 296.4), so this is an extrapolation.

array([561.62252618])

In [23]:
# Predicting for multiple values.
# All inputs go through a single predict() call on a DataFrame that carries the
# training column name ('TV'); this avoids one sklearn feature-name warning per
# value and the overhead of repeated calls.

l1 = [100,20,34,24,555,100000]

batch_predictions = model1.predict(pd.DataFrame({'TV': l1}))

# reshape(-1, 1) yields one single-element array per input, so each line prints
# in the same "[value]" form as an individual predict() call would.
for single_prediction in batch_predictions.reshape(-1, 1):
    print(single_prediction)
    


[12.52129854]
[8.0841169]
[8.86062368]
[8.30597598]
[37.7577691]
[5553.45186844]


  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"
  "X does not have valid feature names, but"


In [24]:
## loading model

# Reload the pickled model from disk; the context manager closes the file handle.
# NOTE: pickle.load can execute arbitrary code, so only load files you created or trust.
with open(file1, 'rb') as model_file:
    saved_model = pickle.load(model_file)

In [25]:
# Predict with the reloaded model. Passing a DataFrame with the training column
# name ('TV') avoids sklearn's "X does not have valid feature names" warning.
saved_model.predict(pd.DataFrame({'TV': [100000]}))

  "X does not have valid feature names, but"


array([5553.45186844])

In [26]:
### Accuracy -> score() returns the R2 (coefficient of determination) value.
# Ideally this is evaluated on held-out data (X_test, y_test) and compared with
# the score on the training data (X_train, y_train).
# NOTE: X and y here are the same data the model was fitted on, so this is a training R2.
saved_model.score(X,y)

0.8121757029987414

## Multiple Linear Regression

In [28]:
# Feature matrix for multiple linear regression: all three advertising channels.
feature_columns = ['TV','Radio','Newspaper']
data = dataframe.loc[:, feature_columns]

In [29]:
# Preview the three feature columns.
data.head()

Unnamed: 0,TV,Radio,Newspaper
0,230.1,37.8,69.2
1,44.5,39.3,45.1
2,17.2,45.9,69.3
3,151.5,41.3,58.5
4,180.8,10.8,58.4


In [30]:
# The target is the same Sales series used for the simple regression.
y.head()

0    22.1
1    10.4
2    12.0
3    16.5
4    17.9
Name: Sales, dtype: float64

In [31]:
# Separate estimator for the multiple-feature model.
mlr = LinearRegression()

In [32]:
# Fit Sales against TV, Radio and Newspaper together.
mlr.fit(data,y)

LinearRegression()

In [34]:
# One coefficient per feature, in the column order the model was fitted on (TV, Radio, Newspaper).
mlr.coef_

array([0.05444578, 0.10700123, 0.00033566])

In [35]:
# Intercept of the fitted multiple regression.
mlr.intercept_

4.625124078808653

In [36]:
# Fitted equation: Sales = 0.05444578*TV + 0.10700123*Radio + 0.00033566*Newspaper + 4.625124078808653

In [37]:
# Predict Sales for one row of spends. The columns are named and ordered as in
# training (TV, Radio, Newspaper), so sklearn does not emit the
# "X does not have valid feature names" warning and no value can be mismatched by position.
mlr.predict(pd.DataFrame({'TV': [10000], 'Radio': [10000], 'Newspaper': [100000]}))

  "X does not have valid feature names, but"


array([1652.66100207])

In [39]:
# R2 on the same data the model was fitted on (a training score, not a held-out test score).
mlr.score(data,y)

0.9025912899684558

## Ways to build a model
#### 1. sklearn, 2. OLS approach (a statistical way of doing an impact analysis)

### The statistical model does not give a final, physical version of the model the way an sklearn model does
### It just gives summary statistics

In [47]:
import statsmodels.formula.api as smf

# Formula interface: Sales is the response, the three ad channels are predictors.
ols_formula = 'Sales ~ TV+Newspaper+Radio'
ols_spec = smf.ols(formula=ols_formula, data=dataframe)
linear_model = ols_spec.fit()

# The summary table reports coefficients, standard errors, p-values and fit statistics.
linear_model.summary()

0,1,2,3
Dep. Variable:,Sales,R-squared:,0.903
Model:,OLS,Adj. R-squared:,0.901
Method:,Least Squares,F-statistic:,605.4
Date:,"Thu, 28 Jul 2022",Prob (F-statistic):,8.13e-99
Time:,12:10:52,Log-Likelihood:,-383.34
No. Observations:,200,AIC:,774.7
Df Residuals:,196,BIC:,787.9
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,4.6251,0.308,15.041,0.000,4.019,5.232
TV,0.0544,0.001,39.592,0.000,0.052,0.057
Newspaper,0.0003,0.006,0.058,0.954,-0.011,0.012
Radio,0.1070,0.008,12.604,0.000,0.090,0.124

0,1,2,3
Omnibus:,16.081,Durbin-Watson:,2.251
Prob(Omnibus):,0.0,Jarque-Bera (JB):,27.655
Skew:,-0.431,Prob(JB):,9.88e-07
Kurtosis:,4.605,Cond. No.,454.0


## Observation

#### P-Value
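Look at the p-value of each feature and compare it with the significance level. If the p-value is less than 0.05, we can keep that feature; otherwise we can discard it.
Newspaper has a p-value of 0.954, well above 0.05, so it is not statistically significant and is a candidate to drop.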

Based on this we can do feature selection



In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV, LassoCV, ElasticNet, ElasticNetCV
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
