# Multi-Linear Regression Q1 & Q2

### 1.Prepare a prediction model for profit of 50_startups data.
### Do transformations for getting better predictions of profit and
### make a table containing R^2 value for each prepared model.

### R&D Spend -- Research and development spend in the past few years
### Administration -- spend on administration in the past few years
### Marketing Spend -- spend on Marketing in the past few years
### State -- states from which data is collected
### Profit  -- profit of each state in the past few years

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.graphics.regressionplots import influence_plot
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


In [None]:
help(pd.read_csv)

In [None]:
df=pd.read_csv('D:/MultiLinearRegression/50_Startups.csv');

In [None]:
df

In [None]:
# Print the column dtypes and non-null counts. The original referenced the
# bound method without calling it, which only echoes the method object.
df.info()

In [None]:
# Shorten the predictor column names so they are valid identifiers in the
# statsmodels formulas used below ("R&D Spend" contains '&' and a space).
short_names={'R&D Spend':'RDS','Administration':'ADMS','Marketing Spend':'MKTS'}
df1=df.rename(columns=short_names)

In [None]:
df1

In [None]:
df1[df1.duplicated()]

In [None]:
df1.describe()

In [None]:
# Pairwise correlations of the numeric columns only. 'State' is categorical;
# pandas >= 2.0 raises on non-numeric columns instead of silently skipping them,
# so select the numeric columns explicitly (works on every pandas version).
df1.select_dtypes(include='number').corr()

In [None]:
sns.set_style(style='darkgrid')
sns.pairplot(df1)

In [None]:
model=smf.ols("Profit~RDS+ADMS+MKTS",data=df1).fit()

In [None]:
model.params

In [None]:
model.tvalues , np.round(model.pvalues,5)

In [None]:
model.rsquared , model.rsquared_adj

In [None]:
slr_a=smf.ols("Profit~ADMS",data=df1).fit()
slr_a.tvalues , slr_a.pvalues

In [None]:
slr_m=smf.ols("Profit~MKTS",data=df1).fit()
slr_m.tvalues , slr_m.pvalues

In [None]:
mlr_am=smf.ols("Profit~ADMS+MKTS",data=df1).fit()
mlr_am.tvalues , mlr_am.pvalues

In [None]:
# Variance inflation factor (VIF) for each predictor: regress the predictor on
# the other two predictors and compute 1/(1-R^2) from that auxiliary fit.
rsq_r=smf.ols("RDS~ADMS+MKTS",data=df1).fit().rsquared
vif_r=1/(1-rsq_r)

rsq_a=smf.ols("ADMS~RDS+MKTS",data=df1).fit().rsquared
vif_a=1/(1-rsq_a)

rsq_m=smf.ols("MKTS~RDS+ADMS",data=df1).fit().rsquared
vif_m=1/(1-rsq_m)

# Collect the three VIF values into a small table for display.
d1={'Variables':['RDS','ADMS','MKTS'],'vif':[vif_r,vif_a,vif_m]}
Vif_df=pd.DataFrame(d1)
Vif_df

In [None]:
sm.qqplot(model.resid,line='q')
plt.title("Normal Q-Q plot of residuals")
plt.show()

In [None]:
list(np.where(model.resid<-30000))

In [None]:
def standard_values(vals):
    """Return *vals* standardized: centred on its mean and divided by its ``std()``.

    ``vals`` must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic (e.g. a pandas Series or a NumPy array).
    """
    centred = vals - vals.mean()
    return centred / vals.std()

In [None]:
plt.scatter(standard_values(model.fittedvalues),standard_values(model.resid))
plt.title('Residual Plot')
plt.xlabel('standardized fitted values')
plt.ylabel('standardized residual values')
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'RDS',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'ADMS',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'MKTS',fig=fig)
plt.show()

In [None]:
(d,_)=model.get_influence().cooks_distance

In [None]:
d

In [None]:
fig=plt.figure(figsize=(20,7))
plt.stem(np.arange(len(df1)),np.round(d,5))
plt.xlabel('Row Index')
plt.ylabel('Cooks Distance')
plt.show()

In [None]:
np.argmax(d) , np.max(d)

In [None]:
influence_plot(model)
plt.show()

In [None]:
# Leverage cut-off 3*(k+1)/n, where k is the number of predictors in the fitted
# model. The original used df1.shape[1], which also counts the response (Profit)
# and the unused State column and therefore inflated the cut-off.
k=len(model.params)-1   # model.params includes the intercept
n=df1.shape[0]
leverage_cutoff = (3*(k+1))/n
leverage_cutoff

In [None]:
df1[df1.index.isin([49])]

In [None]:
data2=df1.drop(df1.index[[49]],axis=0).reset_index(drop=True)
data2

In [None]:
model2=smf.ols("Profit~RDS+ADMS+MKTS",data=data2).fit()

In [None]:
# Repeatedly drop the observation with the largest Cook's distance and refit
# until the model reaches the target R^2.
# Changes from the original cell:
#   * the `for d in [np.max(d)>1]` wrapper ran exactly once and only shadowed `d`;
#   * `final_model` was left undefined when the loop body never executed;
#   * one more observation was dropped after the target had already been reached;
#   * nothing stopped the loop from running out of data.
target_rsq=0.99
model2=smf.ols("Profit~RDS+ADMS+MKTS",data=data2).fit()
min_rows=len(model2.params)+1   # keep at least one residual degree of freedom
while model2.rsquared < target_rsq and len(data2) > min_rows:
    (d,_)=model2.get_influence().cooks_distance
    data2=data2.drop(data2.index[[np.argmax(d)]],axis=0).reset_index(drop=True)
    model2=smf.ols("Profit~RDS+ADMS+MKTS",data=data2).fit()
# final_model is the fit on exactly the rows that remain in data2.
final_model=model2
print("Thus model accuracy is improved to",final_model.rsquared)

In [None]:
final_model.rsquared

In [None]:
data2

In [None]:
new_data=pd.DataFrame({'RDS':70000,"ADMS":90000,"MKTS":140000},index=[0])

In [None]:
new_data

In [None]:
final_model.predict(new_data)

In [None]:
pred_y=final_model.predict(data2)

In [None]:
pred_y

In [None]:
d2={'Prep_Models':['Model','Final_Model'],'Rsquared':[model.rsquared,final_model.rsquared]}
table=pd.DataFrame(d2)
table

In [None]:
# View the shape of the dataframe
shape=df.shape
print("Dataset contains {} rows and {} columns".format(shape[0],shape[1]))

In [None]:
df.columns  # view all the columns in the dataframe

In [None]:
# Statistical details of the dataset
df.describe()

In [None]:
# Define X and Y
x=df.iloc[:,:4]
y=df.iloc[:,4]  # This is like extracting dependent and independent variables.

In [None]:
# Perform One hot Encoding
from sklearn.preprocessing import OneHotEncoder
# The `sparse` keyword was renamed to `sparse_output` and later removed from
# scikit-learn; use the default encoder and densify the result so the cell runs
# on both old and new versions.
ohe=OneHotEncoder()
x1=ohe.fit_transform(data2[['State']]).toarray()

In [None]:
x1

In [None]:
ohe.categories_

In [None]:
# Change columns using Column Transformer
from sklearn.compose import make_column_transformer

In [None]:
col_trans=make_column_transformer((OneHotEncoder(handle_unknown='ignore'),['State']),remainder='passthrough')

In [None]:
x=col_trans.fit_transform(x)

In [None]:
# Split the Data set into Train Set and Test Set
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=0)

In [None]:
#shape of the split data
print("X_train:",x_train.shape)
print("X_test:",x_test.shape)
print("Y_train:",y_train.shape)
print("Y_test:",y_test.shape)

In [None]:
#Train the model
linreg=LinearRegression()
linreg.fit(x_train,y_train)

In [None]:
#predict the Test Results
y_pred=linreg.predict(x_test)
y_pred

In [None]:
#Evaluate the model
Accuracy=r2_score(y_test,y_pred)*100
print("Accuracy of the model is %.2f" %Accuracy)

# Plot a graph Between Actual values & Predicted values

In [None]:
#Plot the results
plt.scatter(y_test,y_pred);
plt.xlabel('Actual');
plt.ylabel('Predicted');

In [None]:
sns.regplot(x=y_test,y=y_pred,ci=None,color ='red');

In [None]:
#Predicted Values
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})

In [None]:
pred_df

========================================================First Question=========================================================

### 2.Consider only the below columns and prepare a prediction model for predicting Price.

### Corolla<-Corolla[c("Price","Age_08_04","KM","HP","cc","Doors","Gears","Quarterly_Tax","Weight")]

 

#### Model -- model of the car
#### Price  -- Offer Price in EUROs	
#### Age_08_04 -- Age in months as in August 2004	
#### Mfg_Month -- Manufacturing month (1-12)	
Mfg_Year	-- Manufacturing Year
KM -- Accumulated Kilometers on odometer
Fuel_Type	 -- Fuel Type (Petrol, Diesel, CNG)
HP -- Horse Power
Met_Color	 -- Metallic Color?  (Yes=1, No=0)
Color -- Color (Blue, Red, Grey, Silver, Black, etc.)
Automatic	-- Automatic (Yes=1, No=0)
cc -- Cylinder Volume in cubic centimeters
Doors -- Number of doors
Cylinders	-- Number of cylinders
Gears -- Number of gear positions
Quarterly_Tax -- Quarterly road tax in EUROs
Weight -- Weight in Kilograms
Mfr_Guarantee -- Within Manufacturer's Guarantee period  (Yes=1, No=0)
BOVAG_Guarantee -- BOVAG (Dutch dealer network) Guarantee  (Yes=1, No=0)
Guarantee_Period -- 	Guarantee period in months
ABS -- Anti-Lock Brake System (Yes=1, No=0)
Airbag_1 -- Driver_Airbag  (Yes=1, No=0)
Airbag_2 -- Passenger Airbag  (Yes=1, No=0)
Airco -- Airconditioning  (Yes=1, No=0)
Automatic_airco -- Automatic Airconditioning  (Yes=1, No=0)
Boardcomputer -- Boardcomputer  (Yes=1, No=0)
CD_Player -- CD Player  (Yes=1, No=0)
Central_Lock -- Central Lock  (Yes=1, No=0)
Powered_Windows -- Powered Windows  (Yes=1, No=0)
Power_Steering -- Power Steering  (Yes=1, No=0)
Radio -- Radio  (Yes=1, No=0)
Mistlamps	-- Mistlamps  (Yes=1, No=0)
Sport_Model -- Sport Model  (Yes=1, No=0)
Backseat_Divider -- Backseat Divider  (Yes=1, No=0)
Metallic_Rim --Metallic Rim  (Yes=1, No=0)
Radio_cassette -- Radio Cassette  (Yes=1, No=0)
Tow_Bar -- Tow Bar  (Yes=1, No=0)




In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
import statsmodels.api as sm
from statsmodels.graphics.regressionplots import influence_plot

In [None]:
help(pd.read_csv)

In [None]:
toco=pd.read_csv('D:/MultiLinearRegression/ToyotaCorolla.csv',encoding='latin1');

In [None]:
toco

In [None]:
toco.info()

In [None]:
toco.describe()

In [None]:
# Keep only the columns requested for the model, selected by name (in the same
# order the positional slices produced) so the cell does not depend on the
# column order of the CSV file.
model_columns=['Price','Age_08_04','KM','HP','cc','Doors','Gears','Quarterly_Tax','Weight']
toco2=toco[model_columns].copy()

In [None]:
toco2

In [None]:
# Give three columns shorter names for use in the regression formulas below.
short_names={'Age_08_04':'Age','cc':'CC','Quarterly_Tax':'QT'}
toco3=toco2.rename(columns=short_names)

In [None]:
toco3

In [None]:
toco3[toco3.duplicated()]

In [None]:
toco4=toco3.drop_duplicates().reset_index(drop=True)

In [None]:
toco4

In [None]:
toco4.describe()

In [None]:
toco4.corr()

In [None]:
sns.set_style(style='darkgrid')
sns.pairplot(toco4)

In [None]:
model=smf.ols('Price~Age+KM+HP+CC+Doors+Gears+QT+Weight',data=toco4).fit()

In [None]:
model.params

In [None]:
model.tvalues, np.round(model.pvalues,5)

In [None]:
model.rsquared , model.rsquared_adj

In [None]:
slr_c=smf.ols('Price~CC',data=toco4).fit()
slr_c.tvalues , slr_c.pvalues

In [None]:
slr_d=smf.ols('Price~Doors',data=toco4).fit()
slr_d.tvalues , slr_d.pvalues

In [None]:
mlr_cd=smf.ols('Price~CC+Doors',data=toco4).fit()
mlr_cd.tvalues , mlr_cd.pvalues

In [None]:
# Variance inflation factor (VIF) for each of the eight predictors: regress the
# predictor on the remaining seven and compute 1/(1-R^2) from that auxiliary fit.
rsq_age=smf.ols('Age~KM+HP+CC+Doors+Gears+QT+Weight',data=toco4).fit().rsquared
vif_age=1/(1-rsq_age)

rsq_KM=smf.ols('KM~Age+HP+CC+Doors+Gears+QT+Weight',data=toco4).fit().rsquared
vif_KM=1/(1-rsq_KM)

rsq_HP=smf.ols('HP~Age+KM+CC+Doors+Gears+QT+Weight',data=toco4).fit().rsquared
vif_HP=1/(1-rsq_HP)

rsq_CC=smf.ols('CC~Age+KM+HP+Doors+Gears+QT+Weight',data=toco4).fit().rsquared
vif_CC=1/(1-rsq_CC)

rsq_DR=smf.ols('Doors~Age+KM+HP+CC+Gears+QT+Weight',data=toco4).fit().rsquared
vif_DR=1/(1-rsq_DR)

rsq_GR=smf.ols('Gears~Age+KM+HP+CC+Doors+QT+Weight',data=toco4).fit().rsquared
vif_GR=1/(1-rsq_GR)

rsq_QT=smf.ols('QT~Age+KM+HP+CC+Doors+Gears+Weight',data=toco4).fit().rsquared
vif_QT=1/(1-rsq_QT)

rsq_WT=smf.ols('Weight~Age+KM+HP+CC+Doors+Gears+QT',data=toco4).fit().rsquared
vif_WT=1/(1-rsq_WT)

# Collect the VIF values into a table, one row per predictor.
d1={'Variables':['Age','KM','HP','CC','Doors','Gears','QT','Weight'],
    'Vif':[vif_age,vif_KM,vif_HP,vif_CC,vif_DR,vif_GR,vif_QT,vif_WT]}
Vif_df=pd.DataFrame(d1)
Vif_df


In [None]:
# Q-Q plot of the model residuals against a normal distribution.
sm.qqplot(model.resid,line='q')
plt.title("Normal Q-Q plot of residuals")
# Call show(); the original wrote `plt.show` without parentheses, so it never ran.
plt.show()

In [None]:
list(np.where(model.resid>6000))

In [None]:
list(np.where(model.resid<-6000))

In [None]:
def standard_values(vals):
    """Return *vals* standardized: centred on its mean and divided by its ``std()``.

    ``vals`` must provide ``mean()`` and ``std()`` and support element-wise
    arithmetic (e.g. a pandas Series or a NumPy array).
    """
    centred = vals - vals.mean()
    return centred / vals.std()

In [None]:
plt.scatter(standard_values(model.fittedvalues),standard_values(model.resid))
plt.title('Residual Plot')
plt.xlabel('standardized fitted values')
plt.ylabel('standardized residuals values')
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'Age',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'KM',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'HP',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'CC',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'Doors',fig=fig)
plt.show()


In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'Gears',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'QT',fig=fig)
plt.show()

In [None]:
fig=plt.figure(figsize=(15,8))
sm.graphics.plot_regress_exog(model,'Weight',fig=fig)
plt.show()

In [None]:
(c,_)=model.get_influence().cooks_distance

In [None]:
c

In [None]:
fig=plt.figure(figsize=(20,7))
plt.stem(np.arange(len(toco4)),np.round(c,3))
plt.xlabel('Row Index')
plt.ylabel('Cooks distance')
plt.show()

In [None]:
np.argmax(c), np.max(c)

In [None]:
fig,ax=plt.subplots(figsize=(20,20))
fig=influence_plot(model,ax = ax)

In [None]:
# Leverage cut-off 3*(k+1)/n, where k is the number of predictors in the fitted
# model. The original used toco4.shape[1], which also counts the response column
# (Price) as a predictor.
k=len(model.params)-1   # model.params includes the intercept
n=toco4.shape[0]
leverage_cutoff = (3*(k+1))/n
leverage_cutoff

In [None]:
toco4[toco4.index.isin([80])]

In [None]:
toco_new=toco4.copy()

In [None]:
toco_new

In [None]:
toco5=toco_new.drop(toco_new.index[[80]],axis=0).reset_index(drop=True)

In [None]:
toco5

In [None]:
# Refit on the current data and drop the single most influential observation,
# repeating until no Cook's distance exceeds the cut-off.
# The original tested the Cook's distances of the *previous* fit and then
# dropped a row unconditionally, so it removed one observation even when the
# refit model was already below the cut-off.
cooks_cutoff=0.5
while True:
    model=smf.ols('Price~Age+KM+HP+CC+Doors+Gears+QT+Weight',data=toco5).fit()
    (c,_)=model.get_influence().cooks_distance
    if np.max(c)<=cooks_cutoff:
        break
    toco5=toco5.drop(toco5.index[[np.argmax(c)]],axis=0).reset_index(drop=True)
# `model` was fitted on exactly the rows that remain in toco5.
final_model=model
print("Thus model accuracy is improved to",final_model.rsquared)


In [None]:
# Re-check influence on the current data and refresh final_model.
# Changes from the original cell:
#   * Cook's distances are recomputed for the current toco5 instead of reusing
#     a stale `c` from an earlier fit;
#   * a row is dropped only when its distance actually exceeds the cut-off;
#   * final_model is always refitted, so it cannot lag behind toco5 (the
#     original skipped the refit after dropping a row, and ran neither branch
#     when the maximum was exactly 0.5).
model=smf.ols('Price~Age+KM+HP+CC+Doors+Gears+QT+Weight',data=toco5).fit()
(c,_)=model.get_influence().cooks_distance
if np.max(c)>0.5:
    toco5=toco5.drop(toco5.index[[np.argmax(c)]],axis=0).reset_index(drop=True)
final_model=smf.ols('Price~Age+KM+HP+CC+Doors+Gears+QT+Weight',data=toco5).fit()
print("Thus model accuracy is improved to",final_model.rsquared)


In [None]:
final_model.rsquared

In [None]:
toco5

In [None]:
new_data=pd.DataFrame({'Age':12,"KM":40000,"HP":80,"CC":1300,"Doors":4,"Gears":5,"QT":69,"Weight":1012},index=[0])

In [None]:
new_data

In [None]:
final_model.predict(new_data)

In [None]:
pred_y=final_model.predict(toco5)

In [None]:
pred_y

# Ridge Regression

==========================================================Second Question======================================================