# Step 1: Reading and Understanding the Data

Let us first import NumPy and pandas and read the car price dataset.

In [None]:
# Suppress warnings: every warning category is ignored from here on

import warnings
warnings.simplefilter('ignore')

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw CSV; the relative path is resolved against the current working directory
data = pd.read_csv('CarPrice_Assignment.csv')

In [None]:
# Display the freshly loaded frame
data

In [None]:
# (rows, columns) of the raw frame
data.shape

In [None]:
# Column dtypes and non-null counts
data.info()

In [None]:
# Summary statistics of the frame's columns
data.describe()

In [None]:


# Remove the car_ID and CarName columns.
# Rebinding (rather than inplace=True) with errors='ignore' keeps this cell re-runnable:
# executing it a second time no longer raises KeyError once the columns are gone.
data = data.drop(columns=['car_ID', 'CarName'], errors='ignore')

In [None]:
# Display the frame after car_ID and CarName were dropped
data

# Step 2: Visualising the Data

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Pairwise plots across the columns of the frame
sns.pairplot(data)
plt.show()

In [None]:
# Price box plots for five categorical features, placed on a 2x3 grid
plt.figure(figsize=(20, 12))

categorical_features = ['fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel']
for panel, feature in enumerate(categorical_features, start=1):
    plt.subplot(2, 3, panel)
    sns.boxplot(x=feature, y='price', data=data)

plt.show()

In [None]:
# Price box plots for the two remaining categorical features (same 2x3 grid layout)
plt.figure(figsize=(20, 12))
for panel, feature in enumerate(['enginelocation', 'fuelsystem'], start=1):
    plt.subplot(2, 3, panel)
    sns.boxplot(x=feature, y='price', data=data)
plt.show()

In [None]:
# Price by door count, split by fuel type
fig, ax = plt.subplots(figsize=(10, 5))
sns.boxplot(x='doornumber', y='price', hue='fueltype', data=data, ax=ax)
plt.show()

# Step 3: Data Preparation

In [None]:
# Display the frame before encoding
data

In [None]:
# Columns kept for modelling: numeric predictors, the target ('price'), then the categoricals.
d1 = data[['symboling', 'carlength', 'carwidth', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'horsepower', 'citympg', 'highwaympg',
           'price', 'fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem']]
# Categorical columns to expand into one indicator column per level.
varlist = ['fueltype', 'aspiration', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem']
# dtype=int: newer pandas emits bool indicator columns by default; mixed with the float
# columns they turn the design matrix into an object array, which statsmodels OLS and the
# VIF computation reject. Integer 0/1 indicators work on every pandas version.
# The `encoded` alias is kept because the original cell bound it as well.
d1 = encoded = pd.get_dummies(d1, columns=varlist, dtype=int)
d1.head()

In [None]:
# NOTE(review): this inspects `data`, not the encoded frame `d1` built above — confirm which was intended
data.shape

In [None]:
# NOTE(review): this inspects `data`, not the encoded frame `d1` built above — confirm which was intended
data.info()

# Step 4: Splitting the Data into Training and Testing Sets

In [None]:
from sklearn.model_selection import train_test_split

# 70/30 split of the encoded frame d1.
# The integer random_state passed to train_test_split is what keeps the train and test
# rows the same on every run; the NumPy global seed is set in addition.
np.random.seed(0)
df_train, df_test = train_test_split(d1, train_size = 0.7, test_size = 0.3, random_state = 100)

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Unfitted min-max scaler (re-created again in the scaling cell below)
scaler = MinMaxScaler()

In [None]:
# Reference only: excerpt of data.info() output that was pasted into this code cell.
# It is not Python, so it is kept as comments to stop the cell raising SyntaxError on Run All.
# wheelbase         205 non-null    float64
#  10  carlength         205 non-null    float64
#  11  carwidth          205 non-null    float64
#  12  carheight         205 non-null    float64
#  13  curbweight
# enginesize
# boreratio         205 non-null    float64
#  19  stroke            205 non-null    float64
#  20  compressionratio  205 non-null    float64
#  21  horsepower        205 non-null    int64
#  22  peakrpm           205 non-null    int64
#  23  citympg           205 non-null    int64
#  24  highwaympg        205 non-null    int64
#  25  price

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Numeric columns (target 'price' included) that get rescaled; dummy columns are left as-is
num_vars = [
    'symboling', 'carlength', 'carwidth', 'curbweight', 'enginesize',
    'boreratio', 'stroke', 'horsepower', 'citympg', 'highwaympg',
    'price',
]

# Learn the per-column min/max from the training rows only, then rescale them
scaler = MinMaxScaler()
scaler.fit(df_train[num_vars])
df_train[num_vars] = scaler.transform(df_train[num_vars])

In [None]:
# First rows of the training frame after scaling
df_train.head()

In [None]:
# Summary statistics of the training frame after scaling
df_train.describe()

In [None]:
# Correlation matrix of the training frame, drawn as an annotated heatmap
# so that highly correlated variables stand out

corr_matrix = df_train.corr()
fig, ax = plt.subplots(figsize=(100, 100))
sns.heatmap(corr_matrix, annot=True, cmap="YlGnBu", ax=ax)
plt.show()

## Dividing into X and Y sets for the model building

In [None]:
# pop() removes 'price' from df_train in place and returns it as the target.
# X_train is the same object as df_train (no copy is made).
# Re-running this cell raises KeyError because 'price' is already gone.
y_train = df_train.pop('price')
X_train = df_train

In [None]:

# (rows, feature columns) available to the model
X_train.shape

# Step 5: Building a linear model

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm 
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [None]:
# Recursive feature elimination around a plain linear regression.
# No n_features_to_select is given, so RFE's documented default (half of the features) applies.
lm = LinearRegression()
lm.fit(X_train, y_train)
rfe = RFE(estimator=lm).fit(X_train, y_train)

In [None]:
# Display the fitted RFE selector
rfe

In [None]:
# Names of the columns RFE marked as selected (support_ is a boolean mask over the columns)
X_train.columns[rfe.support_]

In [None]:
# Restrict the training features to the columns RFE selected
selected_features = X_train.columns[rfe.support_]
X_train_rfe = X_train[selected_features]
X_train_rfe.head()

In [None]:
def build_model(X,y):
    """Fit an OLS regression of ``y`` on ``X`` and print its summary.

    A constant column is added to ``X`` via ``sm.add_constant`` before fitting.

    Returns the design matrix that was used for the fit (``X`` after
    ``add_constant``), not the fitted model, so callers can drop a column
    from it and pass it straight back in.
    """
    design = sm.add_constant(X)
    fitted = sm.OLS(y, design).fit()
    print(fitted.summary())
    return design
    
def checkVIF(X):
    """Return a table of variance inflation factors for the columns of ``X``.

    The result is a DataFrame with columns ``Features`` and ``VIF`` (rounded
    to two decimals), sorted from highest to lowest VIF. Every column of ``X``
    is scored, including a constant column if one is present.
    """
    matrix = X.values
    scores = [variance_inflation_factor(matrix, idx) for idx in range(X.shape[1])]
    table = pd.DataFrame({'Features': X.columns, 'VIF': scores})
    table['VIF'] = round(table['VIF'], 2)
    return table.sort_values(by="VIF", ascending=False)

In [None]:
# First fit on the RFE-selected features; build_model returns the design matrix with the constant added
X_train_new = build_model(X_train_rfe,y_train)

In [None]:
# VIF table for the current design matrix (the constant column is scored too)
checkVIF(X_train_new)

In [None]:
# Elimination step: remove curbweight from the design matrix
X_train_new = X_train_new.drop(columns=["curbweight"])


In [None]:
# Refit on the reduced design matrix and print the new summary
X_train_new = build_model(X_train_new,y_train)

In [None]:
# Elimination step: remove carbody_convertible from the design matrix
X_train_new = X_train_new.drop(columns=["carbody_convertible"])


In [None]:
# Refit on the reduced design matrix and print the new summary
X_train_new = build_model(X_train_new,y_train)

In [None]:
# Elimination step: remove enginetype_dohcv from the design matrix
X_train_new = X_train_new.drop(columns=["enginetype_dohcv"])

In [None]:
# Refit on the reduced design matrix and print the new summary
X_train_new = build_model(X_train_new,y_train)

In [None]:
# Elimination step: remove enginelocation_rear, then refit
X_train_new = X_train_new.drop(columns=["enginelocation_rear"])
X_train_new = build_model(X_train_new, y_train)

In [None]:
# Elimination step: remove enginetype_rotor, then refit
X_train_new = X_train_new.drop(columns=["enginetype_rotor"])
X_train_new = build_model(X_train_new, y_train)


In [None]:
# Elimination step: remove enginesize, refit, then re-check the VIF table
X_train_new = X_train_new.drop(columns=["enginesize"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Elimination step: remove enginetype_ohcv, then refit
X_train_new = X_train_new.drop(columns=["enginetype_ohcv"])
X_train_new = build_model(X_train_new, y_train)


In [None]:
# Elimination step: remove cylindernumber_twelve, then refit
X_train_new = X_train_new.drop(columns=["cylindernumber_twelve"])
X_train_new = build_model(X_train_new, y_train)


In [None]:
# Elimination step: remove aspiration_turbo, refit, then re-check the VIF table
X_train_new = X_train_new.drop(columns=["aspiration_turbo"])
X_train_new = build_model(X_train_new, y_train)
checkVIF(X_train_new)

In [None]:
# Elimination step: remove cylindernumber_three, then refit
X_train_new = X_train_new.drop(columns=["cylindernumber_three"])
X_train_new = build_model(X_train_new, y_train)


In [None]:
# Elimination step: remove cylindernumber_two, then refit
X_train_new = X_train_new.drop(columns=["cylindernumber_two"])
X_train_new = build_model(X_train_new, y_train)


In [None]:
# Elimination step: remove fuelsystem_spdi, then refit
X_train_new = X_train_new.drop(columns=["fuelsystem_spdi"])
X_train_new = build_model(X_train_new, y_train)


In [None]:
# Fit the final OLS model on the remaining design matrix and compute in-sample predictions
lm = sm.OLS(endog=y_train, exog=X_train_new).fit()
y_train_price = lm.predict(exog=X_train_new)

In [None]:
# Distribution of the training residuals (actual minus predicted price)
fig = plt.figure()
# NOTE(review): sns.distplot is deprecated in newer seaborn releases; consider
# histplot/displot once the installed seaborn version is confirmed
sns.distplot((y_train - y_train_price), bins = 20)
fig.suptitle('Error Terms', fontsize = 20)                  # Plot heading
plt.xlabel('Errors', fontsize = 18)                         # X-label
plt.show()                                                  # Render like the other plot cells; avoids echoing the Text repr

In [None]:
# Same numeric columns that were scaled on the training frame
num_vars = [
    'symboling', 'carlength', 'carwidth', 'curbweight', 'enginesize',
    'boreratio', 'stroke', 'horsepower', 'citympg', 'highwaympg',
    'price',
]
# transform only: reuse the min/max already stored in `scaler`, do not refit on test rows
scaled_test = scaler.transform(df_test[num_vars])
df_test[num_vars] = scaled_test

In [None]:
# Summary statistics of the test frame after scaling
df_test.describe()

In [None]:
# pop() removes 'price' from df_test in place and returns it as the target.
# X_test is the same object as df_test (no copy is made).
# Re-running this cell raises KeyError because 'price' is already gone.
y_test = df_test.pop('price')
X_test = df_test

In [None]:
# Build the test design matrix with exactly the columns of the final training model.
# errors='ignore' keeps the cell re-runnable: on a second run 'const' is already gone.
X_train_new = X_train_new.drop(columns='const', errors='ignore')
X_test_new = X_test[X_train_new.columns]
# has_constant='add' always inserts the intercept column. With the default 'skip',
# add_constant leaves it out whenever a test column happens to be constant, which
# would misalign the columns with the fitted parameters.
X_test_new = sm.add_constant(X_test_new, has_constant='add')

In [None]:
# Predictions of the final model on the test design matrix
y_pred = lm.predict(X_test_new)

In [None]:
from sklearn.metrics import r2_score
# R-squared of the test predictions against the actual test targets
r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Actual vs. predicted price on the test set
fig = plt.figure()
plt.scatter(y_test,y_pred)
fig.suptitle('y_test vs y_pred', fontsize=20)              # Plot heading
plt.xlabel('y_test', fontsize=18)                          # X-label
plt.ylabel('y_pred', fontsize=16)                          # Y-label
plt.show()                                                 # Render like the other plot cells; avoids echoing the Text repr

In [None]:
# Summary of the final fitted OLS model
print(lm.summary())