In [1]:
# import pandas library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## LinearRegression

In [2]:
cars = pd.read_csv("final_cars.csv")

In [3]:
cars.columns

Index(['engine-size', 'horsepower', 'city-mpg', 'price', 'fuel-type_diesel',
       'fuel-type_gas', 'num-of-doors_four', 'num-of-doors_two',
       'body-style_convertible', 'body-style_hardtop', 'body-style_hatchback',
       'body-style_sedan', 'body-style_wagon', 'drive-wheels_4wd',
       'drive-wheels_fwd', 'drive-wheels_rwd'],
      dtype='object')

In [4]:
## create X and Y
y = cars['price']
X = cars.drop(columns=['price'])

In [5]:
X.shape,y.shape

((201, 15), (201,))

In [6]:
from sklearn.model_selection import train_test_split

In [7]:
from sklearn.linear_model import LinearRegression

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=99)

In [9]:
model = LinearRegression(normalize=True)
model.fit(X_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [10]:
# Displaying coefficient for each column
for t in zip(X_train.columns, model.coef_):
  print(f"{t[0]:25s} {t[1]}")

engine-size               98.10937882922944
horsepower                66.80342477483468
city-mpg                  -72.99584844332992
fuel-type_diesel          1154.0124486992324
fuel-type_gas             -1154.0124486992313
num-of-doors_four         127.56246118535515
num-of-doors_two          -127.56246118535479
body-style_convertible    3217.8011333809995
body-style_hardtop        -2776.302588487373
body-style_hatchback      -530.1056409520495
body-style_sedan          775.1357246406267
body-style_wagon          -948.9876698050732
drive-wheels_4wd          231.0295644189398
drive-wheels_fwd          -814.9889511680777
drive-wheels_rwd          811.5316218480831


### Calculating accuracy with train data

In [11]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
y_pred = model.predict(X_train)
print(np.sqrt(mean_squared_error(y_train,y_pred)))

3110.609181366756


In [12]:
r2score = r2_score(y_train,y_pred)
print(f"R2 Score: {r2score:0.2f}")

R2 Score: 0.82


### Testing with test data

In [13]:
y_pred = model.predict(X_test)

In [14]:
## get MSE 
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

mse = mean_squared_error(y_test,y_pred)
print(f"Mean Squared Error : {mse:0.2f}")

mae = mean_absolute_error(y_test,y_pred)
print(f"Mean Absolute Error : {mae:0.2f}")

r2score = r2_score(y_test,y_pred)
print(f"R2 Score: {r2score:0.2f}")

Mean Squared Error : 17211038.19
Mean Absolute Error : 3004.61
R2 Score: 0.80


## Regularization 

In [15]:
from sklearn.linear_model import LassoCV
from sklearn.metrics import r2_score

In [16]:
lm = LassoCV(alphas=[0.10,0.5,1.0], cv=5, normalize=True)
lm.fit(X,y)

LassoCV(alphas=[0.1, 0.5, 1.0], copy_X=True, cv=5, eps=0.001,
        fit_intercept=True, max_iter=1000, n_alphas=100, n_jobs=None,
        normalize=True, positive=False, precompute='auto', random_state=None,
        selection='cyclic', tol=0.0001, verbose=False)

In [17]:
# Display coefficient for each column
for t in zip(X_train.columns, lm.coef_):
  print(f"{t[0]:25s} {t[1]}")

engine-size               94.90192356563755
horsepower                51.84457401896925
city-mpg                  -130.26334357430622
fuel-type_diesel          3025.7156531830283
fuel-type_gas             -1.197461207360927e-11
num-of-doors_four         355.97541408794484
num-of-doors_two          -0.0
body-style_convertible    2802.665990293213
body-style_hardtop        455.6302334363587
body-style_hatchback      -1216.1885874637012
body-style_sedan          0.0
body-style_wagon          -1091.0168097273513
drive-wheels_4wd          0.0
drive-wheels_fwd          -784.9444260913275
drive-wheels_rwd          1250.0912458063178


In [18]:
y_pred = lm.predict(X_test)

In [19]:
r2score = r2_score(y_test,y_pred)
print(f"R2 Score: {r2score:0.2f}")

R2 Score: 0.87


## RandomForestRegressor 

In [20]:
from sklearn.ensemble import RandomForestRegressor

In [21]:
model = RandomForestRegressor()
model.fit(X_train, y_train)
print('Train score ', model.score(X_train,y_train))
y_pred = model.predict(X_test)
r2score = r2_score(y_test,y_pred)
print(f"R2 Score: {r2score:0.2f}")

Train score  0.9785950697269045
R2 Score: 0.94


### SGDRegressor

In [22]:
cars = pd.read_csv("final_cars.csv")

In [23]:
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import r2_score

In [24]:
# create X and Y
y = cars['price']
X = cars.drop(columns=['price'])

In [25]:
## scale values
from sklearn.preprocessing import scale
X['engine-size'] = scale(X['engine-size'])
X['horsepower'] = scale(X['horsepower'])
X['city-mpg'] = scale(X['city-mpg'])

In [26]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=0)

In [27]:
model = SGDRegressor(random_state=100, penalty="l1")
model.fit(X_train,y_train)
print('Train score ', model.score(X_train,y_train))
y_pred = model.predict(X_test)
r2score = r2_score(y_test,y_pred)
print(f"R2 Score: {r2score:0.2f}")

Train score  0.8246795170649844
R2 Score: 0.82
