# Loading in the Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
df1 = pd.read_csv("car_prices.csv")
df1.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,alfa-romero giulia,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,alfa-romero stelvio,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,alfa-romero Quadrifoglio,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,audi 100 ls,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,audi 100ls,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


# Data Cleaning - through excel and python

In [3]:
df1['CarName'] = df1['CarName'].replace(['vokswagen','Nissan','toyouta','vw','porcshce'],['volkswagen','nissan','toyota','volkswagen','porsche'])


In [4]:
df2 = df1.drop(['symboling','car_ID'],axis='columns')
df2.shape

(205, 24)

In [5]:
df3 = df2.dropna()
df3.isnull().sum()

CarName             0
fueltype            0
aspiration          0
doornumber          0
carbody             0
drivewheel          0
enginelocation      0
wheelbase           0
carlength           0
carwidth            0
carheight           0
curbweight          0
enginetype          0
cylindernumber      0
enginesize          0
fuelsystem          0
boreratio           0
stroke              0
compressionratio    0
horsepower          0
peakrpm             0
citympg             0
highwaympg          0
price               0
dtype: int64

# Creating Dummy Variables for Categorical Data

In [6]:
df3.dtypes

CarName              object
fueltype             object
aspiration           object
doornumber           object
carbody              object
drivewheel           object
enginelocation       object
wheelbase           float64
carlength           float64
carwidth            float64
carheight           float64
curbweight            int64
enginetype           object
cylindernumber       object
enginesize            int64
fuelsystem           object
boreratio           float64
stroke              float64
compressionratio    float64
horsepower            int64
peakrpm               int64
citympg               int64
highwaympg            int64
price               float64
dtype: object

In [7]:
X = df3[['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem']]
X = pd.get_dummies(data=X, drop_first=True)
X.head()
y = df3['price']

In [8]:
df4 = df3.drop(['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel','enginelocation','enginetype','cylindernumber','fuelsystem'],axis='columns')


In [9]:
df5 = pd.concat([df4,X],axis='columns')
df5.head()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,...,cylindernumber_three,cylindernumber_twelve,cylindernumber_two,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,...,0,0,0,0,0,0,0,1,0,0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,...,0,0,0,0,0,0,0,1,0,0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,...,0,0,0,0,0,0,0,1,0,0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,...,0,0,0,0,0,0,0,1,0,0


In [10]:
df5.dtypes

wheelbase          float64
carlength          float64
carwidth           float64
carheight          float64
curbweight           int64
                    ...   
fuelsystem_idi       uint8
fuelsystem_mfi       uint8
fuelsystem_mpfi      uint8
fuelsystem_spdi      uint8
fuelsystem_spfi      uint8
Length: 65, dtype: object

# Creating a Linear Regression Model

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.7889066080053134

# Alternative Model Creating

# Feature selection 

## Splitting categorical and numerical data 

## Using Kendall's tau to reduce categorical features 

In [12]:
df_categorical = df5.drop(['carlength', 'wheelbase', 'carwidth', 'carheight','curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower','peakrpm','citympg','highwaympg'],axis='columns')
X = df_categorical.drop(['price'],axis='columns')
y = df_categorical.price
df_categorical.head()

Unnamed: 0,price,CarName_audi,CarName_bmw,CarName_buick,CarName_chevrolet,CarName_dodge,CarName_honda,CarName_isuzu,CarName_jaguar,CarName_maxda,...,cylindernumber_three,cylindernumber_twelve,cylindernumber_two,fuelsystem_2bbl,fuelsystem_4bbl,fuelsystem_idi,fuelsystem_mfi,fuelsystem_mpfi,fuelsystem_spdi,fuelsystem_spfi
0,13495.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,16500.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,16500.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,13950.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,17450.0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [13]:
from scipy.stats import kendalltau
import math
j = []
columns_del = []
for i in X.columns:
    tau, p = kendalltau(df5[i], y)
    if abs(tau) < .5 or p > .15:
        j.append([tau,p,i])
        columns_del.append(str(i))
a = pd.DataFrame(j,columns=['Kendalls tau', 'p-value', 'feature'])

kept_categorical = df_categorical.drop(columns_del,axis='columns')
kept_categorical.head()

Unnamed: 0,price,drivewheel_fwd,drivewheel_rwd,fuelsystem_2bbl,fuelsystem_mpfi
0,13495.0,0,1,0,1
1,16500.0,0,1,0,1
2,16500.0,0,1,0,1
3,13950.0,1,0,0,1
4,17450.0,0,0,0,1


## Using Pearson's correlation to reduce numerical features

In [14]:
numerical = df4
X = numerical.drop(['price'],axis='columns')
y = numerical.price
numerical.head()

Unnamed: 0,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0
1,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0
2,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0
3,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,10.0,102,5500,24,30,13950.0
4,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,8.0,115,5500,18,22,17450.0


In [15]:
import numpy as np
yval = np.array(y)
columns_del = []
for (columnName, columnData) in X.iteritems():
    xval = np.array(columnData)
    my_rho = np.corrcoef(xval, yval)
    if abs(my_rho[0][1]) < .7:
        columns_del.append(str(columnName))
print(columns_del)
        
kept_numerical = numerical.drop(columns_del,axis='columns')
kept_numerical.head()

['wheelbase', 'carlength', 'carheight', 'boreratio', 'stroke', 'compressionratio', 'peakrpm', 'citympg', 'highwaympg']


Unnamed: 0,carwidth,curbweight,enginesize,horsepower,price
0,64.1,2548,130,111,13495.0
1,64.1,2548,130,111,16500.0
2,65.5,2823,152,154,16500.0
3,66.2,2337,109,102,13950.0
4,66.4,2824,136,115,17450.0


## Combining datasets into one 

In [16]:
a = kept_categorical.drop('price',axis='columns')
final_df = pd.concat([kept_numerical, a], axis=1, join='inner')
final_df.head()

Unnamed: 0,carwidth,curbweight,enginesize,horsepower,price,drivewheel_fwd,drivewheel_rwd,fuelsystem_2bbl,fuelsystem_mpfi
0,64.1,2548,130,111,13495.0,0,1,0,1
1,64.1,2548,130,111,16500.0,0,1,0,1
2,65.5,2823,152,154,16500.0,0,1,0,1
3,66.2,2337,109,102,13950.0,1,0,0,1
4,66.4,2824,136,115,17450.0,0,0,0,1


## Creating our Final Model 

In [20]:
X = final_df.drop('price',axis='columns')
y = final_df['price']

In [21]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=10)
from sklearn.linear_model import LinearRegression
lr_clf = LinearRegression()
lr_clf.fit(X_train,y_train)
lr_clf.score(X_test,y_test)

0.6411945031266154