## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# NOTE(review): absolute, machine-specific location — this cell only runs where this file exists.
DATA_PATH = r'F:\Chrome Downloads\CSV files\Data Analysis Challenge Data Set.csv'
df = pd.read_csv(DATA_PATH)

## EDA

In [3]:
# Swap the '?' placeholder in normalized-losses for a flat value of 100.
df['normalized-losses'] = df['normalized-losses'].replace({'?': 100})

In [4]:
# Cast normalized-losses to integers (assumes every remaining entry parses as an int — TODO confirm).
df['normalized-losses'] = df['normalized-losses'].astype('int')

In [5]:
# Treat the '?' placeholder in num-of-doors as 'four'.
df['num-of-doors'] = df['num-of-doors'].replace({'?': 'four'})

In [6]:
#df['drive-wheels']=df['drive-wheels'].replace('4wd','fwd')

In [7]:
# Fold the engine-type variants into their base labels: dohcv -> dohc, ohcv/ohcf -> ohc.
df['engine-type'] = df['engine-type'].replace(
    to_replace={
        'dohcv': 'dohc',
        'ohcv': 'ohc',
        'ohcf': 'ohc',
    }
)

In [8]:
# Replace the '?' placeholder in price with 10000, then cast the column to integers.
# NOTE(review): price is later used as the regression target, so these rows carry a made-up target value.
df['price'] = (
    df['price']
    .replace({'?': 10000})
    .astype('int')
)

In [9]:
# Replace the '?' placeholder in peak-rpm with 5200, then cast the column to integers.
df['peak-rpm'] = (
    df['peak-rpm']
    .replace({'?': 5200})
    .astype('int')
)

In [10]:
# Replace the '?' placeholder in horsepower with 86, then cast the column to integers.
df['horsepower'] = (
    df['horsepower']
    .replace({'?': 86})
    .astype('int')
)

In [11]:
# Replace the '?' placeholder in stroke with 3.4, then cast the column to floats.
df['stroke'] = (
    df['stroke']
    .replace({'?': 3.4})
    .astype('float')
)

In [12]:
# Replace the '?' placeholder in bore with 3.4, then cast the column to floats.
# NOTE(review): this is the same fill value used for stroke — confirm it is intended for bore too.
df['bore'] = (
    df['bore']
    .replace({'?': 3.4})
    .astype('float')
)

In [98]:
#df.info()

In [97]:
#df.T.head(30)

## Feature Selection

In [15]:
# Summary of the make column: count, number of distinct makes, most frequent make and its frequency.
df['make'].describe()

count        205
unique        22
top       toyota
freq          32
Name: make, dtype: object

In [16]:
# symboling: the insurance risk rating assigned to the car (categorical).
#            A value of +3 indicates that the car is risky;
#            -3 indicates that it is probably quite safe.

In [17]:
# Mean price per make, one row per make, with the averaged column renamed to brand_avg_price.
df_comp_avg_price = (
    df.groupby('make', as_index=False)[['price']]
    .mean()
    .rename(columns={'price': 'brand_avg_price'})
)

In [18]:
# Display the per-make average price table.
df_comp_avg_price

Unnamed: 0,make,brand_avg_price
0,alfa-romero,15498.333333
1,audi,16736.428571
2,bmw,26118.75
3,chevrolet,6007.0
4,dodge,7875.444444
5,honda,8184.692308
6,isuzu,9458.25
7,jaguar,34600.0
8,mazda,10652.882353
9,mercedes-benz,33647.0


In [19]:
# Attach each make's average price to every row of df (default inner join on 'make').
# Any brand_avg_price column left over from a previous run of this cell is dropped first;
# otherwise the merge would produce brand_avg_price_x / brand_avg_price_y and later
# lookups of df['brand_avg_price'] would raise KeyError.
df = (
    df.drop(columns='brand_avg_price', errors='ignore')
    .merge(df_comp_avg_price, on='make')
)

In [20]:
# Bucket each row by its make's average price:
#   below 10000 -> "Budget", 10000 up to (not including) 20000 -> "Mid_Range", anything else -> "Luxury".
# np.select takes the first matching condition, so the second test only applies to values >= 10000.
df['brand_category'] = np.select(
    [df['brand_avg_price'] < 10000, df['brand_avg_price'] < 20000],
    ["Budget", "Mid_Range"],
    default="Luxury",
)

In [21]:
# A single mileage variable is computed as a weighted average: 55% city mpg and 45% highway mpg.

In [22]:
# Combined mileage: 55% weight on city-mpg, 45% weight on highway-mpg.
df['mileage'] = 0.55 * df['city-mpg'] + 0.45 * df['highway-mpg']

In [23]:
# Subset of df carried forward for modelling; column order is kept exactly as listed.
auto = df[[
    'fuel-type',
    'aspiration',
    'body-style',
    'drive-wheels',
    'wheel-base',
    'length',
    'width',
    'curb-weight',
    'engine-type',
    'num-of-cylinders',
    'engine-size',
    'bore',
    'horsepower',
    'price',
    'brand_category',
    'mileage',
]]

In [24]:
# List every column currently in df, including the engineered ones added above.
df.columns

Index(['symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
       'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
       'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
       'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
       'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg',
       'highway-mpg', 'price', 'brand_avg_price', 'brand_category', 'mileage'],
      dtype='object')

In [96]:
#auto.head()

## Creating Dummies

In [26]:
# Indicator columns for num-of-cylinders; drop_first=True omits the first category's column.
cyl_no = pd.get_dummies(data=auto['num-of-cylinders'], drop_first=True)


In [27]:
# Append the cylinder-count indicator columns to auto, side by side.
auto = pd.concat([auto, cyl_no], axis='columns')

In [28]:
# Indicator columns for brand_category; drop_first=True omits the first category's column.
brand_cat = pd.get_dummies(data=auto['brand_category'], drop_first=True)

In [29]:
# Append the brand-category indicator columns to auto, side by side.
auto = pd.concat([auto, brand_cat], axis='columns')

In [30]:
# Indicator columns for engine-type; drop_first=True omits the first category's column.
eng_typ = pd.get_dummies(data=auto['engine-type'], drop_first=True)


In [31]:
# Append the engine-type indicator columns to auto, side by side.
auto = pd.concat([auto, eng_typ], axis='columns')

In [32]:
# Indicator columns for drive-wheels; drop_first=True omits the first category's column.
drwh = pd.get_dummies(data=auto['drive-wheels'], drop_first=True)

In [33]:
# Append the drive-wheels indicator columns to auto, side by side.
auto = pd.concat([auto, drwh], axis='columns')

In [34]:
# Indicator columns for body-style; drop_first=True omits the first category's column.
carb = pd.get_dummies(data=auto['body-style'], drop_first=True)

In [35]:
# Append the body-style indicator columns to auto, side by side.
auto = pd.concat([auto, carb], axis='columns')

In [36]:
# Indicator columns for aspiration; drop_first=True omits the first category's column.
asp = pd.get_dummies(data=auto['aspiration'], drop_first=True)

In [37]:
# Append the aspiration indicator columns to auto, side by side.
auto = pd.concat([auto, asp], axis='columns')

In [38]:
# Indicator columns for fuel-type; drop_first=True omits the first category's column.
fuelt = pd.get_dummies(data=auto['fuel-type'], drop_first=True)

In [39]:
# Append the fuel-type indicator columns to auto, side by side.
auto = pd.concat([auto, fuelt], axis='columns')

In [40]:
# Remove the original categorical columns now that their indicator columns are in auto.
auto = auto.drop(
    columns=[
        'fuel-type',
        'aspiration',
        'body-style',
        'drive-wheels',
        'engine-type',
        'num-of-cylinders',
        'brand_category',
    ]
)

In [41]:
# Preview the first five rows of the encoded frame.
auto.head(5)

Unnamed: 0,wheel-base,length,width,curb-weight,engine-size,bore,horsepower,price,mileage,five,...,ohc,rotor,fwd,rwd,hardtop,hatchback,sedan,wagon,turbo,gas
0,88.6,168.8,64.1,2548,130,3.47,111,13495,23.7,0,...,0,0,0,1,0,0,0,0,0,1
1,88.6,168.8,64.1,2548,130,3.47,111,16500,23.7,0,...,0,0,0,1,0,0,0,0,0,1
2,94.5,171.2,65.5,2823,152,2.68,154,16500,22.15,0,...,1,0,0,1,0,1,0,0,0,1
3,99.8,176.6,66.2,2337,109,3.19,102,13950,26.7,0,...,1,0,1,0,0,0,1,0,0,1
4,99.4,176.6,66.4,2824,136,3.19,115,17450,19.8,1,...,1,0,0,0,0,0,1,0,0,1


## Training Model

In [42]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge

In [43]:
# Regression target: the price column.
y = auto.loc[:, 'price']

In [44]:
# Feature matrix: every column of auto except the target, price.
# The axis is named through `columns=` because passing it positionally
# (`drop('price', 1)`) raises TypeError on pandas 2.0+, where every drop()
# argument other than `labels` is keyword-only.
X = auto.drop(columns='price')

In [79]:
# Hold out 30% of the rows for testing; the split is seeded so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [80]:
# Min-max scaler with its default output range, written out explicitly.
scaler = MinMaxScaler(feature_range=(0, 1))

In [81]:
# Learn the per-column min/max from the training features only, then scale them.
X_train_scaled = scaler.fit(X_train).transform(X_train)

In [82]:
# Scale the test features with the already-fitted scaler (transform only, no refit).
X_test_scaled=scaler.transform(X_test)

In [83]:
# Ridge regression with its default regularisation strength, fitted on the scaled training data.
lin = Ridge(alpha=1.0).fit(X_train_scaled, y_train)

In [84]:
# Score of the ridge model on the held-out test set (R^2 for regressors).
lin.score(X=X_test_scaled, y=y_test)

0.6969706081409262

In [85]:
# Ridge predictions for the scaled test features.
lin_pred = lin.predict(X=X_test_scaled)

In [86]:
from sklearn.metrics import r2_score

# R^2 of the ridge predictions against the true test prices.
r2_score(y_true=y_test, y_pred=lin_pred)

0.6969706081409262

In [93]:
from sklearn.ensemble import RandomForestRegressor

In [94]:
# Fit a random forest on the scaled training data.
# random_state is pinned (to the same seed used for the train/test split) so the
# bootstrap samples and feature draws, and therefore the fitted forest and its
# score, are the same on every run instead of changing with each execution.
rf = RandomForestRegressor(random_state=0).fit(X_train_scaled, y_train)

In [95]:
# Score of the random forest on the held-out test set (R^2 for regressors).
rf.score(X=X_test_scaled, y=y_test)

0.7352680007611659