In [2]:
## Basic imports to do Data Science
import numpy as np
import pandas as pd

## Import to preprocess the data
from sklearn.preprocessing import MinMaxScaler
from sklearn import preprocessing

## Import for spliting the data for train and test
from sklearn.model_selection import train_test_split

In [56]:
# The raw file ships without a header row, so the 26 column labels are
# listed here and attached right after loading.
column_names = [
    'symboling', 'normalized-losses', 'make', 'fuel-type',
    'aspiration', 'num-of-doors', 'body-style', 'drive-wheels',
    'engine-location', 'wheel-base', 'length', 'width', 'heigth',
    'curb-weigth', 'engine-size', 'engine-type', 'num-of-cylinders',
    'fuel-system', 'bore', 'stroke', 'compression-ratio',
    'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg', 'price',
]

df = pd.read_csv('imports-85.data', header=None)
df.columns = column_names
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [61]:
## Remove the rows with missing data

# The dataset marks missing values with the string '?'. Turn those markers
# into real NaNs and drop every row that contains at least one of them.
# `np.nan` is the canonical spelling; the legacy upper-case `np.NAN` alias is
# discouraged by the NumPy docs and is not guaranteed to exist on NumPy 2.x.
# Rebinding `df` (instead of mutating it in place) keeps this cell safe to
# re-run.
df = df.replace('?', np.nan).dropna()

# Cast the numeric columns to float so the categorical encoding and the
# normalization steps treat them as numbers. Presumably these were parsed as
# text because of the '?' markers -- confirm against the raw file.
numeric_cols = ['normalized-losses', 'bore', 'stroke',
                'horsepower', 'peak-rpm', 'price']
df = df.astype({col: float for col in numeric_cols})


In [65]:
## One-hot-encoder

def processCategorical(df,column):
    """One-hot encode a single column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains the column to encode. It is NOT modified; a new
        frame is returned instead.
    column : pandas.Series
        The column to encode. Its ``name`` must be a column label of ``df``.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column``, followed by one indicator column per
        distinct value of ``column`` (named after the value itself).
    """
    # Create dummies columns
    dummies = pd.get_dummies(column)
    # Build the result from a copy without the original column, so the
    # caller's frame is left untouched (no in-place drop).
    remaining = df.drop(columns=column.name)
    return pd.concat([remaining, dummies], axis=1)

## Encode the categorical columns

# Categorical columns are the ones pandas stores with object dtype. The list
# is taken once, before the loop, because the loop rebinds `df` and appends
# new indicator columns to it.
categorical_cols = [col for col in df.columns if df[col].dtype == 'O']

for col in categorical_cols:
    levels = df[col].unique()
    if len(levels) == 2:
        # Binary column: the first value seen becomes 0, the other one 1.
        # An explicit map builds the integer column directly instead of
        # relying on `replace` to downcast an object column, which recent
        # pandas versions deprecate.
        df[col] = df[col].map({levels[0]: 0, levels[1]: 1})
    else:
        # Any other number of distinct values: one indicator column per value.
        df = processCategorical(df, df[col])

In [66]:
# Preview the first rows of the frame after the categorical encoding step.
df.head()

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,wheel-base,length,width,heigth,curb-weigth,...,five,four,six,three,1bbl,2bbl,idi,mfi,mpfi,spdi
3,2,164.0,0,0,0,99.8,176.6,66.2,54.3,2337,...,0,1,0,0,0,0,0,0,1,0
4,2,164.0,0,0,0,99.4,176.6,66.4,54.3,2824,...,1,0,0,0,0,0,0,0,1,0
6,1,158.0,0,0,0,105.8,192.7,71.4,55.7,2844,...,1,0,0,0,0,0,0,0,1,0
8,1,158.0,0,1,0,105.8,192.7,71.4,55.9,3086,...,1,0,0,0,0,0,0,0,1,0
10,2,192.0,0,0,1,101.2,176.8,64.8,54.3,2395,...,0,1,0,0,0,0,0,0,1,0


In [93]:
#MinMaxScaler

# Cast to float64 to avoid data-type warnings from the scaler.
# DataFrame.astype returns a new frame, so the result has to be kept and
# passed on; a bare `df.astype(...)` statement is silently discarded.
df_float = df.astype('float64')

# NOTE(review): the scaler is fitted on every row, 'price' target included,
# before the train/test split further down. Test-set statistics therefore
# leak into training; consider fitting on the training split only.
scaler = MinMaxScaler()
scaled_df = pd.DataFrame(scaler.fit_transform(df_float),
                         columns=df_float.columns)
scaled_df.head()

Unnamed: 0,symboling,normalized-losses,fuel-type,aspiration,num-of-doors,wheel-base,length,width,heigth,curb-weigth,...,five,four,six,three,1bbl,2bbl,idi,mfi,mpfi,spdi
0,0.8,0.518325,0.0,0.0,0.0,0.455172,0.577236,0.517544,0.471154,0.329325,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.8,0.518325,0.0,0.0,0.0,0.441379,0.577236,0.535088,0.471154,0.518231,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.6,0.486911,0.0,0.0,0.0,0.662069,0.839024,0.973684,0.605769,0.525989,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.6,0.486911,0.0,1.0,0.0,0.662069,0.839024,0.973684,0.625,0.61986,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.8,0.664921,0.0,0.0,1.0,0.503448,0.580488,0.394737,0.471154,0.351823,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [75]:
# Separate the input features from the regression target ('price')

data = scaled_df.drop(columns='price')
target = scaled_df['price']


In [76]:
## Hold out a test set; the fixed random_state keeps the split reproducible

X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [89]:
## Model 1: ridge regression (RidgeCV)

from sklearn.linear_model import RidgeCV

# fit() hands back the fitted estimator, so creation and training are chained
model1 = RidgeCV().fit(X_train, y_train)

# Score the fitted model on the held-out split
score1 = model1.score(X_test, y_test)
print("Score for model 1: ", score1)

Score for model 1:  0.8872558907251997


In [88]:
# Model 2: gradient boosting regressor

from sklearn.ensemble import GradientBoostingRegressor

# The fixed random_state makes the boosted trees reproducible between runs;
# fit() hands back the fitted estimator, so creation and training are chained
model2 = GradientBoostingRegressor(random_state=0).fit(X_train, y_train)

# Score the fitted model on the held-out split
score2 = model2.score(X_test, y_test)
print("Score for model 2:", score2)

Score for model 2: 0.8487955281522919


In [86]:
# Rank the models by their score on the test split, best first. Sorting
# makes the printout an actual ranking instead of a fixed listing order;
# the sort is stable, so tied models keep their original order.
model_scores = [("model 1", score1), ("model 2", score2)]
ranked_scores = sorted(model_scores, key=lambda pair: pair[1], reverse=True)

print('Ranking of performance of the models:')
print("")
for name, score in ranked_scores:
    print(name + ":", score)

Ranking of performance of the models:

model 1: 0.8872558907251997
model 2: 0.8487955281522919
