# Fearure Selection

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LinearRegression

import warnings
warnings.filterwarnings("ignore")

In [10]:
cars = pd.read_csv("cars.csv")

In [11]:
cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,13495
1,3,?,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111,21,27,16500
2,1,?,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154,19,26,16500
3,2,164,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102,24,30,13950
4,2,164,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115,18,22,17450


# Cleaning

In [12]:
cars["normalized-losses"].replace("?",np.nan,inplace=True)
cars["normalized-losses"] = cars["normalized-losses"].astype("float")
cars["normalized-losses"].fillna(cars["normalized-losses"].mean(),inplace=True)

cars["horsepower"].replace("?",np.nan,inplace=True)
cars["horsepower"] = cars["horsepower"].astype("float")
cars["horsepower"].fillna(cars["horsepower"].mean(),inplace=True)

cars.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,body-style,drive-wheels,engine-location,width,height,engine-type,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,13495
1,3,122.0,alfa-romero,gas,convertible,rwd,front,64.1,48.8,dohc,130,111.0,21,27,16500
2,1,122.0,alfa-romero,gas,hatchback,rwd,front,65.5,52.4,ohcv,152,154.0,19,26,16500
3,2,164.0,audi,gas,sedan,fwd,front,66.2,54.3,ohc,109,102.0,24,30,13950
4,2,164.0,audi,gas,sedan,4wd,front,66.4,54.3,ohc,136,115.0,18,22,17450


In [15]:
cars_cat = cars.select_dtypes("object")

cars_num = cars.select_dtypes(["int64","float64"])

cars_num.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price
0,3,122.0,64.1,48.8,130,111.0,21,27,13495
1,3,122.0,64.1,48.8,130,111.0,21,27,16500
2,1,122.0,65.5,52.4,152,154.0,19,26,16500
3,2,164.0,66.2,54.3,109,102.0,24,30,13950
4,2,164.0,66.4,54.3,136,115.0,18,22,17450


In [16]:
cars_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,alfa-romero,gas,convertible,rwd,front,dohc
1,alfa-romero,gas,convertible,rwd,front,dohc
2,alfa-romero,gas,hatchback,rwd,front,ohcv
3,audi,gas,sedan,fwd,front,ohc
4,audi,gas,sedan,4wd,front,ohc


In [17]:
for col in cars_cat:
    le = LabelEncoder()
    cars_cat[col] = le.fit_transform(cars_cat[col])

In [18]:
cars_cat.head()

Unnamed: 0,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,0,1,0,2,0,0
1,0,1,0,2,0,0
2,0,1,2,2,0,5
3,1,1,3,1,0,3
4,1,1,3,0,0,3


In [19]:
df_new = pd.concat([cars_num,cars_cat],axis = 1)

In [20]:
df_new.head()

Unnamed: 0,symboling,normalized-losses,width,height,engine-size,horsepower,city-mpg,highway-mpg,price,make,fuel-type,body-style,drive-wheels,engine-location,engine-type
0,3,122.0,64.1,48.8,130,111.0,21,27,13495,0,1,0,2,0,0
1,3,122.0,64.1,48.8,130,111.0,21,27,16500,0,1,0,2,0,0
2,1,122.0,65.5,52.4,152,154.0,19,26,16500,0,1,2,2,0,5
3,2,164.0,66.2,54.3,109,102.0,24,30,13950,1,1,3,1,0,3
4,2,164.0,66.4,54.3,136,115.0,18,22,17450,1,1,3,0,0,3


# Baseline Model

In [21]:
x = df_new.drop("price",axis=1)
y = df_new["price"]

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [23]:
lin = LinearRegression()
lin.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [25]:
lin.score(x_test,y_test)

0.796556678039738

# Feature Selection Technique

<ol>
    <li>
    <ul>Filter Method
        <li>Correlation Coefficient</li>
        <li>Chi square test</li>
        <li>Anova test</li>
    </ul>
    </li>
    <li>
    <ul>Wrapper Method
        <li>Forward Selection</li>
        <li>Backward Selection</li>
    </ul>
    </li>
    <li>
    <ul>Embeded Method
        <li>Ridge and Lasso</li>
        <li>Decision Tree</li>
    </ul>
    </li>
    <li>
    <ul>Principal Componenet Analisys</ul>
    </li>
</ol>

# Chi square and Annova test

In [26]:
from sklearn.feature_selection import chi2
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

In [38]:
annova = SelectKBest(score_func = f_regression,k=6)

In [39]:
x_train_f = annova.fit_transform(x_train,y_train)

In [40]:
x_test_f = annova.transform(x_test)

In [41]:
annova.get_support()

array([False, False,  True, False,  True,  True,  True,  True, False,
       False, False,  True, False, False])

In [42]:
lin_annova = LinearRegression()

In [43]:
lin_annova.fit(x_train_f,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [44]:
lin_annova.score(x_test_f,y_test)

0.7982503502453706

In [45]:
# Chi2

In [52]:
#chi = SelectKBest(score_func = chi2,k=6)

# Wrapper Methods

In [53]:
#forward selection

In [54]:
columns = []

In [57]:
for col in x:
    columns.append(col)
    
    x_new = df_new[columns]
    
    x_train, x_test, y_train, y_test = train_test_split(x_new,y,test_size=0.3,random_state=1)
    
    lin = LinearRegression()
    lin.fit(x_train,y_train)
    
    score = lin.score(x_test,y_test)
    
    print("col:",col,"score: ",score)

col: symboling score:  -0.0017837050450488778
col: normalized-losses score:  0.041612508167990114
col: width score:  0.6171737569085428
col: height score:  0.6183283499764445
col: engine-size score:  0.7589560567217417
col: horsepower score:  0.7741546338326669
col: city-mpg score:  0.7768961441883282
col: highway-mpg score:  0.7777776382689499
col: make score:  0.7924572281432337
col: fuel-type score:  0.7948545416112247
col: body-style score:  0.7972954924014395
col: drive-wheels score:  0.8135463420259219
col: engine-location score:  0.7934456703438277
col: engine-type score:  0.7965566780397382


In [58]:
#Backword selection - Homework

# Principal component analysis

In [59]:
x = df_new.drop("price",axis=1)
y = df_new["price"]

x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [60]:
from sklearn.decomposition import PCA

In [68]:
pc = PCA(n_components = 12,random_state = 1)

In [69]:
x_train_pc = pc.fit_transform(x_train,y_train)

In [70]:
x_test_pc = pc.transform(x_test)

In [71]:
lin_pc = LinearRegression()

In [72]:
lin_pc.fit(x_train_pc,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [73]:
lin_pc.score(x_test_pc,y_test)

0.8152074680020286