### Scaling Features: After feature engineering, I decided to scale the features into z-score to normalize, because the unit of each feature is different. After scaling features, I will conduct feature selection by RFECV and then RFE, and then I will inverse the scaling.

### However, to calculate the z-score, I need mean and standard deviation, so I need to remove the NaN from target variable, and impute means to the NaNs in other features.

### Pulling my data with engineered features

### Start by retrieving my data with engineered features in the previous step.

In [1]:
import pandas as pd

engineered_df = pd.read_csv('https://raw.githubusercontent.com/mhan1/Data-Science/master/Machine%20Learning_Linear%20Regression%20project_Minyeong%20Han_Data%20Science/engineered_features.csv')
engineered_df.head(3)

Unnamed: 0.1,Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,0,0,0,0,0,0,0,1,0,
1,1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,...,0,0,0,0,0,0,0,1,0,
2,2,1,2.0,94.5,171.2,65.5,52.4,2823,6,152,...,0,0,1,0,0,0,0,1,0,


In [2]:
engineered_df.columns

Index(['Unnamed: 0', 'symboling', 'num_of_doors', 'wheel-base', 'length',
       'width', 'height', 'curb_weight', 'num_of_cylinders', 'engine_size',
       'bore', 'stroke', 'compression_ratio', 'horsepower', 'peak_rpm',
       'city_mpg', 'highway_mpg', 'price', 'engine_location_is_front',
       'fuel_type_is_gas', 'aspiration_is_std', 'make_honda', 'make_mazda',
       'make_mitsubishi', 'make_nissan', 'make_other', 'make_peugot',
       'make_subaru', 'make_toyota', 'make_volkswagen', 'make_volvo',
       'body_style_hatchback', 'body_style_other', 'body_style_sedan',
       'body_style_wagon', 'drive_wheels_fwd', 'drive_wheels_other',
       'drive_wheels_rwd', 'engine_type_dohc', 'engine_type_l',
       'engine_type_ohc', 'engine_type_ohcf', 'engine_type_ohcv',
       'engine_type_other', 'fuel_system_1bbl', 'fuel_system_2bbl',
       'fuel_system_idi', 'fuel_system_mpfi', 'fuel_system_other',
       'normalized_losses'],
      dtype='object')

In [3]:
#removing unnecessary column 
engineered_df.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
engineered_df[0:3]

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,
1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,
2,1,2.0,94.5,171.2,65.5,52.4,2823,6,152,2.68,...,0,0,1,0,0,0,0,1,0,


### Cleaning missing values in the target variable

In [5]:
# checking if there is null value in each column
engineered_df.isnull().any(axis=0)

symboling                   False
num_of_doors                 True
wheel-base                  False
length                      False
width                       False
height                      False
curb_weight                 False
num_of_cylinders            False
engine_size                 False
bore                         True
stroke                       True
compression_ratio           False
horsepower                   True
peak_rpm                     True
city_mpg                    False
highway_mpg                 False
price                        True
engine_location_is_front    False
fuel_type_is_gas            False
aspiration_is_std           False
make_honda                  False
make_mazda                  False
make_mitsubishi             False
make_nissan                 False
make_other                  False
make_peugot                 False
make_subaru                 False
make_toyota                 False
make_volkswagen             False
make_volvo    

In [6]:
# looking for na values.
def nas_sorted(df):
    return df.isnull().sum().sort_values(ascending=False)

In [7]:
sorted_nas = nas_sorted(engineered_df)
sorted_nas.head(10)

normalized_losses    41
price                 4
bore                  4
stroke                4
peak_rpm              2
num_of_doors          2
horsepower            2
make_mazda            0
make_honda            0
aspiration_is_std     0
dtype: int64

In [8]:
# returning only the columns that has NaNs.
def some_nans(df):
    some_nans_bools = pd.isnull(df).any()
    return some_nans_bools.index[some_nans_bools]

In [9]:
some_nans(engineered_df)

Index(['num_of_doors', 'bore', 'stroke', 'horsepower', 'peak_rpm', 'price',
       'normalized_losses'],
      dtype='object')

In [10]:
#imputing means to the NaN values in each column that has NaNs.
def impute_means(df):
    nan_cols = some_nans(df)
    col_means = df[nan_cols].mean()
    imputed_df = df.fillna(col_means)
    return imputed_df

In [11]:
imputed_df = impute_means(engineered_df)
imputed_df.head()

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,122.0
1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,122.0
2,1,2.0,94.5,171.2,65.5,52.4,2823,6,152,2.68,...,0,0,1,0,0,0,0,1,0,122.0
3,2,4.0,99.8,176.6,66.2,54.3,2337,4,109,3.19,...,1,0,0,0,0,0,0,1,0,164.0
4,2,4.0,99.4,176.6,66.4,54.3,2824,5,136,3.19,...,1,0,0,0,0,0,0,1,0,164.0


In [12]:
imputed_df.head().T

Unnamed: 0,0,1,2,3,4
symboling,3.0,3.0,1.0,2.0,2.0
num_of_doors,2.0,2.0,2.0,4.0,4.0
wheel-base,88.6,88.6,94.5,99.8,99.4
length,168.8,168.8,171.2,176.6,176.6
width,64.1,64.1,65.5,66.2,66.4
height,48.8,48.8,52.4,54.3,54.3
curb_weight,2548.0,2548.0,2823.0,2337.0,2824.0
num_of_cylinders,4.0,4.0,6.0,4.0,5.0
engine_size,130.0,130.0,152.0,109.0,136.0
bore,3.47,3.47,2.68,3.19,3.19


In [13]:
imputed_df.isnull().values.any()

False

### Now, imputation is complete. I am ready to scale the features into z-score. Before scaling the features,  I will export the imputed_df as csv file for later use.

In [14]:
#exporting the dataframe with engineered & imputed features as csv file for later use.
imputed_df.to_csv(r'C:\Users\Minyeong\Desktop\imputed_engineered_features.csv')

In [15]:
y = imputed_df.normalized_losses
X = imputed_df.drop(columns='normalized_losses')

In [16]:
X[0:2]

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other
0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,0,1,0
1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,0,1,0


In [17]:
y.head(3)

0    122.0
1    122.0
2    122.0
Name: normalized_losses, dtype: float64

What we really want to know when judging feature importances is the following: 

* How much does our dependent variable change given an expected amount of movement in the feature.

### Reviewing the Z-score

Here's the formula for translating each of our feature variables into their standard deviation from the average.

$z = \frac{X - \hat{X}}{\sigma}$

In [18]:
from scipy.stats import zscore

In [19]:
scaled_df = pd.DataFrame(zscore(imputed_df, axis=0), columns = imputed_df.columns)
scaled_df.head()

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,1.74347,-1.13733,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,-0.352887,0.074449,0.519089,...,-1.611363,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,0.0
1,1.74347,-1.13733,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,-0.352887,0.074449,0.519089,...,-1.611363,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,0.0
2,0.133509,-1.13733,-0.708596,-0.231513,-0.190566,-0.543527,0.514882,1.502032,0.604046,-2.404862,...,-1.611363,-0.280976,3.843076,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,0.0
3,0.93849,0.887915,0.173698,0.207256,0.136542,0.235942,-0.420797,-0.352887,-0.431076,-0.517248,...,0.620593,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,1.328961
4,0.93849,0.887915,0.10711,0.207256,0.230001,0.235942,0.516807,0.574572,0.218885,-0.517248,...,0.620593,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,1.328961


### Using SKLearn

The Sklearn library has it's own method for changing each of our feature variables into their respective Z-scores.

In [20]:
from sklearn.preprocessing import StandardScaler

In [21]:
scaler = StandardScaler()

In [22]:
scaler.fit(imputed_df)

  return self.partial_fit(X, y)


StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
imputed_df[0:2]

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,122.0
1,3,2.0,88.6,168.8,64.1,48.8,2548,4,130,3.47,...,0,0,0,0,0,0,0,1,0,122.0


In [24]:
scaled_data = scaler.transform(imputed_df)

  """Entry point for launching an IPython kernel.


In [25]:
scaled_data = scaler.fit_transform(imputed_df)
scaled_data

  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


array([[ 1.74347043, -1.13732958, -1.6907718 , ...,  1.08666971,
        -0.27073683,  0.        ],
       [ 1.74347043, -1.13732958, -1.6907718 , ...,  1.08666971,
        -0.27073683,  0.        ],
       [ 0.133509  , -1.13732958, -0.70859588, ...,  1.08666971,
        -0.27073683,  0.        ],
       ...,
       [-1.47645244,  0.8879152 ,  1.72187336, ...,  1.08666971,
        -0.27073683, -0.85433189],
       [-1.47645244,  0.8879152 ,  1.72187336, ..., -0.92024282,
        -0.27073683, -0.85433189],
       [-1.47645244,  0.8879152 ,  1.72187336, ...,  1.08666971,
        -0.27073683, -0.85433189]])

In [26]:
scaled_df = pd.DataFrame(scaled_data, columns = imputed_df.columns)
scaled_df.head()

Unnamed: 0,symboling,num_of_doors,wheel-base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,...,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_other,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mpfi,fuel_system_other,normalized_losses
0,1.74347,-1.13733,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,-0.352887,0.074449,0.519089,...,-1.611363,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,0.0
1,1.74347,-1.13733,-1.690772,-0.426521,-0.844782,-2.020417,-0.014566,-0.352887,0.074449,0.519089,...,-1.611363,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,0.0
2,0.133509,-1.13733,-0.708596,-0.231513,-0.190566,-0.543527,0.514882,1.502032,0.604046,-2.404862,...,-1.611363,-0.280976,3.843076,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,0.0
3,0.93849,0.887915,0.173698,0.207256,0.136542,0.235942,-0.420797,-0.352887,-0.431076,-0.517248,...,0.620593,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,1.328961
4,0.93849,0.887915,0.10711,0.207256,0.230001,0.235942,0.516807,0.574572,0.218885,-0.517248,...,0.620593,-0.280976,-0.260208,-0.158114,-0.23812,-0.689072,-0.328798,1.08667,-0.270737,1.328961


In [27]:
#exporting the dataframe with scaled_engineered & imputed features as csv file for later use.
scaled_df.to_csv(r'C:\Users\Minyeong\Desktop\scaled_imputed_engineered_features.csv')

In [28]:
#inversing the scaled_features into original values.
scaler.inverse_transform(scaled_data)

array([[  3. ,   2. ,  88.6, ...,   1. ,   0. , 122. ],
       [  3. ,   2. ,  88.6, ...,   1. ,   0. , 122. ],
       [  1. ,   2. ,  94.5, ...,   1. ,   0. , 122. ],
       ...,
       [ -1. ,   4. , 109.1, ...,   1. ,   0. ,  95. ],
       [ -1. ,   4. , 109.1, ...,   0. ,   0. ,  95. ],
       [ -1. ,   4. , 109.1, ...,   1. ,   0. ,  95. ]])

Now we can model with our scaled data.

In [29]:
scaled_X = scaled_data[:, :-1]
scaled_y = scaled_data[:, -1]

In [30]:
scaled_X[0:2]

array([[ 1.74347043, -1.13732958, -1.6907718 , -0.42652147, -0.84478235,
        -2.0204173 , -0.01456628, -0.35288699,  0.07444893,  0.51908935,
        -1.83940375, -0.28834891,  0.17106493, -0.263484  , -0.64655303,
        -0.54605874,  0.03667351,  0.12186667,  0.32879797,  0.46929532,
        -0.26020825, -0.30070838, -0.26020825, -0.31025261,  1.45122728,
        -0.23811978, -0.24935149, -0.43008266, -0.24935149, -0.23811978,
        -0.7200823 ,  3.69362385, -0.93847426, -0.372678  , -1.18817705,
        -0.21428571,  1.30283093,  4.01040314, -0.24935149, -1.61136316,
        -0.28097574, -0.26020825, -0.15811388, -0.23811978, -0.68907194,
        -0.32879797,  1.08666971, -0.27073683],
       [ 1.74347043, -1.13732958, -1.6907718 , -0.42652147, -0.84478235,
        -2.0204173 , -0.01456628, -0.35288699,  0.07444893,  0.51908935,
        -1.83940375, -0.28834891,  0.17106493, -0.263484  , -0.64655303,
        -0.54605874,  0.41949787,  0.12186667,  0.32879797,  0.46929532,
   

In [31]:
scaled_y[0:10]

array([0.        , 0.        , 0.        , 1.32896072, 1.32896072,
       0.        , 1.13910919, 0.        , 1.13910919, 0.        ])

In [32]:
from sklearn.linear_model import LinearRegression

from sklearn.model_selection import train_test_split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(scaled_X, scaled_y, test_size=0.2, random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [34]:
model = LinearRegression()
model.fit(X_train, y_train)
model.coef_

array([ 3.59610504e-01, -7.67566466e-02,  4.60025492e-02,  3.51770709e-02,
        3.16184419e-02, -1.42809313e-01, -1.67268342e-01,  1.31037020e-01,
        2.57737356e-01, -1.18245825e-01,  1.13887399e-01, -1.34008296e-01,
       -4.66319152e-01,  3.58741398e-01, -7.72455406e-01,  6.61365228e-01,
        3.75436826e-02,  3.76949048e+12,  8.13023006e+12, -1.37440285e-01,
       -5.08611488e+11, -5.75529064e+11, -5.08611488e+11, -5.90637382e+11,
       -9.75086722e+11,  1.52192181e+13, -7.85904879e+12, -7.57463858e+11,
       -4.89929006e+11, -4.70284952e+11, -2.10514079e+09, -1.11981413e+09,
       -2.21520594e+09, -1.45268516e+09, -1.16811404e+13, -4.85774202e+12,
       -1.14521442e+13, -3.77207589e+12, -2.01169379e+13, -7.19912029e+12,
        3.99023882e+12, -3.91591661e+12, -2.47862682e+12, -2.10372242e+11,
       -4.36184869e+11,  7.85322220e+12, -4.65175298e+11, -2.35489707e+11])

In [35]:
model.score(X_test, y_test)

-1.8189210223752728e+26

In [36]:
model.score(X_val, y_val)

0.2927849668439543

### I will detect the outliers from imputed features as below.

In [37]:
imputed_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
symboling,205.0,0.834146,1.245307,-2.0,0.0,1.0,2.0,3.0
num_of_doors,205.0,3.123153,0.989952,2.0,2.0,4.0,4.0,4.0
wheel-base,205.0,98.756585,6.021776,86.6,94.5,97.0,102.4,120.9
length,205.0,174.049268,12.337289,141.1,166.3,173.2,183.1,208.1
width,205.0,65.907805,2.145204,60.3,64.1,65.5,66.9,72.3
height,205.0,53.724878,2.443522,47.8,52.0,54.1,55.5,59.8
curb_weight,205.0,2555.565854,520.680204,1488.0,2145.0,2414.0,2935.0,4066.0
num_of_cylinders,205.0,4.380488,1.080854,2.0,4.0,4.0,4.0,12.0
engine_size,205.0,126.907317,41.642693,61.0,97.0,120.0,141.0,326.0
bore,205.0,3.329751,0.270844,2.54,3.15,3.31,3.58,3.94


In [38]:
from scipy import stats

In [39]:
import numpy as np

def percentiles(column):
    z_scores = stats.zscore(column)
    # z_score: segment based on number of standard deviations away from the mean     
    hist, bin_edges = np.histogram(z_scores, bins=np.arange(-3, 4, 1), density=True)
    return np.stack((hist, bin_edges[1:]))

In [40]:
percentiles(imputed_df['normalized_losses'])

array([[ 0.        ,  0.13300493,  0.31034483,  0.39901478,  0.11330049,
         0.04433498],
       [-2.        , -1.        ,  0.        ,  1.        ,  2.        ,
         3.        ]])

In [41]:
import numpy as np

def too_many_outliers(column, threshold = .05):
    #  expected .021 if normal distribution
    z_less_neg_two = percentiles(column)[0, 0]
    z_gt_two = percentiles(column)[0, -1]
    if z_less_neg_two > threshold or z_gt_two > threshold:
        return np.hstack((column.name, z_less_neg_two, z_gt_two))
    else:
        print("False")

In [42]:
too_many_outliers(imputed_df['normalized_losses'])

False


In [43]:
def outlier_columns(df, threshold = .05):
    numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
    outlier_columns = np.array([too_many_outliers(df[column]) for column in numeric_columns])
    return np.array([column for column in outlier_columns if column is not None])

In [44]:
outlier_columns(imputed_df)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


array([['width', '0.004878048780487805', '0.06829268292682927'],
       ['compression_ratio', '0.0', '0.05612244897959184'],
       ['aspiration_is_std', '0.18048780487804877', '0.0'],
       ['make_toyota', '0.0', '0.15609756097560976'],
       ['body_style_wagon', '0.0', '0.12195121951219512']], dtype='<U32')

In [45]:
def select_outliers(column, upper_tail = True):
    if upper_tail:
        return column[stats.zscore(column) > 2]
    else:
        return column[stats.zscore(column) < -2]

In [46]:
select_outliers(imputed_df['normalized_losses'])

10     192.0
11     192.0
12     188.0
13     188.0
104    194.0
105    194.0
106    231.0
125    186.0
178    197.0
179    197.0
190    256.0
Name: normalized_losses, dtype: float64

In [47]:
select_outliers(imputed_df['normalized_losses']).value_counts().sum()

11

In [48]:
len(imputed_df['normalized_losses'])

205

In [49]:
select_outliers(imputed_df['normalized_losses']).value_counts().sum() / len(imputed_df['normalized_losses'])

0.05365853658536585

### For the standard normal distribution, P(-1.96 < Z < 1.96) = 0.95, i.e., there is a 95% probability that a standard normal variable, Z, will fall between -1.96 and 1.96. If desired confidence interval is 99%, P(-2.576 <Z < 2.576) = 0.99
### Based on above, my dataset has 11 outliers (outside the ~95%) in the target variable, 'normalized_losses'.
### I will go ahead to the next step, feature selection by RFECV and RFE. 