In [1]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt

# Lasso regression applies an L1 penalty to the feature coefficients in order to reduce overfitting.
# The penalty can shrink the coefficients of some features all the way to zero, so in essence those features are multiplied by zero,
# which means that they contribute nothing to the prediction.
from sklearn.linear_model import Lasso

# To select features for the model (here it is lasso regression)
from sklearn.feature_selection import SelectFromModel


In [2]:
# Remove the cap on displayed columns so wide DataFrames are shown in full.
pd.options.display.max_columns = None

# Importing required data

In [3]:
# Read xtrain.csv and xtest.csv (in that order) into DataFrames.
X_train, X_test = (pd.read_csv(csv_path) for csv_path in ('xtrain.csv', 'xtest.csv'))

# Preview the first rows of the training frame.
X_train.head()

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15,month,day,year,month_name
0,0.121212,0.1875,0.394705,0.325923,0.0,0.0,0.0,0.5,0.5,0.491606,0.051867,0.565217,0.0,0.307692,0.857739,0.201163,0.589897,1.0,0.363636,0.933333,0.0,0.818182
1,0.090909,0.1875,0.405596,0.327905,0.0,0.0,0.0,0.5,0.5,0.562964,0.0,0.530435,0.0,0.307692,0.986239,0.183278,0.544952,0.0,0.181818,0.533333,1.0,0.545455
2,0.151515,0.34375,0.582438,0.289762,0.4,0.0,0.0,0.5,0.666667,0.744138,0.0,0.913043,0.0,0.807692,0.823433,0.424382,0.792008,0.0,0.636364,0.333333,0.0,0.636364
3,0.060606,0.125,0.322738,0.256842,0.0,0.0,0.0,0.75,0.5,0.466276,0.0,0.078261,0.0,0.596154,0.539395,0.227139,0.594254,0.0,0.727273,0.833333,0.0,0.363636
4,0.060606,0.125,0.365339,0.582515,0.0,0.0,0.0,0.5,0.5,0.517006,0.0,0.53913,0.0,0.903846,0.583825,0.464978,0.799262,0.0,0.545455,0.3,0.0,0.727273


In [4]:
# Read ytrain.csv and ytest.csv (in that order) into DataFrames.
y_train, y_test = (pd.read_csv(csv_path) for csv_path in ('ytrain.csv', 'ytest.csv'))

# Preview the first rows of the training target.
y_train.head()

Unnamed: 0,price
0,12.89922
1,12.97154
2,13.487006
3,12.880292
4,13.132314


# Feature Selection

In [5]:
# A good alpha is normally found through cross-validation or by trying several candidate values.
# Here the value given directly by the course (0.001) is used.
sel_ = SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))

# Fit the selector, and the Lasso it wraps, on the training data.
sel_.fit(X_train, y_train)

SelectFromModel(estimator=Lasso(alpha=0.001, random_state=0))

In [6]:
# Number of selected features: count of True entries in the selector's support mask.
np.sum(sel_.get_support())

19

# To see how many features are selected

In [7]:
# Use the selector's boolean support mask to pick the column names that were kept.
selected_features = X_train.columns[sel_.get_support()]

# Count the coefficients of the fitted estimator that are exactly zero.
zero_coef_count = (sel_.estimator_.coef_ == 0).sum()

print('Total number of features: ', X_train.shape[1])
print('Selected features: ', len(selected_features))
print('Features with co-efficients 0 (Lasso Made them to 0): ', zero_coef_count)



Total number of features:  22
Selected features:  19
Features with co-efficients 0 (Lasso Made them to 0):  3


In [8]:
# Display the column names held in selected_features.
selected_features

Index(['bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view',
       'condition', 'grade', 'sqft_above', 'yr_built', 'yr_renovated',
       'zipcode', 'lat', 'long', 'sqft_living15', 'sqft_lot15', 'day', 'year',
       'month_name'],
      dtype='object')

# Saving the Selected Features in the csv file

In [9]:
# Wrap the selected column names in a Series and write them to CSV without the index.
features_to_save = pd.Series(selected_features)
features_to_save.to_csv('selected_features.csv', index=False)