In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
dataset = pd.read_csv('D:\Data Science\Practise\House Price.csv')
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
feature_scale = [feature for feature in dataset.columns if feature not in ['Id', 'SalePrice']]
feature_scale

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'Enc

In [5]:
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtype=='O']

for feature in categorical_features:
    temp=dataset.groupby(feature)['SalePrice'].count()/len(dataset)
    temp_df=temp[temp>0.01].index
    dataset[feature]=np.where(dataset[feature].isin(temp_df),dataset[feature],'Rare_var')
    
for feature in categorical_features:
    labels_ordered=dataset.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    dataset[feature]=dataset[feature].map(labels_ordered)

In [6]:
scaling_feature=[feature for feature in dataset.columns if feature not in ['Id','SalePerice'] ]
len(scaling_feature)

80

In [7]:
dataset.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [8]:
dataset.dropna(inplace = True)

## Feature Scaling

In [9]:
feature_scale=[feature for feature in dataset.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[feature_scale])

MinMaxScaler()

In [10]:
scaler.transform(dataset[feature_scale])

array([[0.23529412, 0.75      , 0.15068493, ..., 0.5       , 0.66666667,
        0.75      ],
       [0.        , 0.75      , 0.20205479, ..., 0.25      , 0.66666667,
        0.75      ],
       [0.23529412, 0.75      , 0.1609589 , ..., 0.5       , 0.66666667,
        0.75      ],
       ...,
       [0.29411765, 0.75      , 0.15410959, ..., 1.        , 0.66666667,
        0.75      ],
       [0.        , 0.75      , 0.1609589 , ..., 1.        , 0.66666667,
        0.75      ],
       [0.        , 0.75      , 0.18493151, ..., 0.5       , 0.66666667,
        0.75      ]])

In [11]:
# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([dataset[['Id', 'SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(dataset[feature_scale]), columns=feature_scale)],
                    axis=1)

In [12]:
data_copy = dataset.copy()

In [13]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,3,65.0,8450,1,2,0,1,1,...,0,0,3,1,0,2,2008,2,3,208500
1,2,20,3,80.0,9600,1,2,0,1,1,...,0,0,3,1,0,5,2007,2,3,181500
2,3,60,3,68.0,11250,1,2,1,1,1,...,0,0,3,1,0,9,2008,2,3,223500
3,4,70,3,60.0,9550,1,2,1,1,1,...,0,0,3,1,0,2,2006,2,0,140000
4,5,60,3,84.0,14260,1,2,1,1,1,...,0,0,3,1,0,12,2008,2,3,250000


In [14]:
X_train = dataset.drop(['Id'], axis = 1)

In [15]:
y_train = X_train.pop('SalePrice')

In [16]:
X_train.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,2,0,1,1,0,...,0,0,0,3,1,0,2,2008,2,3
1,20,3,80.0,9600,1,2,0,1,1,1,...,0,0,0,3,1,0,5,2007,2,3
2,60,3,68.0,11250,1,2,1,1,1,0,...,0,0,0,3,1,0,9,2008,2,3
3,70,3,60.0,9550,1,2,1,1,1,2,...,0,0,0,3,1,0,2,2006,2,0
4,60,3,84.0,14260,1,2,1,1,1,1,...,0,0,0,3,1,0,12,2008,2,3


In [17]:
y_train.head()

0    208500
1    181500
2    223500
3    140000
4    250000
Name: SalePrice, dtype: int64

In [18]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [19]:
feature_sel_model = SelectFromModel(Lasso(alpha = 0.05, random_state = 0))
feature_sel_model.fit(X_train, y_train)

  model = cd_fast.enet_coordinate_descent(


SelectFromModel(estimator=Lasso(alpha=0.05, random_state=0))

In [20]:
feature_sel_model.get_support()

array([ True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True, False,
        True,  True,  True,  True,  True,  True,  True])

In [21]:
print(X_train.columns[feature_sel_model.get_support()])

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood',
       'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
       'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
       'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'O

In [22]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

In [24]:
RFE_selector = RFE(LogisticRegression(), n_features_to_select=10, step=10, verbose=0)
RFE_selector.fit(X_train, y_train)
X_train.columns[RFE_selector.get_support()]

X_train

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,60,3,65.0,8450,1,2,0,1,1,0,...,0,0,0,3,1,0,2,2008,2,3
1,20,3,80.0,9600,1,2,0,1,1,1,...,0,0,0,3,1,0,5,2007,2,3
2,60,3,68.0,11250,1,2,1,1,1,0,...,0,0,0,3,1,0,9,2008,2,3
3,70,3,60.0,9550,1,2,1,1,1,2,...,0,0,0,3,1,0,2,2006,2,0
4,60,3,84.0,14260,1,2,1,1,1,1,...,0,0,0,3,1,0,12,2008,2,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,60,3,62.0,7917,1,2,0,1,1,0,...,0,0,0,3,1,0,8,2007,2,3
1456,20,3,85.0,13175,1,2,0,1,1,0,...,0,0,0,1,1,0,2,2010,2,3
1457,70,3,66.0,9042,1,2,0,1,1,0,...,0,0,0,2,0,2500,5,2010,2,3
1458,20,3,68.0,9717,1,2,0,1,1,0,...,0,0,0,3,1,0,4,2010,2,3


In [27]:
X_train.columns[RFE_selector.get_support()]

Index(['LotArea', 'YearBuilt', 'YearRemodAdd', 'BsmtFinSF1', 'BsmtUnfSF',
       'TotalBsmtSF', '2ndFlrSF', 'GrLivArea', 'GarageYrBlt', 'YrSold'],
      dtype='object')