In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
data = pd.read_csv("D:\house prd/train.csv")
data.head(50)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


In [3]:
#always remember there way always be a chance of data leakage so we need to split the data 
#first and then apply feature

from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(data, data['SalePrice'] , test_size = 0.1 , random_state = 0)

In [4]:
x_train.shape , x_test.shape

((1314, 81), (146, 81))

In [5]:
#all nan values of categorical feature
feature_nan = [feature for feature in data.columns if data[feature].isnull().sum()>1 and data[feature].dtypes == 'O']

for feature in feature_nan:
    
    print(f"{feature}: {data[feature].isnull().mean()}% values")

Alley: 0.9376712328767123% values
MasVnrType: 0.005479452054794521% values
BsmtQual: 0.025342465753424658% values
BsmtCond: 0.025342465753424658% values
BsmtExposure: 0.026027397260273973% values
BsmtFinType1: 0.025342465753424658% values
BsmtFinType2: 0.026027397260273973% values
FireplaceQu: 0.4726027397260274% values
GarageType: 0.05547945205479452% values
GarageFinish: 0.05547945205479452% values
GarageQual: 0.05547945205479452% values
GarageCond: 0.05547945205479452% values
PoolQC: 0.9952054794520548% values
Fence: 0.8075342465753425% values
MiscFeature: 0.963013698630137% values


In [6]:
def replace_cat_feature(data , feature_nam):
    data1 = data.copy()
    data1[feature_nan] = data1[feature_nan].fillna('Missing')
    return data1

data = replace_cat_feature(data , feature_nan)

In [7]:
data[feature_nan].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [8]:
numerical_nan = [feature for feature in data.columns if data[feature].isnull().sum()>1 and data[feature].dtypes != 'O']

for feature in numerical_nan:
    print(f"{feature} {data[feature].isnull().mean()}% values")

LotFrontage 0.1773972602739726% values
MasVnrArea 0.005479452054794521% values
GarageYrBlt 0.05547945205479452% values


In [9]:
for feature in numerical_nan:
    median_value = data[feature].median()
    
    data[feature+'nan'] = np.where(data[feature].isnull() , 1 , 0)
    data[feature].fillna(median_value , inplace = True)

In [10]:
data[numerical_nan].isnull().sum()

LotFrontage    0
MasVnrArea     0
GarageYrBlt    0
dtype: int64

In [11]:
#temporal variable
for feature in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    data[feature] = data['YrSold'] - data[feature]

In [12]:
data[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']].head()

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt
0,5,5,5.0
1,31,31,31.0
2,7,6,7.0
3,91,36,8.0
4,8,8,8.0


In [13]:
#since numerical variable are skewed or not in form of gaussian distribution so
num_feature = ['LotFrontage', 'LotArea','1stFlrSF','GrLivArea','SalePrice']
for feature in num_feature:
    data[feature] = np.log(data[feature])

In [14]:
#now handle categorical feature
categorical_feature = [feature for feature in data.columns if data[feature].dtypes == 'O']
for feature in categorical_feature:
    temp = data.groupby(feature)['SalePrice'].count()/len(data)
    temp_df=temp[temp>0.01].index
    data[feature]=np.where(data[feature].isin(temp_df),data[feature],'Rare_var')
    

In [15]:
data.head(100)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,60,RL,4.174387,9.041922,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,2,2008,WD,Normal,12.247694,0,0,0
1,2,20,RL,4.382027,9.169518,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,5,2007,WD,Normal,12.109011,0,0,0
2,3,60,RL,4.219508,9.328123,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,9,2008,WD,Normal,12.317167,0,0,0
3,4,70,RL,4.094345,9.164296,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,2,2006,WD,Abnorml,11.849398,0,0,0
4,5,60,RL,4.430817,9.565214,Pave,Missing,IR1,Lvl,AllPub,...,Missing,0,12,2008,WD,Normal,12.429216,0,0,0
5,6,50,RL,4.442651,9.554993,Pave,Missing,IR1,Lvl,AllPub,...,Shed,700,10,2009,WD,Normal,11.870600,0,0,0
6,7,20,RL,4.317488,9.218705,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,8,2007,WD,Normal,12.634603,0,0,0
7,8,60,RL,4.234107,9.247829,Pave,Missing,IR1,Lvl,AllPub,...,Shed,350,11,2009,WD,Normal,12.206073,1,0,0
8,9,50,RM,3.931826,8.719317,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,4,2008,WD,Abnorml,11.774520,0,0,0
9,10,190,RL,3.912023,8.911934,Pave,Missing,Reg,Lvl,AllPub,...,Missing,0,1,2008,WD,Normal,11.678440,0,0,0


In [26]:
for feature in categorical_feature:
    labels_ordered=data.groupby([feature])['SalePrice'].mean().sort_values().index
    labels_ordered={k:i for i,k in enumerate(labels_ordered,0)}
    data[feature]=data[feature].map(labels_ordered)

In [27]:
feature_scale=[feature for feature in data.columns if feature not in ['Id','SalePerice'] ]
feature_scale

['MSSubClass',
 'MSZoning',
 'LotFrontage',
 'LotArea',
 'Street',
 'Alley',
 'LotShape',
 'LandContour',
 'Utilities',
 'LotConfig',
 'LandSlope',
 'Neighborhood',
 'Condition1',
 'Condition2',
 'BldgType',
 'HouseStyle',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'RoofStyle',
 'RoofMatl',
 'Exterior1st',
 'Exterior2nd',
 'MasVnrType',
 'MasVnrArea',
 'ExterQual',
 'ExterCond',
 'Foundation',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinSF1',
 'BsmtFinType2',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'Heating',
 'HeatingQC',
 'CentralAir',
 'Electrical',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'KitchenQual',
 'TotRmsAbvGrd',
 'Functional',
 'Fireplaces',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageCars',
 'GarageArea',
 'GarageQual',
 'GarageCond',
 'PavedDrive',
 'WoodDeckSF',
 'OpenPorchSF',
 'Enc

In [28]:
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(data[feature_scale])

MinMaxScaler(copy=True, feature_range=(0, 1))

In [29]:
scaler.transform(data[feature_scale])

array([[0.23529412, 0.75      , 0.41820812, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.49506375, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 0.75      , 0.434909  , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.29411765, 0.75      , 0.42385922, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.434909  , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.75      , 0.47117546, ..., 0.        , 0.        ,
        0.        ]])

In [30]:
# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([data[['Id', 'SalePrice']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(data[feature_scale]), columns=feature_scale)],
                    axis=1)

In [31]:
data.head()

Unnamed: 0,Id,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,...,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice.1,LotFrontagenan,MasVnrAreanan,GarageYrBltnan
0,1,12.247694,0.235294,0.75,0.418208,0.366344,1.0,1.0,0.0,0.333333,...,1.0,0.0,0.090909,0.5,0.666667,0.75,0.581431,0.0,0.0,0.0
1,2,12.109011,0.0,0.75,0.495064,0.391317,1.0,1.0,0.0,0.333333,...,1.0,0.0,0.363636,0.25,0.666667,0.75,0.536319,0.0,0.0,0.0
2,3,12.317167,0.235294,0.75,0.434909,0.422359,1.0,1.0,0.333333,0.333333,...,1.0,0.0,0.727273,0.5,0.666667,0.75,0.604029,0.0,0.0,0.0
3,4,11.849398,0.294118,0.75,0.388581,0.390295,1.0,1.0,0.333333,0.333333,...,1.0,0.0,0.090909,0.0,0.666667,0.0,0.451871,0.0,0.0,0.0
4,5,12.429216,0.235294,0.75,0.513123,0.468761,1.0,1.0,0.333333,0.333333,...,1.0,0.0,1.0,0.5,0.666667,0.75,0.640477,0.0,0.0,0.0


# feature selection


In [32]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

In [33]:
pd.pandas.set_option('display.max_columns' , None)

In [34]:
y_train = data[['SalePrice']]

In [35]:
x_train = data.drop(['Id' , 'SalePrice'] , axis= 1)

In [36]:
#first i specify the Lasso Regressor model and
#I select a suitable alpha (eqivalent of penalty)
#the bigger the alpha the less feature will be selected 

#then i use the selectfrommodel object from sklearn , which will slect the feature which coefficient are non zero

In [37]:
feature_sel_model = SelectFromModel(Lasso(alpha = 0.005 , random_state= 0))
feature_sel_model.fit(x_train,y_train)

SelectFromModel(estimator=Lasso(alpha=0.005, copy_X=True, fit_intercept=True,
                                max_iter=1000, normalize=False, positive=False,
                                precompute=False, random_state=0,
                                selection='cyclic', tol=0.0001,
                                warm_start=False),
                max_features=None, norm_order=1, prefit=False, threshold=None)

In [38]:
feature_sel_model.get_support()

array([ True,  True, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False,  True, False,
       False,  True,  True, False, False, False, False, False, False,
       False, False,  True, False,  True, False, False, False, False,
       False, False, False,  True,  True, False,  True, False, False,
        True,  True, False, False, False, False, False,  True, False,
       False,  True,  True,  True, False,  True,  True, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False])

In [39]:
select_feat = x_train.columns[(feature_sel_model.get_support())]

In [40]:
x_train = x_train[select_feat]

In [41]:
x_train.head()

Unnamed: 0,MSSubClass,MSZoning,Neighborhood,OverallQual,YearRemodAdd,RoofStyle,BsmtQual,BsmtExposure,HeatingQC,CentralAir,1stFlrSF,GrLivArea,BsmtFullBath,KitchenQual,Fireplaces,FireplaceQu,GarageType,GarageFinish,GarageCars,PavedDrive,SaleCondition
0,0.235294,0.75,0.636364,0.666667,0.098361,0.0,0.75,0.25,1.0,1.0,0.356155,0.577712,0.333333,0.666667,0.0,0.2,0.8,0.666667,0.5,1.0,0.75
1,0.0,0.75,0.5,0.555556,0.52459,0.0,0.75,1.0,1.0,1.0,0.503056,0.470245,0.0,0.333333,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
2,0.235294,0.75,0.636364,0.666667,0.114754,0.0,0.75,0.5,1.0,1.0,0.383441,0.593095,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.5,1.0,0.75
3,0.294118,0.75,0.727273,0.666667,0.606557,0.0,0.5,0.25,0.75,1.0,0.399941,0.579157,0.333333,0.666667,0.333333,0.8,0.4,0.333333,0.75,1.0,0.0
4,0.235294,0.75,1.0,0.777778,0.147541,0.0,0.75,0.75,1.0,1.0,0.466237,0.666523,0.333333,0.666667,0.333333,0.6,0.8,0.666667,0.75,1.0,0.75
