In [36]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
pd.pandas.set_option('display.max_columns', None)

In [37]:
dataset = pd.read_csv('datasets/house-prices-advanced-regression-techniques/test.csv')

In [38]:
categorical_features_with_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum()>1 and dataset[feature].dtype == 'O']

for feature in categorical_features_with_nan:
    print("{}:{} % missing values".format(feature,np.round(dataset[feature].isnull().sum()/len(dataset[feature])*100)))

MSZoning:0.0 % missing values
Alley:93.0 % missing values
Utilities:0.0 % missing values
MasVnrType:61.0 % missing values
BsmtQual:3.0 % missing values
BsmtCond:3.0 % missing values
BsmtExposure:3.0 % missing values
BsmtFinType1:3.0 % missing values
BsmtFinType2:3.0 % missing values
Functional:0.0 % missing values
FireplaceQu:50.0 % missing values
GarageType:5.0 % missing values
GarageFinish:5.0 % missing values
GarageQual:5.0 % missing values
GarageCond:5.0 % missing values
PoolQC:100.0 % missing values
Fence:80.0 % missing values
MiscFeature:97.0 % missing values


In [39]:
#For categorical features with missing values, we can replace them with the string "missing" to indicate that they are missing which is a category in itself and is a common prractice
def replace_cat_feature(dataset,categorical_features_with_nan):
    data = dataset.copy()
    # for feature in categorical_features_with_nan:
    data[categorical_features_with_nan] = data[categorical_features_with_nan].fillna("Missing")
    return data
    
dataset = replace_cat_feature(dataset,categorical_features_with_nan)

dataset[categorical_features_with_nan].isnull().sum() #Check if there are any missing values


MSZoning        0
Alley           0
Utilities       0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
Functional      0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [40]:
#Now we can check the categorical features with missing values
# dataset.head(100)
categorical_features_with_nan
for feature in categorical_features_with_nan:
    print("{}:{} % missing values".format(feature,np.round(dataset[feature].isnull().sum()/len(dataset[feature])*100)))

MSZoning:0.0 % missing values
Alley:0.0 % missing values
Utilities:0.0 % missing values
MasVnrType:0.0 % missing values
BsmtQual:0.0 % missing values
BsmtCond:0.0 % missing values
BsmtExposure:0.0 % missing values
BsmtFinType1:0.0 % missing values
BsmtFinType2:0.0 % missing values
Functional:0.0 % missing values
FireplaceQu:0.0 % missing values
GarageType:0.0 % missing values
GarageFinish:0.0 % missing values
GarageQual:0.0 % missing values
GarageCond:0.0 % missing values
PoolQC:0.0 % missing values
Fence:0.0 % missing values
MiscFeature:0.0 % missing values


In [41]:
#Now we fill the missing values for numerical feature with median values in the feature to avoid the effect of outliers in the data/feature/variable
numerical_features_with_nan = [feature for feature in dataset.columns if dataset[feature].isnull().sum() > 0 and dataset[feature].dtype != 'O' ]
# numerical_features_with_nan
for feature in numerical_features_with_nan:
    print("{}:{} % missing values".format(feature, np.round(dataset[feature].isnull().sum()/len(dataset[feature])*100)))

LotFrontage:16.0 % missing values
MasVnrArea:1.0 % missing values
BsmtFinSF1:0.0 % missing values
BsmtFinSF2:0.0 % missing values
BsmtUnfSF:0.0 % missing values
TotalBsmtSF:0.0 % missing values
BsmtFullBath:0.0 % missing values
BsmtHalfBath:0.0 % missing values
GarageYrBlt:5.0 % missing values
GarageCars:0.0 % missing values
GarageArea:0.0 % missing values


In [42]:
## Here we will replace the missing values with the median value of the feature
## Also we will create a new feature namely feature_name+'_NaN' to indicate the original feature values which were missing and which were not

for feature in numerical_features_with_nan:
    median_value = dataset[feature].median()

    dataset[feature+'_NaN'] = np.where(dataset[feature].isnull(),1,0)
    dataset[feature] = dataset[feature].fillna(median_value)

dataset[numerical_features_with_nan].isnull().sum()


LotFrontage     0
MasVnrArea      0
BsmtFinSF1      0
BsmtFinSF2      0
BsmtUnfSF       0
TotalBsmtSF     0
BsmtFullBath    0
BsmtHalfBath    0
GarageYrBlt     0
GarageCars      0
GarageArea      0
dtype: int64

In [43]:
# Handling temporal variables 
year_features = [feature for feature in dataset.columns if 'Yr' in feature or 'Year' in feature]
# year_features.remove('YrSold')
year_features

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold', 'GarageYrBlt_NaN']

In [44]:
for feature in year_features:
    if(feature == 'YrSold'):
        pass
    else:
        dataset[feature] = dataset['YrSold'] - dataset[feature]



In [45]:
dataset[year_features].head(10)

Unnamed: 0,YearBuilt,YearRemodAdd,GarageYrBlt,YrSold,GarageYrBlt_NaN
0,49,49,49.0,2010,2010
1,52,52,52.0,2010,2010
2,13,12,13.0,2010,2010
3,12,12,12.0,2010,2010
4,18,18,18.0,2010,2010
5,17,16,17.0,2010,2010
6,18,3,18.0,2010,2010
7,12,12,12.0,2010,2010
8,20,20,20.0,2010,2010
9,40,40,40.0,2010,2010


In [46]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_NaN,MasVnrArea_NaN,BsmtFinSF1_NaN,BsmtFinSF2_NaN,BsmtUnfSF_NaN,TotalBsmtSF_NaN,BsmtFullBath_NaN,BsmtHalfBath_NaN,GarageYrBlt_NaN,GarageCars_NaN,GarageArea_NaN
0,1461,20,RH,80.0,11622,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,49,49,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,CBlock,TA,TA,No,Rec,468.0,LwQ,144.0,270.0,882.0,GasA,TA,Y,SBrkr,896,0,0,896,0.0,0.0,1,0,2,1,TA,5,Typ,0,Missing,Attchd,49.0,Unf,1.0,730.0,TA,TA,Y,140,0,0,0,120,0,Missing,MnPrv,Missing,0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
1,1462,20,RL,81.0,14267,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,52,52,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,108.0,TA,TA,CBlock,TA,TA,No,ALQ,923.0,Unf,0.0,406.0,1329.0,GasA,TA,Y,SBrkr,1329,0,0,1329,0.0,0.0,1,1,3,1,Gd,6,Typ,0,Missing,Attchd,52.0,Unf,1.0,312.0,TA,TA,Y,393,36,0,0,0,0,Missing,Missing,Gar2,12500,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
2,1463,60,RL,74.0,13830,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,13,12,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,PConc,Gd,TA,No,GLQ,791.0,Unf,0.0,137.0,928.0,GasA,Gd,Y,SBrkr,928,701,0,1629,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,13.0,Fin,2.0,482.0,TA,TA,Y,212,34,0,0,0,0,Missing,MnPrv,Missing,0,3,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
3,1464,60,RL,78.0,9978,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,12,12,Gable,CompShg,VinylSd,VinylSd,BrkFace,20.0,TA,TA,PConc,TA,TA,No,GLQ,602.0,Unf,0.0,324.0,926.0,GasA,Ex,Y,SBrkr,926,678,0,1604,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,12.0,Fin,2.0,470.0,TA,TA,Y,360,36,0,0,0,0,Missing,Missing,Missing,0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
4,1465,120,RL,43.0,5005,Pave,Missing,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,18,18,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,263.0,Unf,0.0,1017.0,1280.0,GasA,Ex,Y,SBrkr,1280,0,0,1280,0.0,0.0,2,0,2,1,Gd,5,Typ,0,Missing,Attchd,18.0,RFn,2.0,506.0,TA,TA,Y,0,82,0,0,144,0,Missing,Missing,Missing,0,1,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0


In [47]:
#numerical_features 
numerical_features  = [feature for feature in dataset.columns if dataset[feature].dtype != 'O']
# numerical_features
discrete_feature = [feature for feature in numerical_features if len(dataset[feature].unique())<25 and feature not in year_features+['Id']]
continuous_feature = [feature for feature in numerical_features if feature not in discrete_feature+year_features+['Id']]
print("Continuous feature Count {}".format(len(continuous_feature)))


Continuous feature Count 16


In [48]:
continuous_feature

['LotFrontage',
 'LotArea',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'BsmtUnfSF',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'GrLivArea',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 'ScreenPorch',
 'MiscVal']

In [49]:
dataset[continuous_feature].head(10)

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,ScreenPorch,MiscVal
0,80.0,11622,0.0,468.0,144.0,270.0,882.0,896,0,896,730.0,140,0,0,120,0
1,81.0,14267,108.0,923.0,0.0,406.0,1329.0,1329,0,1329,312.0,393,36,0,0,12500
2,74.0,13830,0.0,791.0,0.0,137.0,928.0,928,701,1629,482.0,212,34,0,0,0
3,78.0,9978,20.0,602.0,0.0,324.0,926.0,926,678,1604,470.0,360,36,0,0,0
4,43.0,5005,0.0,263.0,0.0,1017.0,1280.0,1280,0,1280,506.0,0,82,0,144,0
5,75.0,10000,0.0,0.0,0.0,763.0,763.0,763,892,1655,440.0,157,84,0,0,0
6,67.0,7980,0.0,935.0,0.0,233.0,1168.0,1187,0,1187,420.0,483,21,0,0,500
7,63.0,8402,0.0,0.0,0.0,789.0,789.0,789,676,1465,393.0,0,75,0,0,0
8,85.0,10176,0.0,637.0,0.0,663.0,1300.0,1341,0,1341,506.0,192,0,0,0,0
9,70.0,8400,0.0,804.0,78.0,0.0,882.0,882,0,882,525.0,240,0,0,0,0


In [50]:
for feature in continuous_feature:
    if 0 in dataset[feature].unique():
        dataset[feature] = dataset[feature] + 1
        #We can add 1 to the feature to avoid log(0) which is undefined
    dataset[feature] = np.log(dataset[feature])
    
#We can see that the distribution of the features is now normal and we can use these features for our model

In [51]:
# continuous_feature_with_log = ['LotFrontage', 'LotArea', '1stFlrSF', 'GrLivArea']
# for feature in continuous_feature_with_log:
#     dataset[feature] = np.log(dataset[feature])

In [52]:
# dataset[continuous_feature_with_log].head(10)

In [53]:
categorical_features=[feature for feature in dataset.columns if dataset[feature].dtype=='O']
dataset.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_NaN,MasVnrArea_NaN,BsmtFinSF1_NaN,BsmtFinSF2_NaN,BsmtUnfSF_NaN,TotalBsmtSF_NaN,BsmtFullBath_NaN,BsmtHalfBath_NaN,GarageYrBlt_NaN,GarageCars_NaN,GarageArea_NaN
0,1461,20,RH,4.382027,9.360655,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,49,49,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,CBlock,TA,TA,No,Rec,6.150603,LwQ,4.976734,5.602119,6.783325,GasA,TA,Y,SBrkr,6.79794,0.0,0,6.79794,0.0,0.0,1,0,2,1,TA,5,Typ,0,Missing,Attchd,49.0,Unf,1.0,6.594413,TA,TA,Y,4.94876,0.0,0.0,0,4.795791,0,Missing,MnPrv,Missing,0.0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
1,1462,20,RL,4.394449,9.565704,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,52,52,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,4.691348,TA,TA,CBlock,TA,TA,No,ALQ,6.828712,Unf,0.0,6.008813,7.192934,GasA,TA,Y,SBrkr,7.192182,0.0,0,7.192182,0.0,0.0,1,1,3,1,Gd,6,Typ,0,Missing,Attchd,52.0,Unf,1.0,5.746203,TA,TA,Y,5.976351,3.610918,0.0,0,0.0,0,Missing,Missing,Gar2,9.433564,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
2,1463,60,RL,4.304065,9.534595,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,13,12,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,PConc,Gd,TA,No,GLQ,6.674561,Unf,0.0,4.927254,6.834109,GasA,Gd,Y,SBrkr,6.833032,6.553933,0,7.395722,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,13.0,Fin,2.0,6.180017,TA,TA,Y,5.361292,3.555348,0.0,0,0.0,0,Missing,MnPrv,Missing,0.0,3,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
3,1464,60,RL,4.356709,9.208138,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,12,12,Gable,CompShg,VinylSd,VinylSd,BrkFace,3.044522,TA,TA,PConc,TA,TA,No,GLQ,6.401917,Unf,0.0,5.783825,6.831954,GasA,Ex,Y,SBrkr,6.830874,6.520621,0,7.380256,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,12.0,Fin,2.0,6.154858,TA,TA,Y,5.888878,3.610918,0.0,0,0.0,0,Missing,Missing,Missing,0.0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
4,1465,120,RL,3.7612,8.518193,Pave,Missing,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,18,18,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,5.575949,Unf,0.0,6.925595,7.155396,GasA,Ex,Y,SBrkr,7.154615,0.0,0,7.154615,0.0,0.0,2,0,2,1,Gd,5,Typ,0,Missing,Attchd,18.0,RFn,2.0,6.228511,TA,TA,Y,0.0,4.418841,0.0,0,4.976734,0,Missing,Missing,Missing,0.0,1,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
5,1466,60,RL,4.317488,9.21034,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,17,16,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,TA,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,6.638568,6.638568,GasA,Gd,Y,SBrkr,6.637258,6.794587,0,7.411556,0.0,0.0,2,1,3,1,TA,7,Typ,1,TA,Attchd,17.0,Fin,2.0,6.089045,TA,TA,Y,5.062595,4.442651,0.0,0,0.0,0,Missing,Missing,Missing,0.0,4,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
6,1467,20,RL,4.204693,8.984694,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,1Story,6,7,18,3,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,TA,Gd,PConc,Gd,TA,No,ALQ,6.841615,Unf,0.0,5.455321,7.063904,GasA,Ex,Y,SBrkr,7.079184,0.0,0,7.079184,1.0,0.0,2,0,3,1,TA,6,Typ,0,Missing,Attchd,18.0,Fin,2.0,6.042633,TA,TA,Y,6.182085,3.091042,0.0,0,0.0,0,Missing,GdPrv,Shed,6.216606,3,2010,WD,Normal,1,0,0,0,0,0,0,0,2010,0,0
7,1468,60,RL,4.143135,9.036225,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,12,12,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,6.672033,6.672033,GasA,Gd,Y,SBrkr,6.670766,6.517671,0,7.289611,0.0,0.0,2,1,3,1,TA,7,Typ,1,Gd,Attchd,12.0,Fin,2.0,5.976351,TA,TA,Y,0.0,4.330733,0.0,0,0.0,0,Missing,Missing,Missing,0.0,5,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
8,1469,20,RL,4.442651,9.227787,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,1Story,7,5,20,20,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,TA,TA,PConc,Gd,TA,Gd,GLQ,6.458338,Unf,0.0,6.498282,7.170888,GasA,Gd,Y,SBrkr,7.201171,0.0,0,7.201171,1.0,0.0,1,1,2,1,Gd,5,Typ,1,Po,Attchd,20.0,Unf,2.0,6.228511,TA,TA,Y,5.26269,0.0,0.0,0,0.0,0,Missing,Missing,Missing,0.0,2,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
9,1470,20,RL,4.248495,9.035987,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,4,5,40,40,Gable,CompShg,Plywood,Plywood,Missing,0.0,TA,TA,CBlock,TA,TA,No,ALQ,6.690842,Rec,4.369448,0.0,6.783325,GasA,TA,Y,SBrkr,6.782192,0.0,0,6.782192,1.0,0.0,1,0,2,1,TA,4,Typ,0,Missing,Attchd,40.0,Fin,2.0,6.265301,TA,TA,Y,5.484797,0.0,0.0,0,0.0,0,Missing,MnPrv,Missing,0.0,4,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0


In [54]:
for feature in categorical_features:
    temp=dataset.groupby(feature).count()/len(dataset)
    temp_df=temp[temp>0.01].index
    dataset[feature]=np.where(dataset[feature].isin(temp_df),dataset[feature],'Rare_var')
    
    

In [55]:
dataset.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_NaN,MasVnrArea_NaN,BsmtFinSF1_NaN,BsmtFinSF2_NaN,BsmtUnfSF_NaN,TotalBsmtSF_NaN,BsmtFullBath_NaN,BsmtHalfBath_NaN,GarageYrBlt_NaN,GarageCars_NaN,GarageArea_NaN
0,1461,20,RH,4.382027,9.360655,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Feedr,Norm,1Fam,1Story,5,6,49,49,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,CBlock,TA,TA,No,Rec,6.150603,LwQ,4.976734,5.602119,6.783325,GasA,TA,Y,SBrkr,6.79794,0.0,0,6.79794,0.0,0.0,1,0,2,1,TA,5,Typ,0,Missing,Attchd,49.0,Unf,1.0,6.594413,TA,TA,Y,4.94876,0.0,0.0,0,4.795791,0,Missing,MnPrv,Missing,0.0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
1,1462,20,RL,4.394449,9.565704,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,6,52,52,Hip,CompShg,Wd Sdng,Wd Sdng,BrkFace,4.691348,TA,TA,CBlock,TA,TA,No,ALQ,6.828712,Unf,0.0,6.008813,7.192934,GasA,TA,Y,SBrkr,7.192182,0.0,0,7.192182,0.0,0.0,1,1,3,1,Gd,6,Typ,0,Missing,Attchd,52.0,Unf,1.0,5.746203,TA,TA,Y,5.976351,3.610918,0.0,0,0.0,0,Missing,Missing,Gar2,9.433564,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
2,1463,60,RL,4.304065,9.534595,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,5,5,13,12,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,PConc,Gd,TA,No,GLQ,6.674561,Unf,0.0,4.927254,6.834109,GasA,Gd,Y,SBrkr,6.833032,6.553933,0,7.395722,0.0,0.0,2,1,3,1,TA,6,Typ,1,TA,Attchd,13.0,Fin,2.0,6.180017,TA,TA,Y,5.361292,3.555348,0.0,0,0.0,0,Missing,MnPrv,Missing,0.0,3,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
3,1464,60,RL,4.356709,9.208138,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,6,12,12,Gable,CompShg,VinylSd,VinylSd,BrkFace,3.044522,TA,TA,PConc,TA,TA,No,GLQ,6.401917,Unf,0.0,5.783825,6.831954,GasA,Ex,Y,SBrkr,6.830874,6.520621,0,7.380256,0.0,0.0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,12.0,Fin,2.0,6.154858,TA,TA,Y,5.888878,3.610918,0.0,0,0.0,0,Missing,Missing,Missing,0.0,6,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
4,1465,120,RL,3.7612,8.518193,Pave,Missing,IR1,HLS,AllPub,Inside,Gtl,StoneBr,Norm,Norm,TwnhsE,1Story,8,5,18,18,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,Gd,TA,PConc,Gd,TA,No,ALQ,5.575949,Unf,0.0,6.925595,7.155396,GasA,Ex,Y,SBrkr,7.154615,0.0,0,7.154615,0.0,0.0,2,0,2,1,Gd,5,Typ,0,Missing,Attchd,18.0,RFn,2.0,6.228511,TA,TA,Y,0.0,4.418841,0.0,0,4.976734,0,Missing,Missing,Missing,0.0,1,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
5,1466,60,RL,4.317488,9.21034,Pave,Missing,IR1,Lvl,AllPub,Corner,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,17,16,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,TA,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,6.638568,6.638568,GasA,Gd,Y,SBrkr,6.637258,6.794587,0,7.411556,0.0,0.0,2,1,3,1,TA,7,Typ,1,TA,Attchd,17.0,Fin,2.0,6.089045,TA,TA,Y,5.062595,4.442651,0.0,0,0.0,0,Missing,Missing,Missing,0.0,4,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
6,1467,20,RL,4.204693,8.984694,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,1Story,6,7,18,3,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,TA,Gd,PConc,Gd,TA,No,ALQ,6.841615,Unf,0.0,5.455321,7.063904,GasA,Ex,Y,SBrkr,7.079184,0.0,0,7.079184,1.0,0.0,2,0,3,1,TA,6,Typ,0,Missing,Attchd,18.0,Fin,2.0,6.042633,TA,TA,Y,6.182085,3.091042,0.0,0,0.0,0,Missing,GdPrv,Shed,6.216606,3,2010,WD,Normal,1,0,0,0,0,0,0,0,2010,0,0
7,1468,60,RL,4.143135,9.036225,Pave,Missing,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,12,12,Gable,CompShg,VinylSd,VinylSd,Missing,0.0,TA,TA,PConc,Gd,TA,No,Unf,0.0,Unf,0.0,6.672033,6.672033,GasA,Gd,Y,SBrkr,6.670766,6.517671,0,7.289611,0.0,0.0,2,1,3,1,TA,7,Typ,1,Gd,Attchd,12.0,Fin,2.0,5.976351,TA,TA,Y,0.0,4.330733,0.0,0,0.0,0,Missing,Missing,Missing,0.0,5,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
8,1469,20,RL,4.442651,9.227787,Pave,Missing,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,1Story,7,5,20,20,Gable,CompShg,HdBoard,HdBoard,Missing,0.0,TA,TA,PConc,Gd,TA,Gd,GLQ,6.458338,Unf,0.0,6.498282,7.170888,GasA,Gd,Y,SBrkr,7.201171,0.0,0,7.201171,1.0,0.0,1,1,2,1,Gd,5,Typ,1,Po,Attchd,20.0,Unf,2.0,6.228511,TA,TA,Y,5.26269,0.0,0.0,0,0.0,0,Missing,Missing,Missing,0.0,2,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0
9,1470,20,RL,4.248495,9.035987,Pave,Missing,Reg,Lvl,AllPub,Corner,Gtl,NAmes,Norm,Norm,1Fam,1Story,4,5,40,40,Gable,CompShg,Plywood,Plywood,Missing,0.0,TA,TA,CBlock,TA,TA,No,ALQ,6.690842,Rec,4.369448,0.0,6.783325,GasA,TA,Y,SBrkr,6.782192,0.0,0,6.782192,1.0,0.0,1,0,2,1,TA,4,Typ,0,Missing,Attchd,40.0,Fin,2.0,6.265301,TA,TA,Y,5.484797,0.0,0.0,0,0.0,0,Missing,MnPrv,Missing,0.0,4,2010,WD,Normal,0,0,0,0,0,0,0,0,2010,0,0


In [56]:
for feature in categorical_features:
    # Step 1: Compute frequency of each category
    category_counts = dataset[feature].value_counts()  # Get count of each category
    
    # Step 2: Sort categories by frequency in ascending order (least common first)
    categories_sorted = category_counts.sort_values().index  
    
    # Step 3: Create ranking dictionary (lowest frequency = rank 0, highest = max rank)
    labels_ordered = {cat: i for i, cat in enumerate(categories_sorted)}
    
    # Step 4: Assign 101 explicitly to "Rare_var"
    labels_ordered["Rare_var"] = -1  
    
    # Step 5: Apply encoding to the dataset
    dataset[feature] = dataset[feature].map(labels_ordered)

In [57]:
dataset.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_NaN,MasVnrArea_NaN,BsmtFinSF1_NaN,BsmtFinSF2_NaN,BsmtUnfSF_NaN,TotalBsmtSF_NaN,BsmtFullBath_NaN,BsmtHalfBath_NaN,GarageYrBlt_NaN,GarageCars_NaN,GarageArea_NaN
0,1461,20,1,4.382027,9.360655,1,2,3,3,1,4,2,24,7,4,4,6,5,6,49,49,5,3,13,15,3,0.0,3,4,4,4,4,4,3,6.150603,3,4.976734,5.602119,6.783325,3,3,1,3,6.79794,0.0,0,6.79794,0.0,0.0,1,0,2,1,4,5,7,0,5,6,49.0,3,1.0,6.594413,4,5,2,4.94876,0.0,0.0,0,4.795791,0,2,3,3,0.0,6,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
1,1462,20,5,4.394449,9.565704,1,2,2,3,1,3,2,24,8,4,4,6,6,6,52,52,4,3,10,12,2,4.691348,3,4,4,4,4,4,4,6.828712,6,0.0,6.008813,7.192934,3,3,1,3,7.192182,0.0,0,7.192182,0.0,0.0,1,1,3,1,3,6,7,0,5,6,52.0,3,1.0,5.746203,4,5,2,5.976351,3.610918,0.0,0,0.0,0,2,4,1,9.433564,6,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
2,1463,60,5,4.304065,9.534595,1,2,2,3,1,4,2,18,8,4,4,5,5,5,13,12,5,3,13,15,3,0.0,3,4,5,3,4,4,6,6.674561,6,0.0,4.927254,6.834109,3,2,1,3,6.833032,6.553933,0,7.395722,0.0,0.0,2,1,3,1,4,6,7,1,3,6,13.0,1,2.0,6.180017,4,5,2,5.361292,3.555348,0.0,0,0.0,0,2,3,3,0.0,3,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
3,1464,60,5,4.356709,9.208138,1,2,2,3,1,4,2,18,8,4,4,5,6,6,12,12,5,3,13,15,2,3.044522,3,4,5,4,4,4,6,6.401917,6,0.0,5.783825,6.831954,3,4,1,3,6.830874,6.520621,0,7.380256,0.0,0.0,2,1,3,1,3,7,7,1,4,6,12.0,1,2.0,6.154858,4,5,2,5.888878,3.610918,0.0,0,0.0,0,2,4,3,0.0,6,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
4,1465,120,5,3.7612,8.518193,1,2,2,2,1,4,2,8,8,4,3,6,8,5,18,18,5,3,11,13,3,0.0,2,4,5,3,4,4,4,5.575949,6,0.0,6.925595,7.155396,3,4,1,3,7.154615,0.0,0,7.154615,0.0,0.0,2,0,2,1,3,5,7,0,5,6,18.0,2,2.0,6.228511,4,5,2,0.0,4.418841,0.0,0,4.976734,0,2,4,3,0.0,1,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
5,1466,60,5,4.317488,9.21034,1,2,2,3,1,3,2,18,8,4,4,5,6,5,17,16,5,3,11,13,3,0.0,3,4,5,3,4,4,5,0.0,6,0.0,6.638568,6.638568,3,2,1,3,6.637258,6.794587,0,7.411556,0.0,0.0,2,1,3,1,4,7,7,1,3,6,17.0,1,2.0,6.089045,4,5,2,5.062595,4.442651,0.0,0,0.0,0,2,4,3,0.0,4,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
6,1467,20,5,4.204693,8.984694,1,2,2,3,1,4,2,18,8,4,4,6,6,7,18,3,5,3,11,13,3,0.0,3,3,5,3,4,4,4,6.841615,6,0.0,5.455321,7.063904,3,4,1,3,7.079184,0.0,0,7.079184,1.0,0.0,2,0,3,1,4,6,7,0,5,6,18.0,1,2.0,6.042633,4,5,2,6.182085,3.091042,0.0,0,0.0,0,2,2,2,6.216606,3,2010,9,5,1,0,0,0,0,0,0,0,2010,0,0
7,1468,60,5,4.143135,9.036225,1,2,2,3,1,4,2,18,8,4,4,5,6,5,12,12,5,3,13,15,3,0.0,3,4,5,3,4,4,5,0.0,6,0.0,6.672033,6.672033,3,2,1,3,6.670766,6.517671,0,7.289611,0.0,0.0,2,1,3,1,4,7,7,1,4,6,12.0,1,2.0,5.976351,4,5,2,0.0,4.330733,0.0,0,0.0,0,2,4,3,0.0,5,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
8,1469,20,5,4.442651,9.227787,1,2,3,3,1,4,2,18,8,4,4,6,7,5,20,20,5,3,11,13,3,0.0,3,4,5,3,4,2,6,6.458338,6,0.0,6.498282,7.170888,3,2,1,3,7.201171,0.0,0,7.201171,1.0,0.0,1,1,2,1,3,5,7,1,1,6,20.0,3,2.0,6.228511,4,5,2,5.26269,0.0,0.0,0,0.0,0,2,4,3,0.0,2,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
9,1470,20,5,4.248495,9.035987,1,2,3,3,1,3,2,24,8,4,4,6,4,5,40,40,5,3,9,11,3,0.0,3,4,4,4,4,4,4,6.690842,5,4.369448,0.0,6.783325,3,3,1,3,6.782192,0.0,0,6.782192,1.0,0.0,1,0,2,1,4,4,7,0,5,6,40.0,1,2.0,6.265301,4,5,2,5.484797,0.0,0.0,0,0.0,0,2,3,3,0.0,4,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0


In [58]:
feature_scale=[feature for feature in dataset.columns if feature not in ['Id','SalePrice']]

from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()
scaler.fit(dataset[feature_scale])

In [59]:
scaler.transform(dataset[feature_scale])

array([[0.        , 0.2       , 0.59344538, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.59895721, ..., 1.        , 0.        ,
        0.        ],
       [0.23529412, 1.        , 0.55885415, ..., 1.        , 0.        ,
        0.        ],
       ...,
       [0.        , 1.        , 0.90099208, ..., 0.2       , 0.        ,
        0.        ],
       [0.38235294, 1.        , 0.48035069, ..., 0.        , 0.        ,
        0.        ],
       [0.23529412, 1.        , 0.55885415, ..., 0.2       , 0.        ,
        0.        ]], shape=(1459, 90))

In [60]:
# transform the train and test set, and add on the Id and SalePrice variables
data = pd.concat([dataset[['Id']].reset_index(drop=True),
                    pd.DataFrame(scaler.transform(dataset[feature_scale]), columns=feature_scale)],
                    axis=1)

In [61]:
data.head(10)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_NaN,MasVnrArea_NaN,BsmtFinSF1_NaN,BsmtFinSF2_NaN,BsmtUnfSF_NaN,TotalBsmtSF_NaN,BsmtFullBath_NaN,BsmtHalfBath_NaN,GarageYrBlt_NaN,GarageCars_NaN,GarageArea_NaN
0,1461,0.0,0.2,0.593445,0.56636,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.875,1.0,1.0,1.0,0.444444,0.625,0.384615,0.822581,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.8,1.0,1.0,1.0,0.5,0.741323,0.5,0.678856,0.730486,0.794653,1.0,0.75,1.0,1.0,0.312253,0.0,0.0,0.312253,0.0,0.0,0.25,0.0,0.333333,0.5,1.0,0.166667,1.0,0.0,1.0,1.0,0.792994,1.0,0.2,0.90262,1.0,1.0,1.0,0.681466,0.0,0.0,0.0,0.754311,0.0,1.0,0.75,1.0,0.0,0.454545,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1462,0.0,1.0,0.598957,0.622527,1.0,1.0,0.666667,1.0,1.0,0.75,1.0,1.0,1.0,1.0,1.0,1.0,0.555556,0.625,0.407692,0.870968,0.8,1.0,0.785714,0.8125,0.666667,0.654926,1.0,1.0,0.8,1.0,1.0,1.0,0.666667,0.823054,1.0,0.0,0.783517,0.842638,1.0,0.75,1.0,1.0,0.468253,0.0,0.0,0.468253,0.0,0.0,0.25,0.5,0.5,0.5,0.8,0.25,1.0,0.0,1.0,1.0,0.802548,1.0,0.2,0.78652,1.0,1.0,1.0,0.82297,0.546224,0.0,0.0,0.0,0.0,1.0,1.0,0.333333,0.968436,0.454545,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1463,0.235294,1.0,0.558854,0.614005,1.0,1.0,0.666667,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,0.833333,0.444444,0.5,0.107692,0.225806,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,0.804475,1.0,0.0,0.642487,0.800602,1.0,0.5,1.0,1.0,0.326139,0.870383,0.0,0.548792,0.0,0.0,0.5,0.5,0.5,0.5,1.0,0.25,1.0,0.25,0.6,1.0,0.678344,0.333333,0.4,0.845899,1.0,1.0,1.0,0.738274,0.537818,0.0,0.0,0.0,0.0,1.0,0.75,1.0,0.0,0.181818,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1464,0.235294,1.0,0.582212,0.524583,1.0,1.0,0.666667,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,0.833333,0.555556,0.625,0.1,0.225806,1.0,1.0,1.0,1.0,0.666667,0.425024,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.771613,1.0,0.0,0.75418,0.80035,1.0,1.0,1.0,1.0,0.325285,0.865959,0.0,0.542672,0.0,0.0,0.5,0.5,0.5,0.5,0.8,0.333333,1.0,0.25,0.8,1.0,0.675159,0.333333,0.4,0.842455,1.0,1.0,1.0,0.810925,0.546224,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.454545,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1465,0.588235,1.0,0.317987,0.335596,1.0,1.0,0.666667,0.666667,1.0,1.0,1.0,0.333333,1.0,1.0,0.75,1.0,0.777778,0.5,0.146154,0.322581,1.0,1.0,0.857143,0.875,1.0,0.0,0.666667,1.0,1.0,0.75,1.0,1.0,0.666667,0.672061,1.0,0.0,0.90306,0.83824,1.0,1.0,1.0,1.0,0.453388,0.0,0.0,0.453388,0.0,0.0,0.5,0.0,0.333333,0.5,0.8,0.166667,1.0,0.0,1.0,1.0,0.694268,0.666667,0.4,0.852536,1.0,1.0,1.0,0.0,0.668438,0.0,0.0,0.782771,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
5,1466,0.235294,1.0,0.56481,0.525186,1.0,1.0,0.666667,1.0,1.0,0.75,1.0,0.75,1.0,1.0,1.0,0.833333,0.555556,0.5,0.138462,0.290323,1.0,1.0,0.857143,0.875,1.0,0.0,1.0,1.0,1.0,0.75,1.0,1.0,0.833333,0.0,1.0,0.0,0.865633,0.777695,1.0,0.5,1.0,1.0,0.248672,0.902342,0.0,0.555058,0.0,0.0,0.5,0.5,0.5,0.5,1.0,0.333333,1.0,0.25,0.6,1.0,0.691083,0.333333,0.4,0.833447,1.0,1.0,1.0,0.697142,0.67204,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.272727,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6,1467,0.0,1.0,0.514763,0.463378,1.0,1.0,0.666667,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,1.0,0.555556,0.75,0.146154,0.080645,1.0,1.0,0.857143,0.875,1.0,0.0,1.0,0.75,1.0,0.75,1.0,1.0,0.666667,0.824609,1.0,0.0,0.711345,0.827522,1.0,1.0,1.0,1.0,0.42354,0.0,0.0,0.42354,0.333333,0.0,0.5,0.0,0.5,0.5,1.0,0.25,1.0,0.0,1.0,1.0,0.694268,0.333333,0.4,0.827094,1.0,1.0,1.0,0.851301,0.467582,0.0,0.0,0.0,0.0,1.0,0.5,0.666667,0.638188,0.181818,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
7,1468,0.235294,1.0,0.48745,0.477493,1.0,1.0,0.666667,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,0.833333,0.555556,0.5,0.1,0.225806,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.75,1.0,1.0,0.833333,0.0,1.0,0.0,0.869997,0.781615,1.0,0.5,1.0,1.0,0.261931,0.865567,0.0,0.506805,0.0,0.0,0.5,0.5,0.5,0.5,1.0,0.333333,1.0,0.25,0.8,1.0,0.675159,0.333333,0.4,0.818022,1.0,1.0,1.0,0.0,0.65511,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.363636,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
8,1469,0.0,1.0,0.620344,0.529965,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,1.0,0.666667,0.5,0.161538,0.354839,1.0,1.0,0.857143,0.875,1.0,0.0,1.0,1.0,1.0,0.75,1.0,0.5,1.0,0.778414,1.0,0.0,0.847341,0.840055,1.0,0.5,1.0,1.0,0.471809,0.0,0.0,0.471809,0.333333,0.0,0.25,0.5,0.333333,0.5,0.8,0.166667,1.0,0.25,0.2,1.0,0.700637,1.0,0.4,0.852536,1.0,1.0,1.0,0.724696,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.090909,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
9,1470,0.0,1.0,0.534198,0.477428,1.0,1.0,1.0,1.0,1.0,0.75,1.0,1.0,1.0,1.0,1.0,1.0,0.333333,0.5,0.315385,0.677419,1.0,1.0,0.714286,0.75,1.0,0.0,1.0,1.0,0.8,1.0,1.0,1.0,0.666667,0.806437,0.833333,0.596019,0.0,0.794653,1.0,0.75,1.0,1.0,0.306022,0.0,0.0,0.306022,0.333333,0.0,0.25,0.0,0.333333,0.5,1.0,0.083333,1.0,0.0,1.0,1.0,0.764331,0.333333,0.4,0.857572,1.0,1.0,1.0,0.755281,0.0,0.0,0.0,0.0,0.0,1.0,0.75,1.0,0.0,0.272727,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [62]:
data.to_csv('X_test.csv',index=False)

In [63]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split

In [64]:
dataset_test=pd.read_csv('X_test.csv')


In [65]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,LotFrontage_NaN,MasVnrArea_NaN,BsmtFinSF1_NaN,BsmtFinSF2_NaN,BsmtUnfSF_NaN,TotalBsmtSF_NaN,BsmtFullBath_NaN,BsmtHalfBath_NaN,GarageYrBlt_NaN,GarageCars_NaN,GarageArea_NaN
0,1461,20,1,4.382027,9.360655,1,2,3,3,1,4,2,24,7,4,4,6,5,6,49,49,5,3,13,15,3,0.0,3,4,4,4,4,4,3,6.150603,3,4.976734,5.602119,6.783325,3,3,1,3,6.79794,0.0,0,6.79794,0.0,0.0,1,0,2,1,4,5,7,0,5,6,49.0,3,1.0,6.594413,4,5,2,4.94876,0.0,0.0,0,4.795791,0,2,3,3,0.0,6,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
1,1462,20,5,4.394449,9.565704,1,2,2,3,1,3,2,24,8,4,4,6,6,6,52,52,4,3,10,12,2,4.691348,3,4,4,4,4,4,4,6.828712,6,0.0,6.008813,7.192934,3,3,1,3,7.192182,0.0,0,7.192182,0.0,0.0,1,1,3,1,3,6,7,0,5,6,52.0,3,1.0,5.746203,4,5,2,5.976351,3.610918,0.0,0,0.0,0,2,4,1,9.433564,6,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
2,1463,60,5,4.304065,9.534595,1,2,2,3,1,4,2,18,8,4,4,5,5,5,13,12,5,3,13,15,3,0.0,3,4,5,3,4,4,6,6.674561,6,0.0,4.927254,6.834109,3,2,1,3,6.833032,6.553933,0,7.395722,0.0,0.0,2,1,3,1,4,6,7,1,3,6,13.0,1,2.0,6.180017,4,5,2,5.361292,3.555348,0.0,0,0.0,0,2,3,3,0.0,3,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
3,1464,60,5,4.356709,9.208138,1,2,2,3,1,4,2,18,8,4,4,5,6,6,12,12,5,3,13,15,2,3.044522,3,4,5,4,4,4,6,6.401917,6,0.0,5.783825,6.831954,3,4,1,3,6.830874,6.520621,0,7.380256,0.0,0.0,2,1,3,1,3,7,7,1,4,6,12.0,1,2.0,6.154858,4,5,2,5.888878,3.610918,0.0,0,0.0,0,2,4,3,0.0,6,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0
4,1465,120,5,3.7612,8.518193,1,2,2,2,1,4,2,8,8,4,3,6,8,5,18,18,5,3,11,13,3,0.0,2,4,5,3,4,4,4,5.575949,6,0.0,6.925595,7.155396,3,4,1,3,7.154615,0.0,0,7.154615,0.0,0.0,2,0,2,1,3,5,7,0,5,6,18.0,2,2.0,6.228511,4,5,2,0.0,4.418841,0.0,0,4.976734,0,2,4,3,0.0,1,2010,9,5,0,0,0,0,0,0,0,0,2010,0,0


In [66]:
X_test=dataset.drop(['Id'],axis=1)

In [67]:
selected_feat = [
    'OverallQual', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
       'CentralAir', 'GrLivArea', 'FullBath', 'Fireplaces', 'GarageCars',
       'WoodDeckSF', 'OpenPorchSF'
]

In [68]:
X_test_selected = X_test[selected_feat] # Extract matching columns from X_test
X_test_selected.shape
# X_test_selected.head(10)
X_test_selected_nan = [feature for feature in X_test_selected.columns if X_test_selected[feature].isnull().sum() > 0]
X_test_selected_nan

[]

In [69]:
import joblib
model_loaded = joblib.load("linear_regression_model.pkl")  # Load trained model

In [70]:
# Assume 'new_data' is a DataFrame with the same feature columns as X_train
predicted_price = model_loaded.predict(X_test_selected)

# import numpy as np

# Apply inverse log transformation
predicted_price_original = np.exp(predicted_price)
print(f"Predicted SalePrice : {predicted_price}")
print(f"Predicted SalePrice (Original Scale): {predicted_price_original}")

Predicted SalePrice : [ 8.75928861  9.24026828 21.59770357 ... 15.53514404 19.52106558
 23.73427862]
Predicted SalePrice (Original Scale): [6.36957873e+03 1.03038025e+04 2.39752686e+09 ... 5.58248201e+06
 3.00532236e+08 2.03079570e+10]
