In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
df = pd.read_csv('houseprice.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [15]:
df.shape

(1460, 81)

In [16]:
df['Id'].nunique()

1460

In [17]:
df.drop('Id',axis=1,inplace=True)
df.shape

(1460, 80)

In [18]:
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [19]:
df.drop(['PoolQC','Alley','MiscFeature'],axis=1,inplace=True)
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
Fence           1179
dtype: int64

In [20]:
cat_nan = [i for i in df.columns if df[i].isnull().sum()>0 and df[i].dtypes=='object']
num_nan = [i for i in df.columns if df[i].isnull().sum()>0 and df[i].dtypes!='object']
print(cat_nan)
print(len(cat_nan))
print(num_nan)
print(len(num_nan))

['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'Fence']
13
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
3


#### Basement null value Handling

In [21]:
bs_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for i in bs_cols:
    df[i].fillna('No_Base',inplace=True)
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
MasVnrType         8
MasVnrArea         8
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
Fence           1179
dtype: int64

In [22]:
gar_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for i in gar_cols:
    df[i].fillna('No_Garage',inplace=True)
nv = df.isnull().sum()
nv[nv>0]

LotFrontage     259
MasVnrType        8
MasVnrArea        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [25]:
for i in ['LotFrontage','MasVnrArea']:
    df[i].fillna(df[i].mean(),inplace=True)
nv = df.isnull().sum()
nv[nv>0]

MasVnrType        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [29]:
nv = df.isnull().sum()
nv[nv>0]

MasVnrType        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [34]:
for i in nv[nv>0].index:
    print(df[i].value_counts())

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64
2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
          ..
1906.0     1
1908.0     1
1933.0     1
1900.0     1
1927.0     1
Name: GarageYrBlt, Length: 97, dtype: int64
MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64


In [35]:
nv[nv>0].index

Index(['MasVnrType', 'Electrical', 'FireplaceQu', 'GarageYrBlt', 'Fence'], dtype='object')

In [36]:
for i in ['MasVnrType', 'Electrical', 'FireplaceQu', 'Fence']:
    x = df[i].mode().max()
    df[i].fillna(x,inplace=True)

In [37]:
nv = df.isnull().sum()
nv[nv>0]

GarageYrBlt    81
dtype: int64

In [38]:
df['GarageYrBlt'].fillna(2005.0,inplace=True)
nv = df.isnull().sum()
nv[nv>0]

Series([], dtype: int64)

In [39]:
df.shape

(1460, 77)

### Outlier Handling

In [40]:
df.describe(percentiles=[0.97,0.98,0.99]).T

Unnamed: 0,count,mean,std,min,50%,97%,98%,99%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,50.0,160.0,188.2,190.0,190.0
LotFrontage,1460.0,70.049958,22.024023,21.0,70.049958,114.0,120.82,137.41,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,9478.5,21571.8,25251.62,37567.64,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,6.0,9.0,9.0,10.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,8.0,8.0,9.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1973.0,2007.0,2008.0,2009.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1994.0,2008.0,2008.0,2009.0,2010.0
MasVnrArea,1460.0,103.685262,180.569112,0.0,0.0,573.69,650.82,791.28,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,383.5,1375.99,1442.64,1572.41,5644.0
BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.0,546.23,658.12,830.38,1474.0


In [41]:
df.describe(percentiles=[0.97,0.98,0.99]).T.index

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [42]:
out_cols_98 = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2',
               'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
              'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
              'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
               'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
              'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
               'SalePrice']
print(out_cols_98)

['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']


In [45]:
def oh1(x):
    x = x.clip(upper=x.quantile(0.98))
    return x

In [46]:
df[out_cols_98] = df[out_cols_98].apply(oh1)

In [47]:
df.describe(percentiles=[0.01,0.02,0.03,0.05]).T

Unnamed: 0,count,mean,std,min,1%,2%,3%,5%,50%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,20.0,20.0,20.0,50.0,190.0
LotFrontage,1460.0,69.405164,18.845067,21.0,21.0,24.0,30.0,35.95,70.049958,120.82
LotArea,1460.0,9884.318219,4269.103743,1300.0,1680.0,2124.74,2522.0,3311.7,9478.5,25251.62
OverallQual,1460.0,6.099315,1.382997,1.0,3.0,4.0,4.0,4.0,6.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,3.0,3.0,4.0,4.0,5.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1899.18,1908.36,1910.0,1916.0,1973.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1950.0,1950.0,1950.0,1950.0,1994.0,2010.0
MasVnrArea,1460.0,99.054851,159.552455,0.0,0.0,0.0,0.0,0.0,0.0,650.82
BsmtFinSF1,1460.0,436.678219,424.424497,0.0,0.0,0.0,0.0,0.0,383.5,1442.64
BsmtFinSF2,1460.0,41.938082,136.258146,0.0,0.0,0.0,0.0,0.0,0.0,658.12


In [52]:
out_cols_01 = ['LotArea', 'OverallQual', 'OverallCond','1stFlrSF','TotRmsAbvGrd',
                'GrLivArea','SalePrice']
print(out_cols_01)

['LotArea', 'OverallQual', 'OverallCond', '1stFlrSF', 'TotRmsAbvGrd', 'GrLivArea', 'SalePrice']


In [53]:
def oh2(x):
    x = x.clip(lower=x.quantile(0.01))
    return x

In [54]:
df[out_cols_01] = df[out_cols_01].apply(oh2)

In [55]:
df.shape

(1460, 77)

### Get_Dummies

In [56]:
cat_cols = [i for i in df.columns if df[i].dtypes=='object']
print(len(cat_cols))
print(cat_cols)

40
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'SaleType', 'SaleCondition']


In [57]:
df1 = pd.get_dummies(data=df,columns=cat_cols,drop_first=True)
df1.shape

(1460, 249)

In [62]:
x = df1.drop('SalePrice',axis=1)
y = df1['SalePrice']
print(x.shape)
print(y.shape)

(1460, 248)
(1460,)


In [63]:
from sklearn.preprocessing import StandardScaler

In [64]:
sc = StandardScaler()
x_sc = sc.fit_transform(x)
x_sc.shape

(1460, 248)

### PCA from skelarn

In [65]:
from sklearn.decomposition import PCA

In [68]:
pca1 = PCA()
pc1 = pca1.fit_transform(x_sc)
print(pc1.shape)

(1460, 248)


In [71]:
eig_val1 = pd.DataFrame({'Var':pca1.explained_variance_})
eig_val1.head()

Unnamed: 0,Var
0,16.889245
1,8.089807
2,7.198392
3,5.890462
4,5.542149


In [74]:
eig_val1['Contri'] = eig_val1['Var']*100/eig_val1['Var'].sum()
eig_val1.head()

Unnamed: 0,Var,Contri
0,16.889245,6.888848
1,8.089807,3.2997
2,7.198392,2.936107
3,5.890462,2.402624
4,5.542149,2.260552


In [75]:
eig_val1['CumSum'] = eig_val1['Contri'].cumsum()
eig_val1.tail()

Unnamed: 0,Var,Contri,CumSum
243,6.048291e-32,2.466999e-32,100.0
244,6.048291e-32,2.466999e-32,100.0
245,6.048291e-32,2.466999e-32,100.0
246,4.627411e-32,1.887445e-32,100.0
247,2.6434550000000003e-32,1.078222e-32,100.0


In [76]:
eig_val1[eig_val1['CumSum']<76].tail()

Unnamed: 0,Var,Contri,CumSum
83,1.019021,0.415642,74.251955
84,1.009164,0.411621,74.663576
85,1.002222,0.40879,75.072366
86,0.993842,0.405372,75.477738
87,0.978277,0.399023,75.876761


In [77]:
res_df = pd.DataFrame(pc1)
res_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,238,239,240,241,242,243,244,245,246,247
0,-4.464266,1.864573,-0.442591,-0.19839,-1.442747,-2.064312,-0.583703,-1.332883,1.683657,-0.987843,...,0.001264,1.746938e-15,-3.833133e-16,-2.503281e-16,2.785685e-16,9.759917000000001e-17,3.225347e-16,-3.2644280000000005e-17,3.6987530000000005e-17,8.533404000000001e-17
1,-0.10244,-3.101643,0.375792,-1.028276,0.580757,-0.839022,0.912768,-1.174784,-0.263074,-0.199181,...,0.0436,-2.1765090000000003e-17,1.67396e-16,9.943667e-17,-8.860665000000001e-17,-2.611317e-16,-2.570838e-16,2.892276e-16,2.017975e-16,-1.567469e-16
2,-5.152943,1.075597,0.064882,0.333054,-0.703609,-2.513562,0.007636,-1.981288,1.815666,-0.791633,...,-0.013865,-1.432877e-15,1.341544e-16,-2.51709e-16,-1.893685e-16,-3.9793950000000005e-17,-9.825584e-17,2.5344890000000003e-17,8.476779000000001e-17,6.27271e-17
3,1.317366,0.372536,-0.794489,2.993878,-0.539634,0.981081,0.865942,-0.85297,-0.922942,-1.604405,...,0.003966,-6.209334e-16,2.965726e-16,5.047826e-16,-1.318774e-16,7.755787000000001e-17,-1.530063e-16,-7.167035e-17,9.579173e-19,-2.46501e-16
4,-6.891859,0.377635,0.802507,2.566966,0.745329,-3.0653,-0.449504,-0.834175,2.748574,-0.930788,...,-0.003807,1.050813e-15,-2.513606e-16,-4.058885e-16,-2.831473e-16,-2.128887e-16,5.784383e-16,-2.143634e-16,-2.689726e-16,2.418104e-16


In [80]:
x_reg = res_df.iloc[:,:86]

y_reg = y
print(x_reg.shape)
print(y_reg.shape)

(1460, 86)
(1460,)


In [81]:
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1 = train_test_split(x_reg,y_reg,test_size=0.25)
print(x_train1.shape)
print(x_test1.shape)
print(y_train1.shape)
print(y_test1.shape)

(1095, 86)
(365, 86)
(1095,)
(365,)


In [83]:
from sklearn.linear_model import LinearRegression

In [84]:
m1 = LinearRegression()
m1.fit(x_train1,y_train1)
print('Training_score',m1.score(x_train1,y_train1))
print('Testing_score',m1.score(x_test1,y_test1))

Training_score 0.8874792635641522
Testing_score 0.8413379710739936


In [86]:
ypred = m1.predict(x_test1)

In [None]:
# MSE,MAE,RMSE, R2_score