In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv('houseprice.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.shape

(1460, 81)

In [4]:
df['Id'].nunique()

1460

In [5]:
df.drop('Id',axis=1,inplace=True)
df.shape

(1460, 80)

In [6]:
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [7]:
df.drop(['PoolQC','Alley','MiscFeature'],axis=1,inplace=True)
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
Fence           1179
dtype: int64

In [8]:
cat_nan = [i for i in df.columns if df[i].isnull().sum()>0 and df[i].dtypes=='object']
num_nan = [i for i in df.columns if df[i].isnull().sum()>0 and df[i].dtypes!='object']
print(cat_nan)
print(len(cat_nan))
print(num_nan)
print(len(num_nan))

['MasVnrType', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'Fence']
13
['LotFrontage', 'MasVnrArea', 'GarageYrBlt']
3


#### Basement null value Handling

In [9]:
bs_cols = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']
for i in bs_cols:
    df[i].fillna('No_Base',inplace=True)
nv = df.isnull().sum()
nv[nv>0]

LotFrontage      259
MasVnrType         8
MasVnrArea         8
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
Fence           1179
dtype: int64

In [10]:
gar_cols = ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
for i in gar_cols:
    df[i].fillna('No_Garage',inplace=True)
nv = df.isnull().sum()
nv[nv>0]

LotFrontage     259
MasVnrType        8
MasVnrArea        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [11]:
for i in ['LotFrontage','MasVnrArea']:
    df[i].fillna(df[i].mean(),inplace=True)
nv = df.isnull().sum()
nv[nv>0]

MasVnrType        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [12]:
nv = df.isnull().sum()
nv[nv>0]

MasVnrType        8
Electrical        1
FireplaceQu     690
GarageYrBlt      81
Fence          1179
dtype: int64

In [13]:
for i in nv[nv>0].index:
    print(df[i].value_counts())

None       864
BrkFace    445
Stone      128
BrkCmn      15
Name: MasVnrType, dtype: int64
SBrkr    1334
FuseA      94
FuseF      27
FuseP       3
Mix         1
Name: Electrical, dtype: int64
Gd    380
TA    313
Fa     33
Ex     24
Po     20
Name: FireplaceQu, dtype: int64
2005.0    65
2006.0    59
2004.0    53
2003.0    50
2007.0    49
          ..
1906.0     1
1908.0     1
1933.0     1
1900.0     1
1927.0     1
Name: GarageYrBlt, Length: 97, dtype: int64
MnPrv    157
GdPrv     59
GdWo      54
MnWw      11
Name: Fence, dtype: int64


In [14]:
nv[nv>0].index

Index(['MasVnrType', 'Electrical', 'FireplaceQu', 'GarageYrBlt', 'Fence'], dtype='object')

In [15]:
for i in ['MasVnrType', 'Electrical', 'FireplaceQu', 'Fence']:
    x = df[i].mode().max()
    df[i].fillna(x,inplace=True)

In [16]:
nv = df.isnull().sum()
nv[nv>0]

GarageYrBlt    81
dtype: int64

In [17]:
df['GarageYrBlt'].fillna(2005.0,inplace=True)
nv = df.isnull().sum()
nv[nv>0]

Series([], dtype: int64)

In [18]:
df.shape

(1460, 77)

### Outlier Handling

In [19]:
df.describe(percentiles=[0.97,0.98,0.99]).T

Unnamed: 0,count,mean,std,min,50%,97%,98%,99%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,50.0,160.0,188.2,190.0,190.0
LotFrontage,1460.0,70.049958,22.024023,21.0,70.049958,114.0,120.82,137.41,313.0
LotArea,1460.0,10516.828082,9981.264932,1300.0,9478.5,21571.8,25251.62,37567.64,215245.0
OverallQual,1460.0,6.099315,1.382997,1.0,6.0,9.0,9.0,10.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,5.0,8.0,8.0,9.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1973.0,2007.0,2008.0,2009.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1994.0,2008.0,2008.0,2009.0,2010.0
MasVnrArea,1460.0,103.685262,180.569112,0.0,0.0,573.69,650.82,791.28,1600.0
BsmtFinSF1,1460.0,443.639726,456.098091,0.0,383.5,1375.99,1442.64,1572.41,5644.0
BsmtFinSF2,1460.0,46.549315,161.319273,0.0,0.0,546.23,658.12,830.38,1474.0


In [20]:
df.describe(percentiles=[0.97,0.98,0.99]).T.index

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold', 'SalePrice'],
      dtype='object')

In [21]:
out_cols_98 = ['LotFrontage','LotArea','MasVnrArea','BsmtFinSF1','BsmtFinSF2',
               'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
              'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
              'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
               'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
              'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
               'SalePrice']
print(out_cols_98)

['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'SalePrice']


In [22]:
def oh1(x):
    x = x.clip(upper=x.quantile(0.98))
    return x

In [23]:
df[out_cols_98] = df[out_cols_98].apply(oh1)

In [24]:
df.describe(percentiles=[0.01,0.02,0.03,0.05]).T

Unnamed: 0,count,mean,std,min,1%,2%,3%,5%,50%,max
MSSubClass,1460.0,56.89726,42.300571,20.0,20.0,20.0,20.0,20.0,50.0,190.0
LotFrontage,1460.0,69.405164,18.845067,21.0,21.0,24.0,30.0,35.95,70.049958,120.82
LotArea,1460.0,9884.318219,4269.103743,1300.0,1680.0,2124.74,2522.0,3311.7,9478.5,25251.62
OverallQual,1460.0,6.099315,1.382997,1.0,3.0,4.0,4.0,4.0,6.0,10.0
OverallCond,1460.0,5.575342,1.112799,1.0,3.0,3.0,4.0,4.0,5.0,9.0
YearBuilt,1460.0,1971.267808,30.202904,1872.0,1899.18,1908.36,1910.0,1916.0,1973.0,2010.0
YearRemodAdd,1460.0,1984.865753,20.645407,1950.0,1950.0,1950.0,1950.0,1950.0,1994.0,2010.0
MasVnrArea,1460.0,99.054851,159.552455,0.0,0.0,0.0,0.0,0.0,0.0,650.82
BsmtFinSF1,1460.0,436.678219,424.424497,0.0,0.0,0.0,0.0,0.0,383.5,1442.64
BsmtFinSF2,1460.0,41.938082,136.258146,0.0,0.0,0.0,0.0,0.0,0.0,658.12


In [25]:
out_cols_01 = ['LotArea', 'OverallQual', 'OverallCond','1stFlrSF','TotRmsAbvGrd',
                'GrLivArea','SalePrice']
print(out_cols_01)

['LotArea', 'OverallQual', 'OverallCond', '1stFlrSF', 'TotRmsAbvGrd', 'GrLivArea', 'SalePrice']


In [26]:
def oh2(x):
    x = x.clip(lower=x.quantile(0.01))
    return x

In [27]:
df[out_cols_01] = df[out_cols_01].apply(oh2)

In [80]:
df.shape

(1460, 77)

In [81]:
df1 = df.copy()
df1.shape

(1460, 77)

In [82]:
cat_cols = [i for i in df.columns if df[i].dtypes=='object']
print(len(cat_cols))

40


In [83]:
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
for i in cat_cols:
    df1[i] = lb.fit_transform(df1[i])
df1.dtypes.value_counts()

int32      40
int64      19
float64    18
dtype: int64

In [84]:
x = df1.drop('SalePrice',axis=1)
y = df1['SalePrice']
print(x.shape)
print(y.shape)

(1460, 76)
(1460,)


In [85]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1095, 76)
(365, 76)
(1095,)
(365,)


In [86]:
m1 = LinearRegression()
m1.fit(x_train,y_train)
print('Training_score',m1.score(x_train,y_train))
print('Testing_score',m1.score(x_test,y_test))

Training_score 0.8975795484903208
Testing_score 0.8775319555943935


In [96]:
sc = StandardScaler()
x_train_sc = sc.fit_transform(x_train) 
x_test_sc = sc.fit_transform(x_test) 
# x_train_sc = x_train_sc.transform(x_train) 
# x_test_sc = sc.fit(x_test) 
# x_test_sc = x_test_sc.transform(x_test)
# print(x_train_sc.shape)
# print(x_test_sc.shape)

In [97]:
m1 = LinearRegression()
m1.fit(x_train_sc,y_train)
print('Training_score',m1.score(x_train_sc,y_train))
print('Testing_score',m1.score(x_test_sc,y_test))

Training_score 0.8975793601076862
Testing_score 0.8703350703302973


In [92]:
sc = StandardScaler()
x_train_sc = sc.fit(x_train) 
x_train_sc = x_train_sc.transform(x_train) 
x_test_sc = sc.fit(x_test) 
x_test_sc = x_test_sc.transform(x_test)
print(x_train_sc.shape)
print(x_test_sc.shape)

(1095, 76)
(365, 76)


In [95]:
m1 = LinearRegression()
m1.fit(x_train_sc,y_train)
print('Training_score',m1.score(x_train_sc,y_train))
print('Testing_score',m1.score(x_test_sc,y_test))

Training_score 0.8975793601076862
Testing_score 0.8703350703302973


#### PCA

In [114]:
pca1 = PCA()
pc_x_train = pca1.fit_transform(x_train_sc)
pc_x_test = pca1.transform(x_test_sc)
print(pc_x_train.shape)
print(pc_x_test.shape)

(1095, 76)
(365, 76)


In [115]:
eig_val = pd.DataFrame({'Var':pca1.explained_variance_})
eig_val.head()

Unnamed: 0,Var
0,10.384868
1,4.197843
2,3.74648
3,3.140968
4,2.361428


In [116]:
eig_val['Contri'] = eig_val['Var']*100/eig_val['Var'].sum()
eig_val['CumSum'] = eig_val['Contri'].cumsum()
eig_val.head()

Unnamed: 0,Var,Contri,CumSum
0,10.384868,14.212856,14.212856
1,4.197843,5.745219,19.958075
2,3.74648,5.127477,25.085551
3,3.140968,4.298766,29.384318
4,2.361428,3.231879,32.616197


In [118]:
eig_val[eig_val['CumSum']<80].tail(7)

Unnamed: 0,Var,Contri,CumSum
27,0.884707,1.210821,72.91639
28,0.853366,1.167927,74.084317
29,0.842906,1.153611,75.237928
30,0.830766,1.136996,76.374924
31,0.800538,1.095625,77.47055
32,0.784583,1.07379,78.54434
33,0.776902,1.063277,79.607617


In [124]:
pc_train = pd.DataFrame(pc_x_train)
pc_train = pc_train.iloc[:,:30]
print(pc_train.shape)
pc_train.head()

(1095, 30)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,-2.989308,0.976813,-0.846649,0.292327,-1.70224,-0.345838,-0.420542,0.655139,-1.293456,0.158067,...,-0.442029,0.328269,-1.813065,0.541405,-0.068698,-0.460453,0.679631,0.874045,-0.565088,-0.949712
1,-1.791703,2.03824,-0.939618,0.295328,-1.503408,0.792044,-1.746281,2.19792,-0.929943,-0.320377,...,0.269205,-0.124805,-0.762342,0.117262,-0.61777,-0.04864,0.469441,0.986523,-0.210479,0.751252
2,3.296313,0.058509,3.282697,-0.402919,1.958097,-1.648124,-2.627722,0.152366,1.479858,-0.467454,...,0.612104,-0.039307,0.205956,-1.511367,0.698742,-1.046994,0.040986,-0.182745,0.924314,1.59499
3,5.982496,0.52164,-0.724527,0.266458,1.412489,1.258754,-0.883326,0.676762,0.064002,-2.216966,...,-1.323492,0.464534,-0.263541,0.887727,0.482176,-1.009214,0.378742,0.661886,-0.14648,1.225249
4,8.01323,0.967727,0.891592,0.86483,3.434425,-0.611202,-2.370068,0.367729,0.819353,-0.672292,...,-0.810315,0.628391,0.745845,1.146182,1.492018,-0.897606,0.233381,-2.357366,-0.094642,-0.236181


In [122]:
pc_test = pd.DataFrame(pc_x_test)
pc_test = pc_test.iloc[:,:30]
print(pc_test.shape)
pc_test.head()

(365, 30)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,1.33085,-0.293743,-1.86209,-2.574635,-1.832428,-0.684018,0.643035,0.286528,-0.395829,0.26655,...,-1.250416,-0.417997,0.343258,-0.16907,0.086213,-0.251043,-0.200621,1.09764,-0.197166,-0.090281
1,3.016077,-0.889899,0.828192,-1.664885,-0.19044,-0.423672,-0.063394,-0.462595,0.350169,-1.471485,...,0.365406,-0.48116,-0.502337,-0.947816,0.145588,-0.220981,-0.041957,-0.768075,0.986858,-0.537813
2,1.908798,-2.172842,-3.150334,-0.961047,0.958583,-2.102296,0.535765,0.745557,1.048587,1.725041,...,0.275067,-0.792586,-0.467624,-0.700932,-0.860776,1.812595,-0.841202,-0.256301,0.368162,0.29649
3,2.702761,-1.864979,-4.048003,-1.390491,-0.090056,-0.856518,0.58851,0.748826,-2.91643,1.514294,...,-0.107085,0.302479,0.356842,-1.02062,0.825513,-0.322551,0.182413,0.495506,-0.516911,-0.276981
4,6.998859,-1.641952,0.9046,-2.619936,2.349754,0.423656,-1.404924,0.69494,-0.468636,-1.326026,...,-0.818536,-0.084805,-0.526003,0.265893,-0.023249,0.468795,-0.646781,0.089613,-0.269265,-0.347571


In [126]:
print(pc_train.shape)
print(pc_test.shape)
print(y_train.shape)
print(y_test.shape)

(1095, 30)
(365, 30)
(1095,)
(365,)


In [127]:
m1 = LinearRegression()
m1.fit(pc_train,y_train)
print('Training_score',m1.score(pc_train,y_train))
print('Testing_score',m1.score(pc_test,y_test))

Training_score 0.8747690383150638
Testing_score 0.854233928966726


### Get_Dummies

In [29]:
cat_cols = [i for i in df.columns if df[i].dtypes=='object']
print(len(cat_cols))
print(cat_cols)

40
['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'Fence', 'SaleType', 'SaleCondition']


In [30]:
df1 = pd.get_dummies(data=df,columns=cat_cols,drop_first=True)
df1.shape

(1460, 249)

In [31]:
x = df1.drop('SalePrice',axis=1)
y = df1['SalePrice']
print(x.shape)
print(y.shape)

(1460, 248)
(1460,)


In [75]:
from sklearn.preprocessing import StandardScaler

In [76]:
sc = StandardScaler()
fit = sc.fit(x)
x_sc = fit.transform(x)
print(x_sc.shape)

(1460, 248)


In [77]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x_sc,y,test_size=0.25)
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
print(y_test.shape)

(1095, 248)
(365, 248)
(1095,)
(365,)


In [79]:
m1 = LinearRegression()
m1.fit(x_train,y_train)
print('Training_score',m1.score(x_train,y_train))
print('Testing_score',m1.score(x_test,y_test))

Training_score 0.9446589268956738
Testing_score -1.0834243583654046e+21


### PCA from skelarn

In [34]:
from sklearn.decomposition import PCA

In [73]:
pca1 = PCA()
pc1 = pca1.fit(x_sc)
pc1 = pc1.transform(x_sc)
print(pc1.shape)

(1460, 248)


In [36]:
eig_val1 = pd.DataFrame({'Var':pca1.explained_variance_})
eig_val1.head()

Unnamed: 0,Var
0,16.889245
1,8.089807
2,7.198392
3,5.890462
4,5.542149


In [37]:
eig_val1['Contri'] = eig_val1['Var']*100/eig_val1['Var'].sum()
eig_val1.head()

Unnamed: 0,Var,Contri
0,16.889245,6.888848
1,8.089807,3.2997
2,7.198392,2.936107
3,5.890462,2.402624
4,5.542149,2.260552


In [38]:
eig_val1['CumSum'] = eig_val1['Contri'].cumsum()
eig_val1.tail()

Unnamed: 0,Var,Contri,CumSum
243,6.048291e-32,2.466999e-32,100.0
244,6.048291e-32,2.466999e-32,100.0
245,6.048291e-32,2.466999e-32,100.0
246,4.627411e-32,1.887445e-32,100.0
247,2.6434550000000003e-32,1.078222e-32,100.0


In [39]:
eig_val1[eig_val1['CumSum']<76].tail()

Unnamed: 0,Var,Contri,CumSum
83,1.019021,0.415642,74.251955
84,1.009164,0.411621,74.663576
85,1.002222,0.40879,75.072366
86,0.993842,0.405372,75.477738
87,0.978277,0.399023,75.876761


In [40]:
res_df = pd.DataFrame(pc1)
res_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,238,239,240,241,242,243,244,245,246,247
0,-4.464266,1.864573,-0.442591,-0.19839,-1.442747,-2.064312,-0.583703,-1.332883,1.683657,-0.987843,...,0.001264,1.746938e-15,-3.833133e-16,-2.503281e-16,2.785685e-16,9.759917000000001e-17,3.225347e-16,-3.2644280000000005e-17,3.6987530000000005e-17,8.533404000000001e-17
1,-0.10244,-3.101643,0.375792,-1.028276,0.580757,-0.839022,0.912768,-1.174784,-0.263074,-0.199181,...,0.0436,-2.1765090000000003e-17,1.67396e-16,9.943667e-17,-8.860665000000001e-17,-2.611317e-16,-2.570838e-16,2.892276e-16,2.017975e-16,-1.567469e-16
2,-5.152943,1.075597,0.064882,0.333054,-0.703609,-2.513562,0.007636,-1.981288,1.815666,-0.791633,...,-0.013865,-1.432877e-15,1.341544e-16,-2.51709e-16,-1.893685e-16,-3.9793950000000005e-17,-9.825584e-17,2.5344890000000003e-17,8.476779000000001e-17,6.27271e-17
3,1.317366,0.372536,-0.794489,2.993878,-0.539634,0.981081,0.865942,-0.85297,-0.922942,-1.604405,...,0.003966,-6.209334e-16,2.965726e-16,5.047826e-16,-1.318774e-16,7.755787000000001e-17,-1.530063e-16,-7.167035e-17,9.579173e-19,-2.46501e-16
4,-6.891859,0.377635,0.802507,2.566966,0.745329,-3.0653,-0.449504,-0.834175,2.748574,-0.930788,...,-0.003807,1.050813e-15,-2.513606e-16,-4.058885e-16,-2.831473e-16,-2.128887e-16,5.784383e-16,-2.143634e-16,-2.689726e-16,2.418104e-16


In [42]:
x_reg = res_df.iloc[:,:86]

y_reg = y
print(x_reg.shape)
print(y_reg.shape)

(1460, 86)
(1460,)


In [44]:
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1 = train_test_split(x_reg,y_reg,test_size=0.25)
print(x_train1.shape)
print(x_test1.shape)
print(y_train1.shape)
print(y_test1.shape)

(1095, 86)
(365, 86)
(1095,)
(365,)


In [45]:
from sklearn.linear_model import LinearRegression

In [46]:
m1 = LinearRegression()
m1.fit(x_train1,y_train1)
print('Training_score',m1.score(x_train1,y_train1))
print('Testing_score',m1.score(x_test1,y_test1))

Training_score 0.8936064015085051
Testing_score 0.818911813985381


In [47]:
ypred = m1.predict(x_test1)

In [49]:
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score

In [52]:
mae = mean_absolute_error(y_test1,ypred)
mse = mean_squared_error(y_test1,ypred)
rmse = mean_squared_error(y_test1,ypred,squared=False)
r2s = r2_score(y_test1,ypred)
print('MAE',mae)
print('MSE',mse)
print('RMSE',rmse)
print('R2_Score',r2s)

MAE 19510.906430757314
MSE 935988060.7579399
RMSE 30593.921957767034
R2_Score 0.818911813985381


In [53]:
print(x_reg.shape)

(1460, 86)


In [54]:
x_reg.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,76,77,78,79,80,81,82,83,84,85
0,-4.464266,1.864573,-0.442591,-0.19839,-1.442747,-2.064312,-0.583703,-1.332883,1.683657,-0.987843,...,0.107887,0.029752,0.307942,-0.436308,0.57069,0.112625,0.176048,-0.463447,0.078269,0.107281
1,-0.10244,-3.101643,0.375792,-1.028276,0.580757,-0.839022,0.912768,-1.174784,-0.263074,-0.199181,...,-0.528393,-1.528359,-0.113512,0.664872,1.226447,1.842406,1.22858,1.487395,-0.95759,0.764585
2,-5.152943,1.075597,0.064882,0.333054,-0.703609,-2.513562,0.007636,-1.981288,1.815666,-0.791633,...,0.395511,-0.322894,-0.159934,-0.035286,-0.757463,-0.276181,-0.098577,-0.188813,-0.519777,0.30203
3,1.317366,0.372536,-0.794489,2.993878,-0.539634,0.981081,0.865942,-0.85297,-0.922942,-1.604405,...,0.986687,0.681804,0.447326,-0.578144,0.355476,0.932366,0.149926,-1.283881,1.490924,-0.246296
4,-6.891859,0.377635,0.802507,2.566966,0.745329,-3.0653,-0.449504,-0.834175,2.748574,-0.930788,...,-0.791224,0.296559,0.169668,-0.455452,-1.195222,0.959332,0.94842,0.375732,1.178538,-0.336009


In [58]:
t = []
s = 'PC'   # PC = Principal component
for i in range(x_reg.shape[1]):  # x_reg.shape =(1460,86)  # i=0,1,2,3,.....,85
    d = s + str(i)
    t.append(d)
print(t)

['PC0', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10', 'PC11', 'PC12', 'PC13', 'PC14', 'PC15', 'PC16', 'PC17', 'PC18', 'PC19', 'PC20', 'PC21', 'PC22', 'PC23', 'PC24', 'PC25', 'PC26', 'PC27', 'PC28', 'PC29', 'PC30', 'PC31', 'PC32', 'PC33', 'PC34', 'PC35', 'PC36', 'PC37', 'PC38', 'PC39', 'PC40', 'PC41', 'PC42', 'PC43', 'PC44', 'PC45', 'PC46', 'PC47', 'PC48', 'PC49', 'PC50', 'PC51', 'PC52', 'PC53', 'PC54', 'PC55', 'PC56', 'PC57', 'PC58', 'PC59', 'PC60', 'PC61', 'PC62', 'PC63', 'PC64', 'PC65', 'PC66', 'PC67', 'PC68', 'PC69', 'PC70', 'PC71', 'PC72', 'PC73', 'PC74', 'PC75', 'PC76', 'PC77', 'PC78', 'PC79', 'PC80', 'PC81', 'PC82', 'PC83', 'PC84', 'PC85']


In [59]:
x_reg.columns = t
x_reg.head()

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,...,PC76,PC77,PC78,PC79,PC80,PC81,PC82,PC83,PC84,PC85
0,-4.464266,1.864573,-0.442591,-0.19839,-1.442747,-2.064312,-0.583703,-1.332883,1.683657,-0.987843,...,0.107887,0.029752,0.307942,-0.436308,0.57069,0.112625,0.176048,-0.463447,0.078269,0.107281
1,-0.10244,-3.101643,0.375792,-1.028276,0.580757,-0.839022,0.912768,-1.174784,-0.263074,-0.199181,...,-0.528393,-1.528359,-0.113512,0.664872,1.226447,1.842406,1.22858,1.487395,-0.95759,0.764585
2,-5.152943,1.075597,0.064882,0.333054,-0.703609,-2.513562,0.007636,-1.981288,1.815666,-0.791633,...,0.395511,-0.322894,-0.159934,-0.035286,-0.757463,-0.276181,-0.098577,-0.188813,-0.519777,0.30203
3,1.317366,0.372536,-0.794489,2.993878,-0.539634,0.981081,0.865942,-0.85297,-0.922942,-1.604405,...,0.986687,0.681804,0.447326,-0.578144,0.355476,0.932366,0.149926,-1.283881,1.490924,-0.246296
4,-6.891859,0.377635,0.802507,2.566966,0.745329,-3.0653,-0.449504,-0.834175,2.748574,-0.930788,...,-0.791224,0.296559,0.169668,-0.455452,-1.195222,0.959332,0.94842,0.375732,1.178538,-0.336009


#### Applying Regresion on Stanradized data

In [66]:
# x_sc
x_train2,x_test2,y_train2,y_test2 = train_test_split(x_sc,y_reg,test_size=0.25,random_state=25)
print(x_train2.shape)
print(x_test2.shape)
print(y_train2.shape)
print(y_test2.shape)

(1095, 248)
(365, 248)
(1095,)
(365,)


In [69]:
from sklearn.model_selection import KFold,cross_val_score

In [72]:
kf = KFold(n_splits=20)
model = LinearRegression()
scores = cross_val_score(model,x_train2,y_train2,scoring='r2',cv=kf)
print(scores)
print(scores.mean())

[-3.58362890e+15 -1.91291128e+17  9.42444143e-01 -1.12967003e+20
 -7.09708432e+18  8.05075611e-01 -4.73460335e+22 -3.78629680e+18
 -1.80597322e+22 -4.14769567e+20 -3.73277400e+07 -8.16632053e+17
 -1.34736601e+12  8.24177073e-01  8.50513737e-01  8.10723730e-01
 -7.82189140e+18 -8.86510683e+20 -2.60568164e+16  8.95779244e-01]
-3.3419877928581375e+21


In [67]:
m2 = LinearRegression()
m2.fit(x_train2,y_train2)
print('Training_score',m2.score(x_train2,y_train2))
print('Testing_score',m2.score(x_test2,y_test2))

Training_score 0.9440035745259744
Testing_score -6.685557423422256e+21
