In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures

In [2]:
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')
base = pd.read_csv('datasets/baseline.csv')

In [3]:
print(train.shape)
print(test.shape)

(2051, 81)
(878, 80)


In [4]:
pd.set_option('max_colwidth', 256)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
train.isnull().sum().sort_values(ascending = False)

Pool QC            2042
Misc Feature       1986
Alley              1911
Fence              1651
Fireplace Qu       1000
Lot Frontage        330
Garage Finish       114
Garage Cond         114
Garage Qual         114
Garage Yr Blt       114
Garage Type         113
Bsmt Exposure        58
BsmtFin Type 2       56
BsmtFin Type 1       55
Bsmt Cond            55
Bsmt Qual            55
Mas Vnr Type         22
Mas Vnr Area         22
Bsmt Half Bath        2
Bsmt Full Bath        2
Garage Cars           1
Garage Area           1
Bsmt Unf SF           1
BsmtFin SF 2          1
Total Bsmt SF         1
BsmtFin SF 1          1
Overall Cond          0
Exterior 2nd          0
Exterior 1st          0
Roof Matl             0
Roof Style            0
Year Remod/Add        0
Year Built            0
SalePrice             0
Overall Qual          0
Land Contour          0
PID                   0
MS SubClass           0
MS Zoning             0
Lot Area              0
Street                0
Lot Shape       

In [5]:
test[test['Id'] == 1718]

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
67,1718,528174030,120,RL,34.0,3903,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NridgHt,Norm,Norm,Twnhs,1Story,6,5,2005,2006,Gable,CompShg,VinylSd,VinylSd,Stone,182.0,Gd,TA,PConc,Gd,TA,Av,ALQ,1030,Unf,0,272,1302,GasA,Ex,Y,SBrkr,1302,0,0,1302,1,0,1,1,1,1,Gd,5,Typ,1,Gd,Attchd,2005.0,RFn,2,631,TA,TA,Y,110,50,0,0,0,0,,,,0,5,2007,WD


In [6]:
train['SalePrice'].describe()

count      2051.000000
mean     181469.701609
std       79258.659352
min       12789.000000
25%      129825.000000
50%      162500.000000
75%      214000.000000
max      611657.000000
Name: SalePrice, dtype: float64

In [7]:
features = ['MS SubClass', 'House Style', 'Overall Qual', 'Total Bsmt SF', 'Gr Liv Area', 'SalePrice'] 

In [8]:
test_feats = ['MS SubClass', 'House Style', 'Overall Qual', 'Total Bsmt SF', 'Gr Liv Area']

In [9]:
filt = train[features]
filt_test =test[test_feats]

In [10]:
filt['Total Bsmt SF'].fillna(np.mean(filt['Total Bsmt SF']), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [11]:
print(filt.shape)
print(filt_test.shape)

(2051, 6)
(878, 5)


In [12]:
len(filt.isnull().sum())

6

In [13]:
dummy_cols = ['MS SubClass', 'House Style', 'Overall Qual']

In [14]:
dumm_cols_test = ['MS SubClass', 'House Style', 'Overall Qual']

In [15]:
dummy_filt = pd.get_dummies(filt, columns = dummy_cols, drop_first = True)
dummy_filt_test = pd.get_dummies(filt_test, columns = dummy_cols, drop_first = True)

In [16]:
dummy_filt['SalePrice']

0       130500
1       220000
2       109000
3       174000
4       138500
         ...  
2046    298751
2047     82500
2048    177000
2049    144000
2050    189000
Name: SalePrice, Length: 2051, dtype: int64

In [17]:
print(dummy_filt.shape)
print(dummy_filt_test.shape)

(2051, 34)
(878, 31)


In [18]:
dummy_filt_test['Overall Qual_2'] = 0

dummy_filt_test['MS SubClass_150'] = 0 




In [19]:
x_vars = dummy_filt.drop(columns = 'SalePrice')

In [20]:
X_train = dummy_filt[x_vars.columns]
y_train = dummy_filt['SalePrice']

X_test = dummy_filt_test[x_vars.columns]

In [21]:
X_train

Unnamed: 0,Total Bsmt SF,Gr Liv Area,MS SubClass_30,MS SubClass_40,MS SubClass_45,MS SubClass_50,MS SubClass_60,MS SubClass_70,MS SubClass_75,MS SubClass_80,MS SubClass_85,MS SubClass_90,MS SubClass_120,MS SubClass_150,MS SubClass_160,MS SubClass_180,MS SubClass_190,House Style_1.5Unf,House Style_1Story,House Style_2.5Fin,House Style_2.5Unf,House Style_2Story,House Style_SFoyer,House Style_SLvl,Overall Qual_2,Overall Qual_3,Overall Qual_4,Overall Qual_5,Overall Qual_6,Overall Qual_7,Overall Qual_8,Overall Qual_9,Overall Qual_10
0,725.0,1479,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0
1,913.0,2122,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
2,1057.0,1057,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0
3,384.0,1444,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0
4,676.0,1445,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2046,1884.0,1728,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2047,861.0,861,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0
2048,896.0,1913,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2049,1200.0,1200,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0


In [22]:
sc = StandardScaler()
Z_train = sc.fit_transform(X_train)
Z_test = sc.transform(X_test)

In [23]:
lr = LinearRegression()

In [24]:
lr.fit(Z_train, y_train)

LinearRegression()

In [25]:
lr.score(Z_train, y_train)

0.8199280916382744

In [26]:
Z_test_preds = lr.predict(Z_test)

In [27]:
Z_test_preds.shape

(878,)

In [28]:
Z_test_preds

array([155990.43853396, 175057.32596979, 191307.45189116, 107776.2506069 ,
       178135.34047839,  86300.50650233, 124442.1360952 , 153296.7046279 ,
       177471.10515023, 168216.94432922, 152602.50929989, 128579.48439796,
       176587.81564807, 222916.09888569, 159650.94024405, 137293.99929792,
       136605.68300793, 138724.4194133 , 199117.3688545 , 226244.97303124,
       137752.93373233, 123105.91884996, 182286.9270498 , 141796.53278053,
       189278.95755942, 123105.91884996, 130485.61881221, 136319.77633039,
       161678.5535906 ,  50819.34017123, 110455.17616668,  91077.58801035,
       237984.03111004, 146539.11924317, 209754.88652763, 188340.06337327,
       132618.78650137, 104991.60209707, 113697.58107888, 199437.91583281,
       122736.68060848, 218632.85103629, 153492.93731569, 154731.05985662,
       198121.77355382, 101111.05678329, 212596.07211542, 128467.418225  ,
       128486.4549292 , 136258.34037698, 117033.34691591, 248063.20172117,
       265155.1322474 , 1

In [29]:
test.head()

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,Utilities,Lot Config,Land Slope,Neighborhood,Condition 1,Condition 2,Bldg Type,House Style,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Roof Style,Roof Matl,Exterior 1st,Exterior 2nd,Mas Vnr Type,Mas Vnr Area,Exter Qual,Exter Cond,Foundation,Bsmt Qual,Bsmt Cond,Bsmt Exposure,BsmtFin Type 1,BsmtFin SF 1,BsmtFin Type 2,BsmtFin SF 2,Bsmt Unf SF,Total Bsmt SF,Heating,Heating QC,Central Air,Electrical,1st Flr SF,2nd Flr SF,Low Qual Fin SF,Gr Liv Area,Bsmt Full Bath,Bsmt Half Bath,Full Bath,Half Bath,Bedroom AbvGr,Kitchen AbvGr,Kitchen Qual,TotRms AbvGrd,Functional,Fireplaces,Fireplace Qu,Garage Type,Garage Yr Blt,Garage Finish,Garage Cars,Garage Area,Garage Qual,Garage Cond,Paved Drive,Wood Deck SF,Open Porch SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,2fmCon,2Story,6,8,1910,1950,Gable,CompShg,AsbShng,AsbShng,,0.0,TA,Fa,Stone,Fa,TA,No,Unf,0,Unf,0,1020,1020,GasA,Gd,N,FuseP,908,1020,0,1928,0,0,2,0,4,2,Fa,9,Typ,0,,Detchd,1910.0,Unf,1,440,Po,Po,Y,0,60,112,0,0,0,,,,0,4,2006,WD
1,2718,905108090,90,RL,,9662,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Sawyer,Norm,Norm,Duplex,1Story,5,4,1977,1977,Gable,CompShg,Plywood,Plywood,,0.0,TA,TA,CBlock,Gd,TA,No,Unf,0,Unf,0,1967,1967,GasA,TA,Y,SBrkr,1967,0,0,1967,0,0,2,0,6,2,TA,10,Typ,0,,Attchd,1977.0,Fin,2,580,TA,TA,Y,170,0,0,0,0,0,,,,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,7,5,2006,2006,Gable,CompShg,VinylSd,VinylSd,,0.0,Gd,TA,PConc,Gd,Gd,Av,GLQ,554,Unf,0,100,654,GasA,Ex,Y,SBrkr,664,832,0,1496,1,0,2,1,3,1,Gd,7,Typ,1,Gd,Attchd,2006.0,RFn,2,426,TA,TA,Y,100,24,0,0,0,0,,,,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Norm,Norm,1Fam,1Story,5,6,1923,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,Gd,TA,CBlock,TA,TA,No,Unf,0,Unf,0,968,968,GasA,TA,Y,SBrkr,968,0,0,968,0,0,1,0,2,1,TA,5,Typ,0,,Detchd,1935.0,Unf,2,480,Fa,TA,N,0,0,184,0,0,0,,,,0,7,2007,WD
4,625,535105100,20,RL,,9500,Pave,,IR1,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,6,5,1963,1963,Gable,CompShg,Plywood,Plywood,BrkFace,247.0,TA,TA,CBlock,Gd,TA,No,BLQ,609,Unf,0,785,1394,GasA,Gd,Y,SBrkr,1394,0,0,1394,1,0,1,1,3,1,TA,6,Typ,2,Gd,Attchd,1963.0,RFn,2,514,TA,TA,Y,0,76,0,0,185,0,,,,0,7,2009,WD


In [30]:
d = {'Id': test['Id'].sort_values(), 'SalePrice': Z_test_preds}

In [31]:
df = pd.DataFrame(data = d, columns = ['Id', 'SalePrice'])

In [32]:
df.set_index('Id', inplace=True)

In [33]:
df.describe()

Unnamed: 0,SalePrice
count,878.0
mean,178550.306301
std,69716.333464
min,18048.164317
25%,129389.599552
50%,161810.550273
75%,209744.378433
max,528437.432489


In [34]:
df

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
2,155990.438534
4,175057.32597
6,191307.451891
7,107776.250607
17,178135.340478
18,86300.506502
22,124442.136095
27,153296.704628
31,177471.10515
36,168216.944329


In [35]:
df.to_csv('base_submit.csv') 

In [36]:
list(Z_test_preds)

[155990.43853395927,
 175057.3259697894,
 191307.45189116444,
 107776.25060690397,
 178135.34047838557,
 86300.50650232781,
 124442.13609519662,
 153296.70462789774,
 177471.10515023276,
 168216.94432921775,
 152602.50929989168,
 128579.48439796494,
 176587.81564806786,
 222916.0988856864,
 159650.94024405102,
 137293.99929791762,
 136605.68300792956,
 138724.41941330075,
 199117.3688544994,
 226244.97303124232,
 137752.93373232763,
 123105.91884996349,
 182286.92704980218,
 141796.53278053136,
 189278.95755942244,
 123105.91884996349,
 130485.61881220518,
 136319.77633038993,
 161678.55359060183,
 50819.34017122607,
 110455.17616667754,
 91077.58801034815,
 237984.03111003697,
 146539.11924316757,
 209754.88652762535,
 188340.06337327062,
 132618.7865013692,
 104991.60209706743,
 113697.58107888079,
 199437.91583281412,
 122736.68060848353,
 218632.85103629073,
 153492.937315688,
 154731.05985662036,
 198121.773553824,
 101111.05678329313,
 212596.07211541722,
 128467.41822499737,
 12