In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')

In [3]:
print(train.shape)
print(test.shape)

(2051, 81)
(878, 80)


In [4]:
pd.set_option('max_colwidth', 256)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
train.isnull().sum().sort_values(ascending = False)

Pool QC            2042
Misc Feature       1986
Alley              1911
Fence              1651
Fireplace Qu       1000
Lot Frontage        330
Garage Finish       114
Garage Cond         114
Garage Qual         114
Garage Yr Blt       114
Garage Type         113
Bsmt Exposure        58
BsmtFin Type 2       56
BsmtFin Type 1       55
Bsmt Cond            55
Bsmt Qual            55
Mas Vnr Type         22
Mas Vnr Area         22
Bsmt Half Bath        2
Bsmt Full Bath        2
Garage Cars           1
Garage Area           1
Bsmt Unf SF           1
BsmtFin SF 2          1
Total Bsmt SF         1
BsmtFin SF 1          1
Overall Cond          0
Exterior 2nd          0
Exterior 1st          0
Roof Matl             0
Roof Style            0
Year Remod/Add        0
Year Built            0
SalePrice             0
Overall Qual          0
Land Contour          0
PID                   0
MS SubClass           0
MS Zoning             0
Lot Area              0
Street                0
Lot Shape       

In [5]:
train['SalePrice'].describe()

count      2051.000000
mean     181469.701609
std       79258.659352
min       12789.000000
25%      129825.000000
50%      162500.000000
75%      214000.000000
max      611657.000000
Name: SalePrice, dtype: float64

In [5]:
features = ['Overall Qual', 'Gr Liv Area', 'Garage Area', '1st Flr SF']

In [27]:
train['Garage Area'].fillna(np.mean(train['Garage Area']), inplace = True)

In [28]:
train['Garage Area'].replace(0, np.mean(train['Garage Area']), inplace = True)

In [29]:
test['Garage Area'].replace(0, np.mean(test['Garage Area']), inplace = True)

In [30]:
train[features].describe()

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,1st Flr SF
count,2051.0,2051.0,2051.0,2051.0
mean,6.11214,1499.330083,499.781416,1164.488055
std,1.426271,500.447829,183.182538,396.446923
min,1.0,334.0,100.0,334.0
25%,5.0,1129.0,384.0,879.5
50%,6.0,1444.0,480.0,1093.0
75%,7.0,1728.5,576.0,1405.0
max,10.0,5642.0,1418.0,5095.0


In [31]:
X = train[features]
y = train['SalePrice']

X_1 = test[features]

In [32]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20,
                                                   random_state = 42)

In [12]:
sc = StandardScaler()
Z_train_sc = sc.fit_transform(X_train)
Z_test_sc = sc.transform(X_test)
Z_1_sc = sc.transform(X_1)

In [33]:
lr = LinearRegression()

In [34]:
lr.fit(X_train, y_train)

LinearRegression()

In [35]:
lr.score(X_train, y_train)

0.7617995320796198

In [37]:
lr.score(X_test, y_test)

0.8185219528294354

In [65]:
preds=lr.predict(X_1)

In [66]:
preds

array([183378.12220804, 197364.24206876, 186568.5761737 , 122922.66029408,
       181926.07154754,  73615.12631557,  90816.18042484, 126359.65953255,
       210579.00861364, 171194.73079139, 167836.22963525, 143921.99156167,
       176030.77374455, 279809.1715638 , 167899.53847916, 131196.41217027,
       192077.38086821, 119711.96339389, 200347.82664626, 231011.27534054,
       113706.51358588, 134411.96991172, 198299.58069648, 156558.38696124,
       202541.71547944, 106551.74199167, 125553.45596523, 139069.26077173,
       156593.09352336,  39929.60525037, 102596.51959527, 105244.06387145,
       281659.60552715, 134023.88352763, 223294.37080718, 185557.0259508 ,
       103502.32395598,  95578.44519586, 114187.50564454, 215539.0153489 ,
       154135.31944435, 210348.42939539, 157596.80641662, 135380.87594302,
       216860.59151762, 104060.53039958, 218427.01985719, 109524.17886678,
       108162.57350181, 115148.99397851, 108719.67613603, 272908.14104146,
       278219.35312668, 1

In [72]:
X_1['SalePrice'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_1['SalePrice'] = preds


In [73]:
X_1

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,1st Flr SF,SalePrice
0,6,1928,440.0,908,183378.122208
1,5,1967,580.0,1967,197364.242069
2,7,1496,426.0,664,186568.576174
3,5,968,480.0,968,122922.660294
4,6,1394,514.0,1394,181926.071548
5,4,1092,286.0,546,73615.126316
6,4,1093,308.0,1093,90816.180425
7,5,1268,252.0,1268,126359.659533
8,7,1680,588.0,840,210579.008614
9,6,1279,473.0,1279,171194.730791


In [61]:
d = {'Id': test['Id'], 'SalePrice': preds}

In [62]:
df = pd.DataFrame(data = d)

In [88]:
df_blah = df.sort_values('Id')

In [90]:
df_blah

Unnamed: 0,Id,SalePrice
703,2,136415.004972
705,4,258504.621471
119,6,173534.119722
311,7,238936.069123
400,17,241046.255387
315,18,320102.748273
790,22,195999.558396
607,27,92551.464582
390,31,103404.126138
752,36,163581.261477


In [91]:
test[test['Id'] == 40][features]

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,1st Flr SF
323,8,1544,868.0,1544


In [45]:
lr.coef_

array([27893.200912  ,    38.75053311,    73.31638926,    28.42658686])

In [92]:
(8*27893.200912) + (1544 * 38.75053311) + (868 * 73.31638926) + (1544*28.42658686) + lr.intercept_

273743.04316572624

In [57]:
240087.15966542 + lr.intercept_

123324.49642378624

In [70]:
test[test['Id'] == 2][features]

Unnamed: 0,Overall Qual,Gr Liv Area,Garage Area,1st Flr SF
703,5,896,730.0,896


In [46]:
lr.intercept_

-116762.66324163377

In [94]:
df_blah.to_csv('blah_submit.csv', index = False) 