In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [2]:
train = pd.read_csv('../datasets/train.csv')
test = pd.read_csv('../datasets/test.csv') # CANNOT use this as validation data

In [3]:
train.shape, test.shape

((2051, 81), (878, 80))

In [4]:
len(train['Neighborhood'].unique())

28

In [5]:
len(test['Neighborhood'].unique())

26

In [6]:
test.isna().sum()

Id                0
PID               0
MS SubClass       0
MS Zoning         0
Lot Frontage    160
               ... 
Misc Feature    837
Misc Val          0
Mo Sold           0
Yr Sold           0
Sale Type         0
Length: 80, dtype: int64

In [7]:
train['Lot Frontage'].head()

0     NaN
1    43.0
2    68.0
3    73.0
4    82.0
Name: Lot Frontage, dtype: float64

In [8]:
# BAD, DO NOT
test['Lot Frontage'].fillna(test['Lot Frontage'].mean())

# DO THIS INSTEAD
test['Lot Frontage'].fillna(train['Lot Frontage'].mean())

0      69.0000
1      69.0552
2      58.0000
3      60.0000
4      69.0552
        ...   
873    80.0000
874    90.0000
875    55.0000
876    60.0000
877    70.0000
Name: Lot Frontage, Length: 878, dtype: float64

### Fit our model on training set after creating holdout

In [9]:
X = train[['Overall Qual', 'Lot Area']]
y = train['SalePrice']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [11]:
demo_model = LinearRegression()

In [12]:
demo_model.fit(X_train, y_train)

LinearRegression()

In [13]:
demo_model.score(X_train, y_train), demo_model.score(X_test, y_test)

(0.689028943221521, 0.6478326717660496)

### Predict on our test csv

In [14]:
kaggle_preds = demo_model.predict(test[['Overall Qual', 'Lot Area']])

In [15]:
test.shape

(878, 80)

In [16]:
kaggle_preds.shape

(878,)

In [21]:
kaggle_output = {'Id':test['Id'], 'SalePrice':kaggle_preds}

In [22]:
kaggle_output

{'Id': 0      2658
 1      2718
 2      2414
 3      1989
 4       625
        ... 
 873    1662
 874    1234
 875    1373
 876    1672
 877    1939
 Name: Id, Length: 878, dtype: int64,
 'SalePrice': array([173544.79022761, 132792.30911284, 243899.97991208, 128806.65794724,
        174794.23253696,  63100.23126852,  86225.38417855, 131480.04568178,
        196473.52309132, 176975.52148486, 166679.83765079, 166767.0892087 ,
        181111.24533007, 212042.69108583, 182601.50193928, 130691.29159822,
        179331.31354859, 132474.71344202, 179331.31354859, 288446.15876504,
        127850.38087248, 128733.36663859, 169601.01980981, 216510.94740771,
        217728.00259963, 124129.97444295, 100175.16325814, 130481.88785922,
        172351.18891532,  26151.91807888,  82121.07089419,  78282.00234589,
        225039.68315298, 133971.95017586, 223175.9898759 , 230288.7368772 ,
         75769.15747791,  87914.5743398 , 130104.96112903, 209756.70026844,
        155427.87674195, 223343.5128671 

In [23]:
kaggle_output = pd.DataFrame(kaggle_output)

In [24]:
kaggle_output.to_csv('./demo_submission.csv', index = False) # Be sure to set index = False or kaggle will reject