In [1]:
import pandas as pd
pd.set_option('display.max_columns', None) #display all columns of the pandas dataframe
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold

import my_tools

In [2]:
# Load the cleaned 'train' data; the loader's result is unpacked into two
# frames. Both still carry SalePrice, so df_test serves as a labelled hold-out.
(df_train, df_test) = my_tools.load_clean_data('train')

In [3]:
# Separate the feature columns from the SalePrice target for each frame; the
# targets are pulled out as plain NumPy arrays.
X_train, y_train = df_train.drop(columns=['SalePrice']), df_train['SalePrice'].to_numpy()
X_test, y_test = df_test.drop(columns=['SalePrice']), df_test['SalePrice'].to_numpy()

## Baseline: Predict SalePrice from the log of above-ground square footage alone (GrLivArea)

In [4]:
# Single-feature design matrix: log(GrLivArea) laid out as an (n, 1) column,
# i.e. the 2-D input the estimator is fitted on.
x_train = np.log(X_train['GrLivArea']).to_numpy()[:, np.newaxis]
lr = LinearRegression().fit(x_train, y_train)

In [5]:
# Apply the same log(GrLivArea) transform to the hold-out features, then
# predict with the fitted baseline.
x_test = np.log(X_test['GrLivArea']).to_numpy()[:, np.newaxis]
y_pred = lr.predict(x_test)

In [6]:
# Score the hold-out predictions with the project's kaggle_score helper and
# print the result.
holdout_score = my_tools.kaggle_score(y_test, y_pred)
print(holdout_score)

0.31977538120131044


#### Train on full training set

In [7]:
# Reload the 'train' data with split=False and refit on all of it.
# NOTE: df_train, x_train and y_train are rebound here, replacing the objects
# the earlier cells created under the same names.
df_train = my_tools.load_clean_data('train', split=False)
x_train = np.log(df_train['GrLivArea'].to_numpy())[:, np.newaxis]
y_train = df_train['SalePrice'].to_numpy()

# `model` is the estimator used for the final test-set predictions.
model = LinearRegression().fit(x_train, y_train)

#### Predict test set and create submission file

In [8]:
# Load the cleaned 'test' data and make Id the index.
test = my_tools.load_clean_data('test').set_index('Id')
# Same log(GrLivArea) single-column feature the model was fitted on.
x_test = np.log(test['GrLivArea'].to_numpy())[:, np.newaxis]
y_pred = model.predict(x_test)
# Hand the predictions and the target path to the project helper
# (presumably this writes the submission file -- confirm in my_tools).
my_tools.make_pred_df(y_pred, 'predictions/baseline.csv')