In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.metrics import explained_variance_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score

In [2]:
# Load the training data from the working directory.
train = pd.read_csv(filepath_or_buffer='modeling_train.csv')

In [3]:
# Regression target: natural log of the sale price.
sale_price = train['SalePrice']
train['logSalePrice'] = np.log(sale_price)

In [4]:
# Natural log of total square footage (no offset is applied here, so the
# column must be strictly positive for the result to be finite).
train['log_sq_ft'] = train['total_sq_ft'].pipe(np.log)

In [5]:
# The +1 offset maps a porch area of 0 to log(1) = 0 instead of -inf.
porch_sf_shifted = train['total_porch_sf'] + 1
train['log_porch_sf'] = np.log(porch_sf_shifted)

In [6]:
# The +1 offset maps a garage area of 0 to log(1) = 0 instead of -inf.
train['log_garage_area'] = train['GarageArea'].add(1).pipe(np.log)

In [21]:
# Predictor columns used by the linear model, in the order they are fed to it.
feature_cols = [
    'log_sq_ft',
    'bathrooms',
    'TotRmsAbvGrd',
    'log_garage_area',
    'log_porch_sf',
    'OverallQual',
]

# The target is kept as a one-column DataFrame (double brackets), so the
# fitted model returns 2-D predictions.
y = train[['logSalePrice']]
X = train[feature_cols]
X.head()

Unnamed: 0,log_sq_ft,bathrooms,TotRmsAbvGrd,log_garage_area,log_porch_sf,OverallQual
0,7.789869,3.5,8,6.308098,4.127134,7
1,7.714231,2.5,6,6.133398,5.700444,6
2,7.728416,3.5,6,6.411818,3.7612,7
3,7.566828,2.0,7,6.466145,5.7301,7
4,7.956126,3.5,9,6.729824,5.624018,8


In [22]:
# Hold out 25% of the rows for evaluation. The seed is fixed so that the split,
# and therefore every metric and prediction derived from it, is reproducible
# under Restart Kernel -> Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [23]:
# Plain least-squares regression wrapped in GridSearchCV. The parameter grid is
# empty, so the search evaluates only the default estimator with 5-fold CV and
# then refits it on the whole training split.
model = LinearRegression()
param_grid = dict()
gs = GridSearchCV(estimator=model, param_grid=param_grid, cv=5)
gs.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params=None, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [24]:
# Score the held-out split. The target is log(SalePrice), so MSE and RMSE are
# expressed in log units rather than in currency.
pred = gs.predict(X_test)
n_obs = len(y_test)
n_features = X_test.shape[1]

model_r = r2_score(y_test, pred)
model_mse = mean_squared_error(y_test, pred)
model_rmse = np.sqrt(model_mse)

# Adjusted R^2 penalises R^2 for the number of predictors:
#   1 - (1 - R^2) * (n - 1) / (n - p - 1)
residual_dof = n_obs - n_features - 1
adjustedr = 1 - (1 - model_r) * (n_obs - 1) / residual_dof

In [25]:
# Report the hold-out metrics, one per line, in a fixed order.
for label, value in (
    ('Model R Squared: ', model_r),
    ('Adjusted R Squared: ', adjustedr),
    ('RMSE: ', model_rmse),
    ('MSE: ', model_mse),
):
    print(label + str(value))

Model R Squared: 0.7462278030481378
Adjusted R Squared: 0.7419746377360954
RMSE: 0.20084283889493576
MSE: 0.04033784593537712


In [26]:
# Load the data set to generate submission predictions for.
test = pd.read_csv(filepath_or_buffer='modeling_test.csv')

In [27]:
# Rebuild on the test frame the same log features that were derived for training.
test['log_sq_ft'] = test['total_sq_ft'].pipe(np.log)

# These two use a +1 offset so that an area of 0 maps to log(1) = 0.
for src_col, log_col in (
    ('total_porch_sf', 'log_porch_sf'),
    ('GarageArea', 'log_garage_area'),
):
    test[log_col] = np.log(test[src_col] + 1)

In [28]:
# Select exactly the columns the model was trained on, in the same order.
# Deriving the list from X (the training design matrix) instead of repeating it
# by hand keeps the test features from silently drifting out of sync.
test_model = test[list(X.columns)]

In [29]:
# Print a summary of the test feature frame: column names, non-null counts,
# dtypes and memory usage.
test_model.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 6 columns):
log_sq_ft          1459 non-null float64
bathrooms          1459 non-null float64
TotRmsAbvGrd       1459 non-null int64
log_garage_area    1459 non-null float64
log_porch_sf       1459 non-null float64
OverallQual        1459 non-null int64
dtypes: float64(4), int64(2)
memory usage: 68.5 KB


In [30]:
# Print the same summary for the training split (column names, non-null counts,
# dtypes and memory usage).
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1095 entries, 1122 to 712
Data columns (total 6 columns):
log_sq_ft          1095 non-null float64
bathrooms          1095 non-null float64
TotRmsAbvGrd       1095 non-null int64
log_garage_area    1095 non-null float64
log_porch_sf       1095 non-null float64
OverallQual        1095 non-null int64
dtypes: float64(4), int64(2)
memory usage: 59.9 KB


In [31]:
# The model was fitted on logSalePrice, so these predictions are on the log scale.
test_pred = gs.predict(X=test_model)

In [32]:
# Undo the log transform of the target to get predictions in price units.
# The value is recomputed from the model output rather than from `test_pred`
# itself, so re-running this cell cannot exponentiate the predictions twice.
test_pred = np.exp(gs.predict(test_model))

In [33]:
# Assemble the submission columns. Predictions come back 2-D, so they are
# flattened to one value per Id.
id_values = np.array(list(test['Id']))
price_values = test_pred.ravel()
df_dict = dict(Id=id_values, SalePrice=price_values)
df_dict

{'Id': array([1461, 1462, 1463, ..., 2917, 2918, 2919]),
 'SalePrice': array([124414.6648221 , 169447.45713915, 159434.18173802, ...,
        160305.33281418,  99683.46349658, 239851.59651626])}

In [34]:
# Build the submission frame (one column per dict key) and preview it.
sol_df = pd.DataFrame(df_dict)
sol_df.head(5)

Unnamed: 0,Id,SalePrice
0,1461,124414.664822
1,1462,169447.457139
2,1463,159434.181738
3,1464,183238.803327
4,1465,204967.760773


In [35]:
# Write the submission file with columns in Id, SalePrice order and no index column.
submission = sol_df.loc[:, ['Id', 'SalePrice']]
submission.to_csv(path_or_buf='simple_linear2.csv', index=False)