In [None]:
# PCA: n_components sets how many output features (principal components) to keep.
# It finds the direction of greatest variance first, then each further component orthogonal to the earlier ones.

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
import seaborn as sns

from sklearn.linear_model import LinearRegression, ElasticNet, ElasticNetCV
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

%matplotlib inline

In [2]:
# Read the training data and the set that predictions are later exported for.
df, final_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [3]:
# Normalise column names to lower snake_case on both frames (mutated in place),
# so the same feature names can be used to index either one.
for frame in (df, final_test):
    frame.columns = [name.lower().replace(' ', '_') for name in frame.columns]

In [4]:
# Print each column's dtype and non-null count to see which columns have missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2051 entries, 0 to 2050
Data columns (total 82 columns):
id                 2051 non-null int64
pid                2051 non-null int64
ms_subclass        2051 non-null int64
ms_zoning          2051 non-null object
lot_frontage       1721 non-null float64
lot_area           2051 non-null int64
street             2051 non-null object
alley              140 non-null object
lot_shape          2051 non-null object
land_contour       2051 non-null object
utilities          2051 non-null object
lot_config         2051 non-null object
land_slope         2051 non-null object
neighborhood       2051 non-null object
condition_1        2051 non-null object
condition_2        2051 non-null object
bldg_type          2051 non-null object
house_style        2051 non-null object
overall_qual       2051 non-null int64
overall_cond       2051 non-null int64
year_built         2051 non-null int64
year_remod/add     2051 non-null int64
roof_style         20

In [5]:
# Predictor columns used by every model in this notebook.
features = ['lot_area', 'overall_qual', 'overall_cond']

# Design matrix and regression target taken from the training frame.
X = df.loc[:, features]
y = df.loc[:, 'saleprice']

In [6]:
# Count missing values per selected feature column before modelling.
X.isnull().sum()

lot_area        0
overall_qual    0
overall_cond    0
dtype: int64

In [7]:
# Hold out a quarter of the rows (the library default, spelled out here) for evaluation;
# the fixed seed keeps the split reproducible.
# `stratify` keeps each class's proportion the same in both splits -- it applies to
# categorical targets, so it is not used here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [8]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
ss = StandardScaler().fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)

In [9]:
# Baseline model: ordinary least squares on the scaled training features.
lr = LinearRegression().fit(X_train_sc, y_train)
lr  # echo the fitted estimator as the cell output

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [10]:
# Compare the regressor's score on the data it was fitted on against the held-out split.
train_score = lr.score(X_train_sc, y_train)
test_score = lr.score(X_test_sc, y_test)
print('train', train_score)
print('test', test_score)

train 0.679105632994
test 0.703801167288


In [None]:
# Cross-validate on the training split only: the test split is the final hold-out,
# so it must not be used for repeated fitting and scoring.
# NOTE(review): X_train_sc was scaled with statistics from the whole training split,
# so each validation fold has seen a little of the scaling information; wrap the
# scaler and model in a pipeline if a strictly leak-free estimate is needed.
cross_val_score(lr, X_train_sc, y_train)

In [11]:
# ElasticNet - GridSearch

In [12]:
# Candidate regularisation strengths and L1/L2 mixing ratios to search over.
alpha_grid = np.arange(.01, .03, .005)
l1_ratio_grid = np.arange(.01, 1.0, .05)
params = dict(alpha=alpha_grid, l1_ratio=l1_ratio_grid)

# Exhaustive cross-validated search over every (alpha, l1_ratio) pair on the
# scaled training split.
enet = ElasticNet()
gs = GridSearchCV(estimator=enet, param_grid=params).fit(X_train_sc, y_train)

# Show the winning combination as the cell output.
gs.best_params_

{'alpha': 0.01, 'l1_ratio': 0.76000000000000001}

In [13]:
# Score the grid search's best estimator on the held-out split; no `scoring` was
# passed, so this is the estimator's own default score.
gs.score(X_test_sc, y_test)

0.70374897290226501

In [14]:
# Take the same feature columns from the submission frame.
X_final_test = final_test.loc[:, features]

# Reuse the scaler fitted on the training split -- never refit it on new data.
X_final_test_sc = ss.transform(X_final_test)

# Predict with the best estimator found by the grid search.
predictions = gs.predict(X_final_test_sc)

In [15]:
# Attach the predictions to the submission frame (adds a column in place).
final_test['SalePrice'] = predictions

# Write only the identifier and the predicted price, without the row index.
submission_columns = ['id', 'SalePrice']
final_test.loc[:, submission_columns].to_csv('export_submit_2.csv', index=False)