In [80]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
%matplotlib inline
pd.set_option('display.max_columns', 500)


### Playing around with using different columns that are complete

In [359]:
train = pd.read_csv('../Chase/clean_training_60_cols_chase.csv', parse_dates=['timestamp'], index_col="id")  
test = pd.read_csv('../Chase/clean_test_59_cols_chase.csv', parse_dates=['timestamp'], index_col="id")

cols = test.columns.tolist()
cols.append('price_doc')

train_index = train.index.tolist()
test_index = test.index.tolist()

df = train[cols].copy()
df = pd.concat([df,test],ignore_index=False)



In [325]:
macro = pd.read_csv('../Chase/macro_chase.csv')
macro['quarter'] = pd.PeriodIndex(macro['Unnamed: 0'], freq='Q').strftime('Q%q-%y')
df['quarter'] = pd.PeriodIndex(df['timestamp'], freq='Q').strftime('Q%q-%y')

df = pd.merge(df,macro[['quarter','nominal_index']], how="left", on="quarter").reset_index(drop=True).set_index(df.index)

In [326]:
df['kitch_to_life'] = df.kitch_sq / df.life_sq
df['life_to_full'] = df.life_sq / df.full_sq
df['bld_type'] = 'med_rise'
df.loc[df.max_floor <= 5,'bld_type'] = 'low_rise'
df.loc[df.max_floor >= 17,'bld_type'] = 'high_rise'
df['walk_up_penalty'] = 0
df.loc[(df.floor>4) & (df.max_floor < 6),'walk_up_penalty'] = 1 

In [327]:
df['price_doc'] = df.price_doc / df.nominal_index
df['price_full'] = df.price_doc / df.full_sq
df['log_price'] = np.log(df.price_doc)
# df['price_doc'] = df.price_doc / 1000


### Multiple Linear Regression

In [342]:
from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_score

ols = linear_model.LinearRegression()

# cols to drop
# drop_cols = ['timestamp','price_doc','nominal_index','adj_price_doc','price_full','log_price','price_full']
cols = ['full_sq','life_sq','kitch_sq', 'max_floor','floor', 'kitch_to_life', 'life_to_full', 'sub_area']
lm_data = df[cols].copy()

df_obj = lm_data.select_dtypes(include=['object'])
df_num = lm_data.select_dtypes(exclude=['object'])


dummies = pd.get_dummies(df_obj)
df_all = pd.concat([df_num,dummies],axis=1)

x_train = df_all.loc[train_index,:]
y_train = df.loc[train_index,'price_full']

x_test = df_all.loc[test_index,:]



# df.log_price

In [351]:
ols.fit(x_train,y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [352]:
cv_scores = cross_val_score(ols, x_train, y_train, cv=10)
print cv_scores

[ 0.11819023  0.23812165  0.30496662  0.26559796  0.33095024  0.33033659
  0.28441396  0.40940807  0.36841847  0.33229627]


In [353]:
ols.predict(x_test)

array([ 113399.62994275,  131072.761082  ,  117530.96851896, ...,
        130642.71865854,  122099.30137363,  122800.09541939])

In [354]:
cv_scores = cross_val_score(ols, x_train, y_train, cv=10)
print cv_scores

[ 0.11819023  0.23812165  0.30496662  0.26559796  0.33095024  0.33033659
  0.28441396  0.40940807  0.36841847  0.33229627]


In [355]:
test1 = pd.DataFrame({'price_doc': ols.predict(x_test)},index=test_index)

In [356]:
test['price_doc'] = ols.predict(x_test)

In [357]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV


pipe  =  make_pipeline(MinMaxScaler(), Ridge())
param_grid = {'ridge__alpha': [100,10,1,0.1,0.01,0.001,0.0001,0]}
grid =  GridSearchCV(pipe, param_grid, cv=5)
lm_predictions = grid.fit(x_train, y_train)

In [358]:
# print lm_predictions.predict(x_train)
print lm_predictions.best_score_

0.298188249736


In [224]:
cols = ['timestamp','price_doc','nominal_index','adj_price_doc','price_full','log_price','price_full']

pipe  =  make_pipeline(MinMaxScaler(), Ridge())
param_grid = {'ridge__alpha': [100,10,1,0.1,0.01,0.001,0.0001,0]}
grid =  GridSearchCV(pipe, param_grid, cv=5)
grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ridge__alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [230]:
sub = grid.predict(x_test)

In [231]:
sub = pd.DataFrame({'id': test_index, 'price_doc':sub})

In [244]:
sub.loc[:,'nominal_index'] = df.loc[test_index,'nominal_index'].values

In [246]:
sub.price_doc = sub.price_doc * sub.nominal_index

In [248]:
sub.loc[:,'price_doc'].to_frame().to_csv('../Chase/submissions/052517_linear_regression.csv')

In [249]:
sub.sort_values('price_doc')

Unnamed: 0,id,price_doc,nominal_index
1247,31776,-7.431058e+06,1.214295
2986,33568,-7.421459e+06,1.194126
3259,33856,-7.395029e+06,1.194126
67,30542,-7.389825e+06,1.214295
392,30881,-7.313506e+06,1.214295
333,30817,-7.313506e+06,1.214295
196,30675,-7.313506e+06,1.214295
573,31073,-7.313506e+06,1.214295
1968,32520,-7.307632e+06,1.194126
2094,32651,-7.307632e+06,1.194126


In [237]:
# df.loc[test_index,'nominal_index']

In [247]:
sub.head()

Unnamed: 0,id,price_doc,nominal_index
0,30474,4176246.0,1.214295
1,30475,9594552.0,1.214295
2,30476,5235794.0,1.214295
3,30477,9846788.0,1.214295
4,30478,2628036.0,1.214295
