In [2]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
%matplotlib inline
pd.set_option('display.max_columns', 500)


### Playing around with using different columns that are complete

In [139]:
train = pd.read_csv('../Chase/clean_train_all_cols_chase.csv', parse_dates=['timestamp'], index_col="id")  
test = pd.read_csv('../Chase/clean_test_all_cols_chase.csv', parse_dates=['timestamp'], index_col="id")


In [140]:
train_index = train.index.tolist()
test_index = test.index.tolist()

cols = ['life_sq','full_sq','floor','max_floor','kitch_sq','sub_area','kremlin_km','price_doc','timestamp']

test['price_doc'] = np.nan

df = pd.concat([train[cols].copy(),
                test[cols].copy()],
               ignore_index=False)

df['month'] = df.timestamp.dt.month.astype(object)

In [141]:
macro = pd.read_csv('../Chase/macro_chase.csv')
macro['quarter'] = pd.PeriodIndex(macro['Unnamed: 0'], freq='Q').strftime('Q%q-%y')
df['quarter'] = pd.PeriodIndex(df['timestamp'], freq='Q').strftime('Q%q-%y')

df = pd.merge(df,macro[['quarter','nominal_index']], how="left", on="quarter").reset_index(drop=True).set_index(df.index)

In [142]:
df['kitch_to_life'] = df.kitch_sq / df.life_sq
df['life_to_full'] = df.life_sq / df.full_sq
df['bld_type'] = 'med_rise'
df.loc[df.max_floor <= 5,'bld_type'] = 'low_rise'
df.loc[df.max_floor >= 17,'bld_type'] = 'high_rise'
df['walk_up_penalty'] = 0
df.loc[(df.floor>4) & (df.max_floor < 6),'walk_up_penalty'] = 1 

In [143]:
df['price_doc'] = df.price_doc / df.nominal_index
df['price_full'] = df.price_doc / df.full_sq
df['log_price'] = np.log(df.price_doc)
# df['price_doc'] = df.price_doc / 1000


### Multiple Linear Regression

In [144]:
from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_score

ols = linear_model.LinearRegression()

# cols to drop
# drop_cols = ['timestamp','price_doc','nominal_index','adj_price_doc','price_full','log_price','price_full']
cols = ['full_sq','floor','sub_area','kremlin_km','month']
lm_data = df[cols].copy()

df_obj = lm_data.select_dtypes(include=['object'])
df_num = lm_data.select_dtypes(exclude=['object'])


dummies = pd.get_dummies(df_obj)
df_all = pd.concat([df_num,dummies],axis=1)

x_train = df_all.loc[train_index]

y_train = df.loc[train_index,'log_price']

x_test = df_all.loc[test_index,:]

ols.fit(x_train,y_train)
print('R^2: %.2f' % ols.score(x_train, y_train))
# df.log_price

R^2: 0.58


In [145]:
df.loc[test_index,'price_doc'] = np.exp(ols.predict(x_test)) * df.loc[test_index,'nominal_index']

In [146]:
# df.loc[test_index,'price_doc']
df['price_full'] = df.price_doc / df.full_sq

In [152]:
cols = ['price_doc','full_sq','price_full']
sub = df.loc[test_index,cols]

In [130]:
# submission.drop('id',axis=1).to_csv('submission_linear')

In [157]:
sub['price_doc'].to_frame().to_csv('../Chase/submissions/simple_linear_052616.csv')

In [53]:
cv_scores = cross_val_score(ols, x_train, y_train, cv=10)
print cv_scores

[ 0.42788215  0.55501799  0.5988087   0.5554951   0.54050592  0.55748369
  0.55827505  0.61427153  0.57239928  0.61094148]


In [54]:
ols.predict(x_test)

array([ 15.26927389,  15.76154285,  15.32017376, ...,  15.15358656,
        15.2948478 ,  15.72791287])

In [55]:
cv_scores = cross_val_score(ols, x_train, y_train, cv=10)
print cv_scores

[ 0.42788215  0.55501799  0.5988087   0.5554951   0.54050592  0.55748369
  0.55827505  0.61427153  0.57239928  0.61094148]


In [47]:
test1 = pd.DataFrame({'price_doc': ols.predict(x_test)},index=test_index)

In [48]:
test['price_doc'] = ols.predict(x_test)

In [56]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV


pipe  =  make_pipeline(MinMaxScaler(), Ridge())
param_grid = {'ridge__alpha': [100,10,1,0.1,0.01,0.001,0.0001,0]}
grid =  GridSearchCV(pipe, param_grid, cv=5)
lm_predictions = grid.fit(x_train, y_train)

In [57]:
# print lm_predictions.predict(x_train)
print lm_predictions.best_score_

0.55986024025


In [None]:
from sklearn import preprocessing 

In [None]:
np.exp(log_y_test)

In [58]:
cols = ['timestamp','price_doc','nominal_index','adj_price_doc','price_full','log_price','price_full']

pipe  =  make_pipeline(MinMaxScaler(), Ridge())
param_grid = {'ridge__alpha': [100,10,1,0.1,0.01,0.001,0.0001,0]}
grid =  GridSearchCV(pipe, param_grid, cv=5)
grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ridge__alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [59]:
sub = grid.predict(x_test)

In [60]:
sub = pd.DataFrame({'id': test_index, 'price_doc':sub})

In [61]:
sub.loc[:,'nominal_index'] = df.loc[test_index,'nominal_index'].values

In [66]:
'%f' % 1.128899e+08

'112889900.000000'

In [62]:
sub.price_doc = sub.price_doc * sub.nominal_index

In [63]:
sub.loc[:,'price_doc'].to_frame().to_csv('../Chase/submissions/052617_linear_ridge_regression.csv')

In [64]:
sub.sort_values('price_doc')

Unnamed: 0,id,price_doc,nominal_index
6938,37412,16.722780,1.162486
7654,38128,16.795930,1.162486
5143,35617,16.865221,1.173957
6092,36566,16.874353,1.173957
5422,35896,16.875906,1.173957
4552,35026,16.875906,1.173957
5166,35640,16.875906,1.173957
4112,34586,16.877455,1.173957
5638,36112,16.879989,1.173957
4617,35091,16.887796,1.173957


In [237]:
# df.loc[test_index,'nominal_index']

In [247]:
sub.head()

Unnamed: 0,id,price_doc,nominal_index
0,30474,4176246.0,1.214295
1,30475,9594552.0,1.214295
2,30476,5235794.0,1.214295
3,30477,9846788.0,1.214295
4,30478,2628036.0,1.214295
