## I removed multiples that I thought were extreme

In [110]:
import pandas as pd
from sklearn import preprocessing
import numpy as np
%matplotlib inline
pd.set_option('display.max_columns', 500)
pd.set_option('display.float_format', lambda x: '%.1f' % x)

### Playing around with using different columns that are complete

In [111]:
train = pd.read_csv('../Chase/data/clean_train_all_cols_outliers_removed_chase.csv', parse_dates=['timestamp'], index_col="id")  
test = pd.read_csv('../Chase/data/clean_test_all_cols_chase.csv', parse_dates=['timestamp'], index_col="id")


In [112]:
def getKremlinGroup(df, id):
    """ returns the group that are the same distance from the kremlin"""
    x = df.loc[id,'kremlin_km']
    return df.loc[df.kremlin_km==x,:]

In [113]:
train_index = train.index.tolist()
test_index = test.index.tolist()

cols = ['life_sq','full_sq','floor','max_floor','kitch_sq','sub_area','kremlin_km','price_doc','timestamp']

test['price_doc'] = np.nan

df = pd.concat([train[cols].copy(),
                test[cols].copy()],
               ignore_index=False)

df['month'] = df.timestamp.dt.month.astype(object)

In [114]:
macro = pd.read_csv('../Chase/data/macro_chase.csv')
macro['quarter'] = pd.PeriodIndex(macro['Unnamed: 0'], freq='Q').strftime('Q%q-%y')
df['quarter'] = pd.PeriodIndex(df['timestamp'], freq='Q').strftime('Q%q-%y')

df = pd.merge(df,macro[['quarter','nominal_index']], how="left", on="quarter").reset_index(drop=True).set_index(df.index)

In [115]:
df['kitch_to_life'] = df.kitch_sq / df.life_sq
df['life_to_full'] = df.life_sq / df.full_sq
df['bld_type'] = 'med_rise'
df.loc[df.max_floor <= 5,'bld_type'] = 'low_rise'
df.loc[df.max_floor >= 17,'bld_type'] = 'high_rise'
df['walk_up_penalty'] = 0
df.loc[(df.floor>4) & (df.max_floor < 6),'walk_up_penalty'] = 1 

In [116]:
df['price_doc'] = df.price_doc / df.nominal_index
df['price_full'] = df.price_doc / df.full_sq
df['log_price'] = np.log(df.price_doc)
# df['price_doc'] = df.price_doc / 1000


### Multiple Linear Regression

In [117]:
from sklearn import linear_model
from sklearn.model_selection import KFold, cross_val_score

ols = linear_model.LinearRegression()

# cols to drop
# drop_cols = ['timestamp','price_doc','nominal_index','adj_price_doc','price_full','log_price','price_full']
cols = ['full_sq','floor','sub_area','kremlin_km','month']
lm_data = df[cols].copy()

df_obj = lm_data.select_dtypes(include=['object'])
df_num = lm_data.select_dtypes(exclude=['object'])


dummies = pd.get_dummies(df_obj)
df_all = pd.concat([df_num,dummies],axis=1)

x_train = df_all.loc[train_index]

y_train = df.loc[train_index,'log_price']

x_test = df_all.loc[test_index,:]

ols.fit(x_train,y_train)
print('R^2: %.2f' % ols.score(x_train, y_train))
# df.log_price

R^2: 0.82


In [118]:
df.loc[test_index,'price_doc'] = np.exp(ols.predict(x_test)) * df.loc[test_index,'nominal_index']

In [119]:
df['price_full'] = df.price_doc / df.full_sq

In [120]:
cols = ['price_doc','full_sq','price_full']
sub = df.loc[test_index,cols]

In [121]:
sub.loc[37686,'price_doc'] = sub.loc[37686,'full_sq'] * 225000
sub.loc[[34670,32941],'price_doc'] = sub.loc[[34670,32941],'full_sq'] * 275000
sub.loc[33974,'price_doc'] = sub.loc[33974,'full_sq'] * 175000

In [122]:
sub[(sub.price_full > 200000) & (sub.full_sq<15)]
# sub.sort_values('price_full',ascending=False)

Unnamed: 0_level_0,price_doc,full_sq,price_full
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
33974,1750000.0,10.0,436196.2


In [102]:
# kaggle score 0.34
sub['price_doc'].to_frame().to_csv('../Chase/submissions/simple_linear_052716.csv')

In [41]:
cv_scores = cross_val_score(ols, x_train, y_train, cv=10)
print cv_scores

[ 0.75726873  0.81129533  0.82960046  0.84645333  0.81857573  0.80718293
  0.80774842  0.82773509  0.81249425  0.80145846]


In [42]:
ols.predict(x_test)

array([ 15.33152921,  15.77202253,  15.43048885, ...,  15.20714997,
        15.41058912,  15.87700661])

In [43]:
cv_scores = cross_val_score(ols, x_train, y_train, cv=10)
print cv_scores

[ 0.75726873  0.81129533  0.82960046  0.84645333  0.81857573  0.80718293
  0.80774842  0.82773509  0.81249425  0.80145846]


In [47]:
test1 = pd.DataFrame({'price_doc': ols.predict(x_test)},index=test_index)

In [48]:
test['price_doc'] = ols.predict(x_test)

In [44]:
from sklearn.linear_model import Ridge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV


pipe  =  make_pipeline(MinMaxScaler(), Ridge())
param_grid = {'ridge__alpha': [100,10,1,0.1,0.01,0.001,0.0001,0]}
grid =  GridSearchCV(pipe, param_grid, cv=5)
lm_predictions = grid.fit(x_train, y_train)

In [45]:
# print lm_predictions.predict(x_train)
print lm_predictions.best_score_

0.814421384633


In [46]:
from sklearn import preprocessing 

In [47]:
cols = ['timestamp','price_doc','nominal_index','adj_price_doc','price_full','log_price','price_full']

pipe  =  make_pipeline(MinMaxScaler(), Ridge())
param_grid = {'ridge__alpha': [100,10,1,0.1,0.01,0.001,0.0001,0]}
grid =  GridSearchCV(pipe, param_grid, cv=5)
grid.fit(x_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(steps=[('minmaxscaler', MinMaxScaler(copy=True, feature_range=(0, 1))), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'ridge__alpha': [100, 10, 1, 0.1, 0.01, 0.001, 0.0001, 0]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [48]:
sub = grid.predict(x_test)

In [49]:
sub = pd.DataFrame({'id': test_index, 'price_doc':sub})

In [50]:
sub.loc[:,'nominal_index'] = df.loc[test_index,'nominal_index'].values

In [51]:
'%f' % 1.128899e+08

'112889900.000000'

In [33]:
sub.price_doc = sub.price_doc * sub.nominal_index

In [34]:
sub.loc[:,'price_doc'].to_frame().to_csv('../Chase/submissions/052717_linear_ridge_regression.csv')

In [35]:
sub.sort_values('price_doc')

Unnamed: 0,id,price_doc,nominal_index
7654,38128,17.066710,1.162486
7468,37942,17.157199,1.162486
5651,36125,17.234647,1.173957
6938,37412,17.252984,1.162486
7413,37887,17.291430,1.162486
4042,34516,17.294941,1.173957
7633,38107,17.297512,1.162486
7641,38115,17.300261,1.162486
7317,37791,17.309880,1.162486
6872,37346,17.318427,1.162486


In [237]:
# df.loc[test_index,'nominal_index']

In [247]:
sub.head()

Unnamed: 0,id,price_doc,nominal_index
0,30474,4176246.0,1.214295
1,30475,9594552.0,1.214295
2,30476,5235794.0,1.214295
3,30477,9846788.0,1.214295
4,30478,2628036.0,1.214295
