In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



In [2]:
df = pd.read_csv('train.csv')

In [3]:
df.head()

Unnamed: 0,id,timestamp,full_sq,life_sq,floor,max_floor,material,build_year,num_room,kitch_sq,...,cafe_count_5000_price_2500,cafe_count_5000_price_4000,cafe_count_5000_price_high,big_church_count_5000,church_count_5000,mosque_count_5000,leisure_count_5000,sport_count_5000,market_count_5000,price_doc
0,1,2011-08-20,43,27.0,4.0,,,,,,...,9,4,0,13,22,1,0,52,4,5850000
1,2,2011-08-23,34,19.0,3.0,,,,,,...,15,3,0,15,29,1,10,66,14,6000000
2,3,2011-08-27,43,29.0,2.0,,,,,,...,10,3,0,11,27,0,4,67,10,5700000
3,4,2011-09-01,89,50.0,9.0,,,,,,...,11,2,1,4,4,0,0,26,3,13100000
4,5,2011-09-05,77,77.0,4.0,,,,,,...,319,108,17,135,236,2,91,195,14,16331452


In [4]:
df.shape

(30471, 292)

**Just for this quick submission, drop all features that have NaN values**

In [3]:
df = df.dropna()

**Identify features that have > 0.25 correlation with the target variable**

In [4]:
x = df[['full_sq','life_sq','num_room','kitch_sq','office_sqm_5000','cafe_count_5000_price_high',
       'cafe_count_5000_price_2500','ID_metro','cafe_count_5000_price_1500','cafe_count_5000',
       'cafe_count_5000_price_1000','cafe_count_5000_price_4000']]
y = df['price_doc']

In [5]:
df.corr()['price_doc'].sort_values(ascending = False).head(13)

price_doc                     1.000000
full_sq                       0.675455
life_sq                       0.524602
num_room                      0.478370
kitch_sq                      0.368339
office_sqm_5000               0.271088
cafe_count_5000_price_high    0.260564
cafe_count_5000_price_2500    0.260002
ID_metro                      0.257857
cafe_count_5000_price_1500    0.257182
cafe_count_5000               0.254917
cafe_count_5000_price_1000    0.254544
cafe_count_5000_price_4000    0.253283
Name: price_doc, dtype: float64

**Create functions for our pipeline**

In [6]:
def fullsq(data):
    return data['full_sq'].values.reshape(-1,1)

def lifesq(data):
    return data['life_sq'].values.reshape(-1,1)

def numroom(data):
    return data['num_room'].values.reshape(-1,1)

def kitchsq(data):
    return data['kitch_sq'].values.reshape(-1,1)

def officesqm(data):
    return data['office_sqm_5000'].values.reshape(-1,1)

def cafecounthigh(data):
    return data['cafe_count_5000_price_high'].values.reshape(-1,1)

def cafecount2500(data):
    return data['cafe_count_5000_price_2500'].values.reshape(-1,1)

def metro(data):
    return data['ID_metro'].values.reshape(-1,1)

def cafecount1500(data):
    return data['cafe_count_5000_price_1500'].values.reshape(-1,1)

def cafecount(data):
    return data['cafe_count_5000'].values.reshape(-1,1)

def cafecount1000(data):
    return data['cafe_count_5000_price_1000'].values.reshape(-1,1)

def cafecount4000(data):
    return data['cafe_count_5000_price_4000'].values.reshape(-1,1)

In [7]:
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn import metrics
from sklearn.preprocessing import FunctionTransformer,Binarizer, Imputer, LabelBinarizer

In [8]:
fullsq_pipe = make_pipeline(FunctionTransformer(fullsq, validate = False),
                           Imputer(),)

lifesq_pipe = make_pipeline(FunctionTransformer(lifesq, validate = False),
                           Imputer())

numroom_pipe = make_pipeline(FunctionTransformer(numroom, validate = False),
                           Imputer())

kitchsq_pipe = make_pipeline(FunctionTransformer(kitchsq, validate = False),
                           Imputer())

officesqm_pipe = make_pipeline(FunctionTransformer(officesqm, validate = False),
                           Imputer())

cafecounthigh_pipe = make_pipeline(FunctionTransformer(cafecounthigh, validate = False),
                           Imputer())

cafecount2500_pipe = make_pipeline(FunctionTransformer(cafecount2500, validate = False),
                           Imputer())

metro_pipe = make_pipeline(FunctionTransformer(metro, validate = False),
                           Imputer())

cafecount1500_pipe = make_pipeline(FunctionTransformer(cafecount1500, validate = False),
                           Imputer())

cafecount_pipe = make_pipeline(FunctionTransformer(cafecount, validate = False),
                           Imputer())

cafecount1000_pipe = make_pipeline(FunctionTransformer(cafecount1000, validate = False),
                           Imputer())

cafecount4000_pipe = make_pipeline(FunctionTransformer(cafecount4000, validate = False),
                           Imputer())

In [9]:
fu = make_union(fullsq_pipe, lifesq_pipe, numroom_pipe, kitchsq_pipe, officesqm_pipe, cafecounthigh_pipe, 
                cafecount2500_pipe, metro_pipe, cafecount1500_pipe, cafecount_pipe, cafecount1000_pipe, 
               cafecount4000_pipe)

**Standardize our features**

In [12]:
x_std = StandardScaler().fit_transform(x)
x_std = pd.DataFrame(x_std, columns = x.columns)
x_std.head()

Unnamed: 0,full_sq,life_sq,num_room,kitch_sq,office_sqm_5000,cafe_count_5000_price_high,cafe_count_5000_price_2500,ID_metro,cafe_count_5000_price_1500,cafe_count_5000,cafe_count_5000_price_1000,cafe_count_5000_price_4000
0,-1.764874,-1.099593,-1.120942,1.210995,2.876516,3.096153,2.905469,-0.107712,2.812098,2.756136,2.56862,3.016623
1,-0.348795,-0.307056,0.003349,-0.365512,-0.909627,-0.497139,-0.63352,-0.127036,-0.681058,-0.718822,-0.828458,-0.54371
2,0.983986,0.832216,1.127641,0.15999,1.531828,2.405135,1.592788,1.49614,1.249705,1.453027,1.419233,1.68814
3,0.109349,-0.207988,0.003349,0.685492,0.900906,0.608489,0.122585,1.399523,-0.045938,0.040942,0.008041,0.279949
4,-0.931886,-0.604257,-1.120942,-0.628263,0.569077,0.608489,0.385121,1.82464,0.227163,0.160662,0.020811,0.572215


In [13]:
fu.fit(x_std)

FeatureUnion(n_jobs=1,
       transformer_list=[('pipeline-1', Pipeline(steps=[('functiontransformer', FunctionTransformer(accept_sparse=False,
          func=<function fullsq at 0x11b7c7c80>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y=False, validate=False)), ('imputer', Imputer(axis=0, copy=True, missi...lse)), ('imputer', Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0))]))],
       transformer_weights=None)

In [14]:
test = pd.read_csv('test.csv')

In [11]:
from sklearn.preprocessing import StandardScaler

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
ln = LinearRegression()

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
pipex = fu.transform(df)

**Train test split our features**

In [19]:
x_train, x_test, y_train, y_test = train_test_split(pipex, y, test_size = 0.3)

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

**Cross validate with GridSearchCV using Linear Regression model**

In [21]:
gs = GridSearchCV(ln, param_grid = {}, scoring = make_scorer(metrics.mean_squared_error))

In [22]:
gs.fit(x_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False),
       fit_params={}, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(mean_squared_error), verbose=0)

In [23]:
gs.best_estimator_.score(x_train, y_train)

0.52488875490899733

In [24]:
gs.best_estimator_.score(x_test, y_test)

0.48242950104796889

In [25]:
pipetest = fu.transform(test)

**Repeating with Random Forest Regressor**

In [26]:
from sklearn.ensemble import RandomForestRegressor

In [27]:
rf = RandomForestRegressor()

In [28]:
gs1 = GridSearchCV(rf, param_grid = {}, scoring = make_scorer(metrics.mean_squared_error))

In [29]:
gs1.fit(x_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(mean_squared_error), verbose=0)

In [30]:
gs1.best_estimator_.score(x_train, y_train)

0.93533402219834161

In [31]:
gs1.best_estimator_.score(x_test, y_test)

0.49215344805003158

**Repeating with XGBoost Regressor**

In [32]:
from xgboost import XGBRegressor

In [33]:
xgb = XGBRegressor()

In [34]:
gs2 = GridSearchCV(xgb, param_grid = {}, scoring = make_scorer(metrics.mean_squared_error))

In [35]:
gs2.fit(x_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1, param_grid={},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(mean_squared_error), verbose=0)

In [36]:
gs2.best_estimator_.score(x_train, y_train)

0.7805009818498716

In [37]:
gs2.best_estimator_.score(x_test, y_test)

0.56157292892551924

**For quick submission, no parameter tuning**

**Function for creating Kaggle submission**

In [38]:
def evaluation_transformation(dataset, predictions):
    dataset = dataset.join(pd.DataFrame(predictions, columns=['price_doc']))
    dataset[['id', 'price_doc']].to_csv('submission2.csv', index=False)

predictions = gs2.best_estimator_.predict(pipetest)
evaluation_transformation(test, np.abs(predictions))

**To be improved on**