# Multiple Linear Regression Exploration

## Import libraries

In [124]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from __future__ import print_function

from scipy import stats
from sklearn import linear_model


%matplotlib inline

pd.options.mode.chained_assignment = None # default = 'warn'
pd.set_option('display.max_columns', 500)
color = sns.color_palette()

## Pull data and create regression object

In [143]:
train = pd.read_csv('train.csv', parse_dates=['timestamp'])
test = pd.read_csv('test.csv', parse_dates = ['timestamp'])
macro = pd.read_csv('macro.csv', parse_dates = ['timestamp'])

ols = linear_model.LinearRegression()  # create the object easily

# Combine Datasets

In [137]:
train['dataset'] = 'train'
test['dataset'] = 'test'

# concat train and test
df = pd.concat([train, test])

# merge macro set 
df = pd.merge(df, macro, how='left', on = 'timestamp')

df.shape

(38133, 392)

# Subset important columns

In [157]:
df_important = df.loc[:, ['id', 'timestamp', 'full_sq',
                          'life_sq', 'floor', 'max_floor', 'material',
                          'build_year', 'num_room',
                          'kitch_sq', 'state',
                          'product_type', 'sub_area',
                          'indust_part', 'school_education_centers_raion',
                          'sport_objects_raion', 'culture_objects_top_25_raion',
                          'oil_chemistry_raion', 'metro_min_avto',
                          'green_zone_km', 'industrial_km',
                          'kremlin_km', 'radiation_km',
                          'ts_km', 'fitness_km',
                          'stadium_km', 'additional_education_km',
                          'cafe_count_1500_price_500', 'cafe_count_1500_price_high',
                          'cafe_count_2000_price_2500', 'trc_sqm_5000',
                          'cafe_count_5000', 'cafe_count_5000_price_high',
                          'gdp_quart', 'cpi',
                          'ppi', 'usdrub',
                          'eurrub', 'gdp_annual',
                          'rts', 'micex',
                          'micex_cbi_tr', 'deposits_rate',
                          'mortgage_rate', 'income_per_cap',
                          'salary', 'labor_force',
                          'unemployment', 'employment']]

In [161]:
# check missing values
# df_important.info()
# it checked out

In [142]:
df_important.columns.values

array(['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor',
       'material', 'build_year', 'num_room', 'kitch_sq', 'tk/state',
       'product_type', 'sub_area', 'indust_part',
       'school_education_centers_raion', 'sport_objects_raion',
       'culture_objects_top_25_raion', 'oil_chemistry_raion',
       'metro_min_avto', 'green_zone_km', 'industrial_zone_km',
       'kremlin_km', 'radiation_km', 'ts_km', 'fitness_km', 'stadium_km',
       'additional_education_km', 'cafe_count_1500_price_500',
       'cafe_count_1500_price_high', 'cafe_count_2000_price_2500',
       'trc_sqm_5000', 'cafe_count_5000', 'cafe_count_5000_price_high',
       'gdp_quart', 'cpi', 'ppi', 'usdrub', 'eurrub', 'gdp_annual', 'rts',
       'micex', 'micex_cbi_tr', 'deposits_rate', 'mortgage_rate',
       'income_per_cap', 'salary', 'labor_force', 'unemployment',
       'employment'], dtype=object)

## Normalize Price

And set up x and y

In [96]:
train['price_doc_log'] = np.log1p(train['price_doc'])
train['price_doc_log10'] = np.log10(train['price_doc'])
train_nona = train.dropna()

In [97]:
x = train_nona.drop('price_doc', axis = 1)
x = x.drop('price_doc_log', axis = 1)
x = x.drop('price_doc_log10', axis = 1)
x = x.select_dtypes(include=['float64'])


In [98]:
y = train_nona[['price_doc_log']]

## Fit the Regression

In [99]:
ols.fit(x, y)
# print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x,y)))    # only 31% (so bad)

R^2 is 0.31320360088


## Create:
#### + Green
#### + Prom
#### + Office
#### + Trc
#### + Chruch
#### + Sport
#### + Leisure
#### + Market
#### + Cafe_price
#### + Cafe_count

In [105]:
green = ['green_part_500', 'green_part_1000','green_part_1500',
         'green_part_2000','green_part_3000','green_part_5000']

prom = ['prom_part_500','prom_part_1000','prom_part_1500',
        'prom_part_2000','prom_part_3000','prom_part_5000']
        
office = ['office_count_500','office_sqm_500','office_count_1000',
          'office_sqm_1000','office_count_1500', 'office_sqm_1500',
          'office_count_2000','office_sqm_2000','office_count_3000',
          'office_sqm_3000','office_count_5000','office_sqm_5000']
          
# shopping malls
trc = ['trc_count_1000', 'trc_count_1500', 'trc_count_2000', 'trc_count_3000', 
       'trc_count_500', 'trc_count_5000', 'trc_sqm_1000', 'trc_sqm_1500',
       'trc_sqm_2000', 'trc_sqm_3000', 'trc_sqm_500', 'trc_sqm_5000', 'trc_count_1000', 'trc_sqm_1000']
       
       
church = ['big_church_count_500', 'church_count_500', 'mosque_count_500',
         'big_church_count_1000', 'church_count_1000', 'mosque_count_1000',
         'big_church_count_1500', 'church_count_1500', 'mosque_count_1500',
         'big_church_count_3000', 'church_count_3000', 'mosque_count_3000',
         'big_church_count_5000', 'church_count_5000', 'mosque_count_5000',
         'big_church_count_2000', 'church_count_2000', 'mosque_count_2000']

        
sport = ['sport_count_500','sport_count_1000','sport_count_2000', 
         'sport_count_5000','sport_count_1500','sport_count_3000']

leisure = ['leisure_count_500','leisure_count_3000','leisure_count_1000',
           'leisure_count_1500','leisure_count_2000','leisure_count_5000']

market = ['market_count_500','market_count_5000', 'market_count_2000',
          'market_count_1000','market_count_1500','market_count_3000']    


cafe_price = ['cafe_sum_500_min_price_avg', 'cafe_sum_500_max_price_avg',
              'cafe_avg_price_500', 'cafe_sum_1000_min_price_avg','cafe_sum_1000_max_price_avg', 
              'cafe_avg_price_1000', 'cafe_sum_1500_min_price_avg', 'cafe_sum_1500_max_price_avg', 
              'cafe_avg_price_1500', 'cafe_sum_2000_min_price_avg', 'cafe_sum_2000_max_price_avg', 
              'cafe_avg_price_2000', 'cafe_sum_3000_min_price_avg', 'cafe_sum_3000_max_price_avg',
              'cafe_avg_price_3000',  'cafe_sum_5000_min_price_avg', 'cafe_sum_5000_max_price_avg',
              'cafe_avg_price_5000','cafe_count_5000_price_high']      

    
cafe_count = ['cafe_count_500', 'cafe_count_500_na_price',
       'cafe_count_500_price_500', 'cafe_count_500_price_1000',
       'cafe_count_500_price_1500', 'cafe_count_500_price_2500',
       'cafe_count_500_price_4000', 'cafe_count_500_price_high', 'cafe_count_1000', 
       'cafe_count_1000_na_price', 'cafe_count_1000_price_500',
       'cafe_count_1000_price_1000', 'cafe_count_1000_price_1500',
       'cafe_count_1000_price_2500', 'cafe_count_1000_price_4000',
       'cafe_count_1000_price_high','cafe_count_1500',
       'cafe_count_1500_na_price',
       'cafe_count_1500_price_500', 'cafe_count_1500_price_1000',
       'cafe_count_1500_price_1500', 'cafe_count_1500_price_2500',
       'cafe_count_1500_price_4000', 'cafe_count_1500_price_high', 'cafe_count_2000', 
       'cafe_count_2000_na_price', 'cafe_count_2000_price_500',
       'cafe_count_2000_price_1000', 'cafe_count_2000_price_1500',
       'cafe_count_2000_price_2500', 'cafe_count_2000_price_4000',
       'cafe_count_2000_price_high', 'cafe_count_3000', 'cafe_count_3000_na_price',
       'cafe_count_3000_price_500', 'cafe_count_3000_price_1000',
       'cafe_count_3000_price_1500', 'cafe_count_3000_price_2500',
       'cafe_count_3000_price_4000', 'cafe_count_3000_price_high','cafe_count_5000',
       'cafe_count_5000_na_price', 'cafe_count_5000_price_500',
       'cafe_count_5000_price_1000', 'cafe_count_5000_price_1500',
       'cafe_count_5000_price_2500', 'cafe_count_5000_price_4000',
       'cafe_count_5000_price_high'] 
        
price = ['price_doc', 'price_doc_log', 'price_doc_log10'] 

#### Green, Prom, Office, Trc, Chruch, Sport, Leisure, Market, Cafe_price,  Cafe_count

In [106]:
x_cafe = train[cafe_count]
x_cafe = x_cafe.dropna()
y = train[['price_doc_log']]

ols.fit(x_cafe, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_cafe,y)))   # 0.08 before but now 1.0

Coeffs are [[  2.02856761e+09  -2.02856761e+09  -2.02856761e+09  -2.02856761e+09
   -2.02856761e+09  -2.02856761e+09  -2.02856761e+09  -2.02856761e+09
    2.54034996e-04   1.98282301e-02  -9.79506969e-03   3.58009338e-03
    7.19499588e-03   9.77575779e-04  -3.90267372e-02   1.75704360e-02
    5.43427467e-03  -3.03390622e-03   1.81531906e-03  -6.75511360e-03
   -8.30593705e-03   9.32234526e-03  -2.96115875e-02   4.20833007e-02
   -4.49216366e-03  -2.69681215e-03   4.77218628e-03   1.10850334e-02
   -8.67843628e-04  -7.24691153e-03   2.33708620e-02  -3.29381302e-02
    1.89957023e-03   9.49290395e-03  -8.37936997e-03  -2.34782696e-04
    1.93119049e-05   2.13789940e-03  -1.17961168e-02   1.11229569e-02
    5.28049469e-03   1.23032928e-03  -1.25064254e-02   2.40170956e-03
   -7.18307495e-03  -3.08614969e-03  -1.75397694e-02   4.24386859e-02]]
R^2 is 0.0892018322759


In [107]:
x_church = train[church]
x_church = x_church.dropna()
y = train[['price_doc_log']]

ols.fit(x_church, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_church,y)))  # 0.05

Coeffs are [[ 0.022364   -0.05036055 -0.14571221  0.03528865 -0.01173266  0.15301492
   0.00110367  0.00212856  0.04093205  0.00027564 -0.00134871 -0.02354261
  -0.00368676  0.00646323  0.05961167 -0.01251535  0.00133001  0.04313117]]
R^2 is 0.0559264588909


In [108]:
x_green = train[church]
x_green = x_green.dropna()
y = train[['price_doc_log']]

ols.fit(x_green, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_green,y)))  # 0.05

Coeffs are [[ 0.022364   -0.05036055 -0.14571221  0.03528865 -0.01173266  0.15301492
   0.00110367  0.00212856  0.04093205  0.00027564 -0.00134871 -0.02354261
  -0.00368676  0.00646323  0.05961167 -0.01251535  0.00133001  0.04313117]]
R^2 is 0.0559264588909


In [109]:
x_cafe_price = train[cafe_price]
x_cafe_price = x_cafe_price.dropna()
y = train[['price_doc_log']]

ols.fit(x_cafe_price, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_cafe_price,y)))

ValueError: Found input variables with inconsistent numbers of samples: [17190, 30471]

In [110]:
x_office = train[office]
x_office = x_office.dropna()
y = train[['price_doc_log']]

ols.fit(x_office, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_office,y))) # 0.06

Coeffs are [[  2.06039331e-02  -2.21250425e-07   7.37546198e-03  -3.58712147e-07
    1.14825424e-02  -9.88531878e-08  -9.35343898e-03   1.07443211e-07
   -7.04803335e-04   1.31691570e-08  -8.05433926e-04   1.35316761e-07]]
R^2 is 0.0646285767276


In [112]:
x_sport = train[church]
x_sport = x_sport.dropna()
y = train[['price_doc_log']]

ols.fit(x_sport, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_sport,y))) # 0.05

Coeffs are [[ 0.022364   -0.05036055 -0.14571221  0.03528865 -0.01173266  0.15301492
   0.00110367  0.00212856  0.04093205  0.00027564 -0.00134871 -0.02354261
  -0.00368676  0.00646323  0.05961167 -0.01251535  0.00133001  0.04313117]]
R^2 is 0.0559264588909


In [113]:
x_leisure = train[church]
x_leisure = x_leisure.dropna()
y = train[['price_doc_log']]

ols.fit(x_leisure, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_leisure,y)))   # 0.05

Coeffs are [[ 0.022364   -0.05036055 -0.14571221  0.03528865 -0.01173266  0.15301492
   0.00110367  0.00212856  0.04093205  0.00027564 -0.00134871 -0.02354261
  -0.00368676  0.00646323  0.05961167 -0.01251535  0.00133001  0.04313117]]
R^2 is 0.0559264588909


In [114]:
x_market = train[church]
x_market = x_market.dropna()
y = train[['price_doc_log']]

ols.fit(x_market, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_market,y))) # 0.05

Coeffs are [[ 0.022364   -0.05036055 -0.14571221  0.03528865 -0.01173266  0.15301492
   0.00110367  0.00212856  0.04093205  0.00027564 -0.00134871 -0.02354261
  -0.00368676  0.00646323  0.05961167 -0.01251535  0.00133001  0.04313117]]
R^2 is 0.0559264588909


# More

In [121]:
important_features = ['id', 'full_sq', 'life_sq', 'floor',
                      'max_floor', 'material', 'build_year',
                      'num_room', 'kitch_sq', 'state', 'area_m',
                      'price_doc', 'gdp_quart', 'deposits_value', 
                      'deposits_growth', 'deposits_rate', 'mortgage_value',
                      'mortgage_growth', 'mortgage_rate', 'salary',
                      'unemployment', 'employment',  'oil_urals',
                      'stadium_km', 'floor', 'max_floor', 'metro_min_avto',
                      'build_year', 'cafe_count_5000_price_high', 'num_room',
                      'radiation_km',
                      'green_zone_km', 'industrial_km', 'indust_part',
                      'cpi', 'cafe_count_5000', 'cafe_count_1500_price_high',
                      'cafe_count_5000_price_500', 'cafe_count_2000_price_2500',
                      'kitch_sq', 'max_floor', 'trc_sqm_5000', 
                      'office_sqm_1000', 'trc_sqm_1500', 'office_sqm_500', 'cpi', 
                      'office_sqm_5000', 'ID_railroad_terminal', 'office_sqm_1500', 
                      'ekder_male', 'raion_popul', 'price_doc']

important_features = list(set(important_features))
# 'month', 'day',
#                       'year', 'month_year', 'week_year', 'oil_chemistry_raion_yes',

In [119]:
result = pd.concat(, axis=1, join_axes=[df1.index])

x_imp = result[important_features]
x_imp = x_imp.dropna()
y = result[['price_doc_log']]

ols.fit(x_imp, y)

print('Coeffs are ' + str(ols.coef_))
print('R^2 is ' + str(ols.score(x_imp,y))) # 0.05

KeyError: "['unemployment' 'month_yearweek_year' 'month' 'oil_chemistry_raion_yes'\n 'year' 'mortgage_rate' 'employment' 'oil_urals' 'deposits_rate' 'day'\n 'mortgage_growth' 'salary' 'cpi' 'gdp_quart' 'mortgage_value'\n 'deposits_value' 'deposits_growth'] not in index"

possible features to engineer/keep:

gdp

loans

employment

cpi



building age

market sentiment/months behind/economy delay

less than five km park yes/no

crime?

traffic?

distances within raion

area squared

other variables squared