# Multiple Linear Regression Exploration

## Import libraries

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from __future__ import print_function

from scipy import stats
from sklearn import linear_model


%matplotlib inline

# Notebook-wide pandas / seaborn settings.
pd.set_option('mode.chained_assignment', None)  # pandas default is 'warn'
pd.options.display.max_columns = 500  # render up to 500 columns when a frame is displayed
color = sns.color_palette()  # palette returned by seaborn when called with no arguments

## Pull data and create regression object

In [143]:
# Read the three CSV inputs in order, parsing `timestamp` as a datetime column in each.
train, test, macro = (
    pd.read_csv(csv_path, parse_dates=['timestamp'])
    for csv_path in ('train.csv', 'test.csv', 'macro.csv')
)

# Unfitted scikit-learn LinearRegression estimator, created up front for later use.
ols = linear_model.LinearRegression()

## Combine Datasets

In [137]:
# Tag every row with the file it came from so the stacked frame can be split
# back into train and test later. NOTE: this adds the column to `train` and
# `test` themselves, not to copies.
train['dataset'] = 'train'
test['dataset'] = 'test'

# Stack train on top of test.
df = pd.concat([train, test])
n_rows_before_merge = len(df)

# Left-join the macro table on `timestamp`: every train/test row is kept and
# picks up the macro columns that share its timestamp.
df = pd.merge(df, macro, how='left', on='timestamp')

# A left join silently duplicates rows when the right-hand table holds more
# than one row per key; fail loudly here instead of carrying inflated data
# into the model.
assert len(df) == n_rows_before_merge, (
    'merge with macro changed the row count: %d -> %d'
    % (n_rows_before_merge, len(df))
)

df.shape

(38133, 392)

## Subset important columns

In [157]:
# Hand-picked columns to carry forward from the merged frame, in display order.
# NOTE(review): the entries from 'gdp_quart' onward look like macro.csv
# indicators and the rest like train/test columns; confirm against the raw files.
important_columns = [
    'id', 'timestamp',
    'full_sq', 'life_sq', 'floor', 'max_floor', 'material',
    'build_year', 'num_room', 'kitch_sq', 'state',
    'product_type', 'sub_area',
    'indust_part',
    'school_education_centers_raion',
    'sport_objects_raion',
    'culture_objects_top_25_raion',
    'oil_chemistry_raion',
    'metro_min_avto',
    'green_zone_km', 'industrial_km', 'kremlin_km', 'radiation_km',
    'ts_km', 'fitness_km', 'stadium_km', 'additional_education_km',
    'cafe_count_1500_price_500',
    'cafe_count_1500_price_high',
    'cafe_count_2000_price_2500',
    'trc_sqm_5000',
    'cafe_count_5000',
    'cafe_count_5000_price_high',
    'gdp_quart', 'cpi', 'ppi', 'usdrub', 'eurrub', 'gdp_annual',
    'rts', 'micex', 'micex_cbi_tr',
    'deposits_rate', 'mortgage_rate',
    'income_per_cap', 'salary',
    'labor_force', 'unemployment', 'employment',
]

# Keep every row, restrict to the listed columns.
df_important = df.loc[:, important_columns]

In [161]:
# check missing values
# df_important.info()
# it checked out

In [142]:
# Display the column labels of the subset as an array to confirm the selection.
df_important.columns.values

array(['id', 'timestamp', 'full_sq', 'life_sq', 'floor', 'max_floor',
       'material', 'build_year', 'num_room', 'kitch_sq', 'state',
       'product_type', 'sub_area', 'indust_part',
       'school_education_centers_raion', 'sport_objects_raion',
       'culture_objects_top_25_raion', 'oil_chemistry_raion',
       'metro_min_avto', 'green_zone_km', 'industrial_km',
       'kremlin_km', 'radiation_km', 'ts_km', 'fitness_km', 'stadium_km',
       'additional_education_km', 'cafe_count_1500_price_500',
       'cafe_count_1500_price_high', 'cafe_count_2000_price_2500',
       'trc_sqm_5000', 'cafe_count_5000', 'cafe_count_5000_price_high',
       'gdp_quart', 'cpi', 'ppi', 'usdrub', 'eurrub', 'gdp_annual', 'rts',
       'micex', 'micex_cbi_tr', 'deposits_rate', 'mortgage_rate',
       'income_per_cap', 'salary', 'labor_force', 'unemployment',
       'employment'], dtype=object)

In [96]:
# Add two log-scaled copies of the sale price to `train` (in place):
#   price_doc_log   = log1p(price_doc), i.e. ln(1 + price_doc)
#   price_doc_log10 = log10(price_doc), with no +1 offset
train['price_doc_log'] = np.log1p(train.price_doc)
train['price_doc_log10'] = np.log10(train.price_doc)

# Keep only the rows that have no missing value in any column.
train_nona = train.dropna(axis=0, how='any')

In [97]:
# Feature matrix: remove the raw target and both log targets so they cannot
# leak into the predictors, then keep only the float64 columns.
x = (
    train_nona
    .drop(['price_doc', 'price_doc_log', 'price_doc_log10'], axis=1)
    .select_dtypes(include=['float64'])
)


In [98]:
# Regression target: the log1p price, kept as a one-column DataFrame
# (a list selector is used, so the result is 2-D rather than a Series).
y = train_nona.loc[:, ['price_doc_log']]

## Fit the Regression