# Random Forest 

### Import Libraries

In [11]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Display every column when a DataFrame is rendered (None lifts the column cap).
pd.set_option('display.max_columns', None)

### Import Dataset

In [12]:
# Load the train and test CSV extracts; `timestamp` is parsed as a date column
# and no CSV column is used as the row index.
train = pd.read_csv(
    "../EDA/train_df_build_year_2017-05-26.csv",
    parse_dates=['timestamp'],
    index_col=False,
    low_memory=False,
)
test = pd.read_csv(
    "../EDA/test_df_build_year_2017-05-26.csv",
    parse_dates=['timestamp'],
    index_col=False,
    low_memory=False,
)

### Merge Train & Test

In [13]:
# Stack the train rows on top of the test rows in one frame.
# The original row labels of each frame are kept (no ignore_index).
frames = [train, test]
df = pd.concat(frames, axis=0)

### Encode Non-numerical Data 

In [17]:
from sklearn import model_selection, preprocessing
from sklearn.preprocessing import LabelEncoder

def encode_object_features(df):
    '''
    Label-encode every object-dtype column of ``df`` in place.

    Each object column is cast to ``str`` (so a missing value turns into a
    string such as 'nan') and replaced by integer codes in
    0..n_classes-1, where the classes are the sorted distinct strings of
    that column. The name of each encoded column is printed.

    The codes are derived from the column itself. The previous version also
    fitted on a global ``test_df`` that is not defined anywhere in this
    notebook, which raised NameError on the first object column; pass the
    merged train+test frame if both must share one encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with object columns replaced by integer codes.
    '''
    for f in df.columns:
        if df[f].dtype == 'object':
            print(f)
            as_text = df[f].values.astype('str')
            # np.unique sorts the distinct strings and return_inverse gives,
            # for every row, the position of its string in that sorted list.
            # These are the same codes sklearn's LabelEncoder assigns.
            _, codes = np.unique(as_text, return_inverse=True)
            df[f] = codes
    return df

In [14]:
# Keep only the columns listed below, in exactly this order.
ftr_df = df.loc[
    :,
    [
        'price_doc', 'timestamp',
        'full_sq', 'life_sq', 'floor', 'max_floor', 'material', 'build_year',
        'num_room', 'kitch_sq', 'state', 'product_type',
        'sub_area', 'indust_part',
        'school_education_centers_raion', 'sport_objects_raion',
        'culture_objects_top_25_raion', 'oil_chemistry_raion',
        'metro_min_avto',
        'green_zone_km', 'industrial_km', 'kremlin_km', 'radiation_km',
        'ts_km', 'fitness_km', 'stadium_km', 'additional_education_km',
        'cafe_count_1500_price_500', 'cafe_count_1500_price_high',
        'cafe_count_2000_price_2500', 'trc_sqm_5000', 'cafe_count_5000',
        'cafe_count_5000_price_high',
        'oil_urals', 'gdp_quart', 'cpi', 'ppi', 'usdrub', 'eurrub',
        'gdp_annual', 'rts', 'micex', 'micex_cbi_tr', 'deposits_rate',
        'mortgage_rate', 'income_per_cap', 'salary', 'labor_force',
        'unemployment', 'employment'
    ]
]

In [15]:
# Relative change of the oil price against each of its suffixed columns:
#   delta_oil_<k> = (oil_urals - oil_urals_<k>) / oil_urals
# NOTE(review): the recorded run of this cell raised KeyError: 'oil_urals_1',
# so the oil_urals_<k> columns must be present in `df` before this runs.
current_oil = df['oil_urals']
for lag in (1, 3, 6, 12):
    df['delta_oil_%d' % lag] = (current_oil - df['oil_urals_%d' % lag]) / current_oil

KeyError: 'oil_urals_1'

In [9]:
# Relative change of the USD/RUB rate against each of its suffixed columns:
#   delta_usdrub_<k> = (usdrub - usdrub_<k>) / usdrub
# NOTE(review): the recorded run of this cell raised KeyError: 'usdrub_1',
# so the usdrub_<k> columns must be present in `train` before this runs.
current_usdrub = train['usdrub']
for lag in (1, 3, 6, 12):
    train['delta_usdrub_%d' % lag] = (current_usdrub - train['usdrub_%d' % lag]) / current_usdrub

KeyError: 'usdrub_1'

In [151]:
# Relative change of the labor force against each of its suffixed columns:
#   delta_labor_force_<k> = (labor_force - labor_force_<k>) / labor_force
current_labor_force = train['labor_force']
for lag in (1, 3, 6, 12):
    train['delta_labor_force_%d' % lag] = (
        current_labor_force - train['labor_force_%d' % lag]
    ) / current_labor_force

In [152]:
# Build the same relative-change columns on the test frame:
#   <prefix>_<k> = (<source> - <source>_<k>) / <source>
# Columns are created in the order oil, usdrub, labor force; k = 1, 3, 6, 12.
for prefix, source in (('delta_oil', 'oil_urals'),
                       ('delta_usdrub', 'usdrub'),
                       ('delta_labor_force', 'labor_force')):
    for lag in (1, 3, 6, 12):
        test['%s_%d' % (prefix, lag)] = (
            test[source] - test['%s_%d' % (source, lag)]
        ) / test[source]

## Selecting Features

In [197]:
# Features fed to the forest. A single list keeps the train and test
# matrices aligned column for column.
feature_cols = ['month', 'year', 'day', 'year_month', 'num_room', 'kremlin_km',
                'cafe_count_5000', 'metro_min_avto', 'additional_education_km',
                'build_year', 'industrial_km', 'state',
                'delta_oil_1', 'delta_oil_3', 'delta_oil_6', 'delta_oil_12',
                'delta_labor_force_1', 'delta_labor_force_3',
                'delta_labor_force_6', 'delta_labor_force_12',
                'delta_usdrub_1', 'delta_usdrub_3', 'delta_usdrub_6',
                'delta_usdrub_12', 'gdp_quart']

# Fail early with a readable message if a feature was never built on one of
# the frames (the delta_* columns are created by separate cells).
for frame_name, frame in (('train', train), ('test', test)):
    missing = [c for c in feature_cols if c not in frame.columns]
    if missing:
        raise KeyError("%s is missing feature columns: %s" % (frame_name, missing))

X3 = train.loc[:, feature_cols]

# The original `train[[0]]` is a label lookup on current pandas and raises
# KeyError unless a column is literally labelled 0; `.iloc` selects the first
# column by position explicitly.
# NOTE(review): column 0 is presumably the target (price_doc) — confirm, and
# consider selecting it by name instead.
y3 = np.log(train.iloc[:, [0]])

x3_test = test.loc[:, feature_cols]

sample_leaf_options = [50,100,200,500]

for leaf_size in sample_leaf_options:
    rfr3 = RandomForestRegressor(n_estimators=1000, oob_score=True, n_jobs=-1,
                                 verbose=1, random_state=69,
                                 min_samples_leaf=leaf_size)
    # Train the model using the training sets and check score
    rfr3.fit(X3, y3.values.ravel())
    print("Leaf Size: %d" % (leaf_size))
    # R^2 on the rows the forest was fitted on; it favours small leaves.
    print(rfr3.score(X3, y3))
    # Out-of-bag R^2 (available because oob_score=True) is the figure to
    # compare across leaf sizes, since each row is scored only by trees that
    # did not see it.
    print("OOB R^2: %f" % rfr3.oob_score_)


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   28.6s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  2.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.9min finished


Leaf Size: 1


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.6s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    3.5s finished


0.903289750202


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.8s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   58.0s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.0min finished


Leaf Size: 5


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.5s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    1.1s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.9s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    2.3s finished


0.683725911737


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.1s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   18.7s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   45.8s


KeyboardInterrupt: 