# Random Forest 

### Import Libraries

In [1]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', None)

### Import Dataset

In [94]:
df = pd.read_csv("../EDA/cleaned_df_2017-05-26.csv",  parse_dates=['timestamp'], index_col=False, low_memory=False)
y_train = df.loc[df['price_doc_log'].notnull(), 'price_doc_log']
X_train = df.loc[df['price_doc_log'].notnull(), df.columns != 'price_doc_log']
X_test = df.loc[df['price_doc_log'].isnull(), df.columns != 'price_doc_log'].drop('price_doc', axis=1)

In [95]:
id_test = X_test.index

Int64Index([30471, 30472, 30473, 30474, 30475, 30476, 30477, 30478, 30479,
            30480,
            ...
            38123, 38124, 38125, 38126, 38127, 38128, 38129, 38130, 38131,
            38132],
           dtype='int64', length=7662)

In [96]:
y_train.head()

0    15.581952
1    15.607270
2    15.555977
3    16.388123
4    16.608603
Name: price_doc_log, dtype: float64

### Encode Non-numerical Data 

In [97]:
from sklearn import model_selection, preprocessing
from sklearn.preprocessing import LabelEncoder

def encode_object_features(df):
    '''
    Will encode each non-numerical column.
    '''
    for f in df.columns:
        if df[f].dtype=='object':
            print(f)
            lbl = preprocessing.LabelEncoder()
            lbl.fit(list(df[f].values.astype('str')) + list(df[f].values.astype('str')))
            df[f] = lbl.transform(list(df[f].values.astype('str')))    
    return df

In [98]:
X_train = encode_object_features(X_train)
X_test = encode_object_features(X_test)

one_month_lag
three_month_lag
six_month_lag
twelve_month_lag
timestamp_1
timestamp_3
timestamp_6
timestamp_12
big_market_raion
big_road1_1line
culture_objects_top_25
detention_facility_raion
ecology
incineration_raion
nuclear_reactor_raion
oil_chemistry_raion
product_type
radiation_raion
railroad_1line
railroad_terminal_raion
sub_area
thermal_power_plant_raion
water_1line
child_on_acc_pre_school
modern_education_share
old_education_build_share
one_month_lag
three_month_lag
six_month_lag
twelve_month_lag
timestamp_1
timestamp_3
timestamp_6
timestamp_12
big_market_raion
big_road1_1line
culture_objects_top_25
detention_facility_raion
ecology
incineration_raion
nuclear_reactor_raion
oil_chemistry_raion
product_type
radiation_raion
railroad_1line
railroad_terminal_raion
sub_area
thermal_power_plant_raion
water_1line
child_on_acc_pre_school
modern_education_share
old_education_build_share


In [99]:
feature_list = ['full_sq',
                'life_sq',
                'floor',
                'max_floor',
                'material',
                'build_year',
                'num_room',
                'kitch_sq',
                'state',
                'product_type',
                'sub_area',
                'indust_part',
                'school_education_centers_raion',
                'sport_objects_raion',
                'culture_objects_top_25_raion',
                'oil_chemistry_raion',
                'metro_min_avto',
                'green_zone_km',
                'industrial_km',
                'kremlin_km',
                'radiation_km',
                'ts_km',
                'fitness_km',
                'stadium_km',
                'additional_education_km',
                'cafe_count_1500_price_500',
                'cafe_count_1500_price_high',
                'cafe_count_2000_price_2500',
                'trc_sqm_5000',
                'cafe_count_5000',
                'cafe_count_5000_price_high',
                'delta_oil_1',
                'delta_oil_3',
                'delta_oil_6',
                'delta_oil_12',
                'delta_usdrub_1',
                'delta_usdrub_3',
                'delta_usdrub_6',
                'delta_usdrub_12',
                'delta_labor_force_1',
                'delta_labor_force_3',
                'delta_labor_force_6',
                'delta_labor_force_12']

In [100]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [101]:
np.sum(X_test.isnull())
# fillna(0)

0_13_all                      0
0_13_female                   0
0_13_male                     0
0_17_all                      0
0_17_female                   0
0_17_male                     0
0_6_all                       0
0_6_female                    0
0_6_male                      0
16_29_all                     0
16_29_female                  0
16_29_male                    0
7_14_all                      0
7_14_female                   0
7_14_male                     0
ID_big_road1                  0
ID_big_road2                  0
ID_bus_terminal               0
ID_metro                      0
ID_railroad_station_avto      0
ID_railroad_station_walk      0
ID_railroad_terminal          0
additional_education_km       0
additional_education_raion    0
area_m                        0
basketball_km                 0
big_church_count_1000         0
big_church_count_1500         0
big_church_count_2000         0
big_church_count_3000         0
                             ..
delta_us

## Selecting Features

In [102]:
from sklearn.ensemble import RandomForestRegressor

X = X_train.loc[:,feature_list]

y = y_train

x_test = X_test.loc[:,feature_list]

#sample_leaf_options = [50,100,200,500]

for leaf_size in sample_leaf_options :
    rfr50 = RandomForestRegressor(n_estimators=1000, oob_score=True, n_jobs = -1,
                                   verbose = 1, random_state = 42, min_samples_leaf = 50)
    # Train the model using the training sets and check score
    rfr50.fit(X, y.values)
    print "Leaf Size: %d" % (leaf_size)
    print rfr50.score(X, y)


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.3s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.0min finished


Leaf Size: 50


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    2.0s finished


0.505769477952


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   57.4s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.3min finished


Leaf Size: 100


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.5s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    2.0s finished


0.505769477952


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   56.8s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.1min finished


Leaf Size: 200


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    0.4s
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 792 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done 1000 out of 1000 | elapsed:    2.1s finished


0.505769477952


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   24.4s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:   57.9s
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.2min finished


KeyboardInterrupt: 

In [None]:
rfr50 = RandomForestRegressor(n_estimators=1000, oob_score=True, n_jobs = -1,
                              verbose = 1, random_state = 69, min_samples_leaf = 50)

In [48]:
rfr100 = RandomForestRegressor(n_estimators=1000, oob_score=True, n_jobs = -1,
                              verbose = 1, random_state = 69, min_samples_leaf = 100)

In [63]:
rfr50.fit(X, y.values.ravel())

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   28.5s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed:  2.3min finished


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=50,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=-1, oob_score=True, random_state=69,
           verbose=1, warm_start=False)

In [None]:
rfr50.predict(X_test)

In [None]:
id_test = test['id']
y_pred = np.round(predicted * 0.99)
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})