In [42]:
# Supress unnecessary warnings so that presentation looks clean
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

print ("Reading dataset...")
#Read the train dataset
dataset = pd.read_csv('../../input/trainv3.csv')
#Read test dataset
dataset_test = pd.read_csv('../../input/testv3.csv')

print ("Length of dataset = " + str(len(dataset)))

## cat117 is cont 2
## cat 118 is High vs. Low
print ("Factorizing categorical variables...")
features = dataset.columns
cats = [feat for feat in features if 'cat' in feat]
## print cats
for feat in cats:
    dataset[feat] = pd.factorize(dataset[feat], sort=True)[0]
    dataset_test[feat] = pd.factorize(dataset_test[feat], sort=True)[0]
print ("Finished loading and factorized data.")

Reading dataset...
Length of dataset = 188318
Factorizing categorical variables...
Finished loading and factorized data.


In [25]:
print ("Preprocessing the Data...")
## Response
shift = 200
response = np.log(dataset['loss'].values + shift)

ids_train = dataset['id']

## Drop the response from our dataset
dataset = dataset.drop(['loss', 'id'], 1)

print "Responses:"
print response

Preprocessing the Data...
Responses:
[ 7.78870066  7.30222685  8.07249545 ...,  8.69326862  7.47469844
  8.50749027]


In [66]:
print("-")*50
print "Random Forest Algo with KFold"
print("-")*50
#Evaluation of various combinations of RandomForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import cross_validation
from sklearn.cross_validation import KFold

print ("Creating Training and Validation sets...")
#get the number of rows and columns

seed = 0

## CODE FOR 1-FOLD VALIDATION
## FOR REFERENCE:
##X_train is 90% training features
##Y_train is 90% training responses

##X_val is the 10% validation features
##Y_val is the 10% validation responses

#X_train, X_val, Y_train, Y_val = cross_validation.train_test_split(dataset, response, test_size=.1, random_state=seed)
#print "Length of Validation Set:" + str(len(Y_val))
#print "Length of Train Set:" + str(len(Y_train))
#print "Ratio: " + str(len(Y_val) / float(len(Y_train)))


## K-Fold Validation
print ("Creating K-fold validation dataset indices")
n_folds = 10
kf = KFold(dataset.shape[0], n_folds=n_folds, shuffle = True)
pred_test = 0
temp_cv_score = []
cv_loss = pd.DataFrame(columns=["id","loss"])

for i, (train_index, test_index) in enumerate(kf):
    print "-" * 80
    print('\nFold %d' % (i + 1))

    X_train, X_val = dataset.iloc[train_index], dataset.iloc[test_index]
    Y_train, Y_val = response[train_index], response[test_index]
    print "Training size: " + str(len(X_train))
    print "Validation size: " + str(len(X_val))
    print "Total size: " + str(len(X_train) + len(X_val))

    
    seed = 0
    print "Training Random Forest Model..."
    n_estimators = 50
    ## n_jobs=-1: Use all cores
    ## n_estimators: Create random forest of 200 trees
    model = RandomForestRegressor(n_jobs=-1,
                                  n_estimators=n_estimators,
                                  max_features="sqrt",
                                  oob_score = True,
                                  verbose=1,
                                  random_state=seed)
    model.fit(X_train, Y_train)
    importances = model.feature_importances_
    
    results = np.exp(model.predict(X_val)) - 200
    cv_loss = pd.concat([cv_loss, pd.DataFrame({"id": ids_train[test_index], "loss": results})])
    
    MAE = mean_absolute_error(np.exp(Y_val) - shift, np.exp(model.predict(X_val)) - shift)
    print "MAE: " + str(MAE)
    
    pred_test += np.exp(model.predict(dataset_test.drop(['id'],1))) - shift

# filename = str(n_folds) + "Fold_" + str(n_estimators) + "Forest_CV_losses.csv"
# cv_loss.to_csv(filename, index=False, sort=True)

# submission = pd.DataFrame()
# submission['id'] = dataset_test['id']
# submission['loss'] = pred_test/n_folds
# filename = str(n_folds) + "Fold_" + str(n_estimators) + "Forest_Predictions_losses.csv"
# submission.to_csv(filename, index=False, sort=True)


--------------------------------------------------
Random Forest Algo with KFold
--------------------------------------------------
Creating Training and Validation sets...
Creating K-fold validation dataset indices
--------------------------------------------------------------------------------

Fold 1
Training size: 169486
Validation size: 18832
Total size: 188318
Training Random Forest Model...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   18.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   20.9s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.4s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.3s finished


MAE: 407.787791607


ValueError: Number of features of the model must  match the input. Model n_features is 132 and  input n_features is 130 

In [65]:
cv_50 = pd.read_csv('10Fold_50Forest_CV_losses.csv')
cv_200 = pd.read_csv('10Fold_200Forest_CV_losses.csv')



mean_absolute_error(np.array(dataset['loss']), np.array(cv_50.sort(['id'], ascending=1)['loss']))

1212.2262741234279

In [17]:
filename = str(n_folds) + "Fold_" + str(n_estimators) + "Forest_CV_losses.csv"
cv_loss.to_csv(filename, index=False, sort=True)

submission = pd.DataFrame()
submission['id'] = dataset_test['id']
submission['loss'] = pred_test/n_folds
filename = str(n_folds) + "Fold_" + str(n_estimators) + "Forest_Predictions_losses.csv"
submission.to_csv(filename, index=False, sort=True)

In [39]:
print("-")*50
print "Random Forest Algo with 1 Fold Validation"
print("-")*50
#Evaluation of various combinations of RandomForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import cross_validation
from sklearn.cross_validation import KFold

print ("Creating Training and Validation sets...")
#get the number of rows and columns

seed = 0

## CODE FOR 1-FOLD VALIDATION
## FOR REFERENCE:
#X_train is 90% training features
#Y_train is 90% training responses

#X_val is the 10% validation features
#Y_val is the 10% validation responses

X_train, X_val, Y_train, Y_val = cross_validation.train_test_split(dataset, response, test_size=.1, random_state=seed)
print "Length of Validation Set:" + str(len(Y_val))
print "Length of Train Set:" + str(len(Y_train))
print "Ratio: " + str(len(Y_val) / float(len(Y_train)))

n_list = np.array([50])

for i in n_list:
    seed = 0
    print "Training Random Forest Model..."
    ## n_jobs=-1: Use all cores
    ## n_estimators: Create random forest of 50 trees
    model = RandomForestRegressor(n_jobs=-1,n_estimators=i,random_state=seed)
    model.fit(X_train, Y_train)
    
    results = np.exp(model.predict(X_val)) - 200
    print pd.DataFrame(results)
    # pd.DataFrame(results).to_csv("results.csv", index=False)

    result = i, mean_absolute_error(np.exp(Y_val), np.exp(model.predict(X_val)))
    print result
    
predictions = np.exp(model.predict(dataset_test.drop(['id'],1))) - shift
final = pd.DataFrame({"id": dataset_test['id'], "loss": predictions})
print final
final.to_csv("predictions.csv", index=False)



--------------------------------------------------
Random Forest Algo with 1 Fold Validation
--------------------------------------------------
Creating Training and Validation sets...
Length of Validation Set:18832
Length of Train Set:169486
Ratio: 0.111112422265
Training Random Forest Model...


KeyboardInterrupt: 

In [38]:
## Code for Tuning number of trees

print("-")*50
print "Random Forest Algo with 1 Fold Validation"
print("-")*50
#Evaluation of various combinations of RandomForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn import cross_validation
from sklearn.cross_validation import KFold

print ("Creating Training and Validation sets...")
#get the number of rows and columns

seed = 0

## CODE FOR 1-FOLD VALIDATION
## FOR REFERENCE:
#X_train is 90% training features
#Y_train is 90% training responses

#X_val is the 10% validation features
#Y_val is the 10% validation responses

X_train, X_val, Y_train, Y_val = cross_validation.train_test_split(dataset, response, test_size=.1, random_state=seed)
print "Length of Validation Set:" + str(len(Y_val))
print "Length of Train Set:" + str(len(Y_train))
print "Ratio: " + str(len(Y_val) / float(len(Y_train)))

MAE = []
n_list = np.array([1,5,10,20,30,40,50,60,70,80,90,100,200,300])

for i in n_list:
    seed = 0
    print "Training Random Forest Model..."
    print str(i) + " Number of Trees..."
    ## n_jobs=-1: Use all cores
    ## n_estimators: Create random forest of 50 trees
    model = RandomForestRegressor(n_jobs=-1,
                                  n_estimators=i,
                                  max_features="sqrt",
                                  oob_score = True,
                                  verbose=1,
                                  random_state=seed)
    model.fit(X_train, Y_train)
    
    results = np.exp(model.predict(X_val)) - 200

    result = i, mean_absolute_error(np.exp(Y_val), np.exp(model.predict(X_val)))
    MAE.append(result)
    print MAE

pd.DataFrame(MAE).to_csv("MAE.csv", index=False)
    



--------------------------------------------------
Random Forest Algo with 1 Fold Validation
--------------------------------------------------
Creating Training and Validation sets...
Length of Validation Set:18832
Length of Train Set:169486
Ratio: 0.111112422265
Training Random Forest Model...
1 Number of Trees...


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s finished


              0
0       5293.62
1       1263.19
2       4570.22
3       1438.98
4       1505.39
5       5376.31
6       2527.29
7        343.53
8       1623.05
9       3173.48
10      2058.37
11       168.28
12      2963.10
13      6587.07
14      3601.19
15      1189.41
16      4925.34
17      3767.13
18      2447.72
19      3128.75
20      2027.65
21      1072.48
22      3140.14
23     10427.95
24      4502.35
25      1322.84
26       364.92
27      1427.01
28      2842.34
29      1527.94
...         ...
18802   1299.66
18803   2376.21
18804   1891.40
18805   3069.22
18806   9344.65
18807   2989.34
18808   2085.28
18809   2271.42
18810    760.10
18811   6189.07
18812   5118.88
18813   1574.08
18814   4174.21
18815   6491.26
18816   1247.00
18817   3553.49
18818   2520.79
18819   2365.29
18820   5287.48
18821   6020.20
18822   3148.59
18823    556.04
18824   1296.57
18825   4618.31
18826   1548.70
18827   7715.66
18828   1340.80
18829   2341.59
18830   1368.48
18831  17057.69

[18832 

[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    1.9s finished
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    0.0s finished
[Parallel(n_jobs=4)]: Done   5 out of   5 | elapsed:    0.0s finished


                 0
0      2326.235473
1      1559.673642
2      2572.284275
3      1142.524491
4      1224.885503
5      5669.638126
6      2574.036354
7       693.016659
8      2897.055815
9      1970.589251
10     1317.997208
11     1197.492619
12     3293.614112
13     5273.170369
14     5757.628589
15     1851.396448
16     5410.808749
17     4165.798993
18     1599.075079
19     4662.604408
20     4657.233118
21     2305.539100
22     5420.014881
23     3459.915186
24     2423.381073
25     5508.319311
26      769.183051
27     4956.997442
28      826.651042
29     1753.985892
...            ...
18802  1316.278812
18803  1416.284184
18804  1717.866919
18805  1658.138829
18806  3225.714700
18807  2746.579223
18808  1309.540531
18809  2550.842234
18810  1876.732015
18811  4707.850717
18812  3076.585430
18813   964.364898
18814  4581.998676
18815  2581.876240
18816  2290.888197
18817  2904.468890
18818  4562.954669
18819  2924.214820
18820  3924.946055
18821  2741.374813
18822  4238.

[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.3s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    0.1s finished


                 0
0      1558.807206
1      1573.535422
2      2562.159900
3      1058.620938
4      1254.200371
5      5924.439859
6      2194.441211
7       782.402467
8      1945.179853
9      1933.808168
10     1700.564322
11     1547.733575
12     2861.508081
13     5109.921649
14     6822.961530
15     2392.384345
16     4752.564644
17     3973.081019
18     1485.303162
19     3387.468647
20     3336.717547
21     2001.515734
22     5072.579908
23     3256.972101
24     1867.105165
25     5010.682640
26      888.205939
27     5486.279166
28      959.241217
29     1973.103562
...            ...
18802  1310.616893
18803  1403.106321
18804  1661.461504
18805  2055.289094
18806  2607.421322
18807  2432.208189
18808  1429.730889
18809  3613.571092
18810  1925.480026
18811  4853.984841
18812  2912.462030
18813  1141.047187
18814  4778.820858
18815  2285.728599
18816  1839.886135
18817  2519.640453
18818  3911.764597
18819  3130.450513
18820  3963.583905
18821  2673.375686
18822  5257.

[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    9.4s finished
[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    0.1s finished


                 0
0      1584.167866
1      1580.144794
2      2317.051638
3      1013.442431
4      1471.919153
5      6423.623601
6      2184.585810
7       675.166487
8      1823.202217
9      2105.296162
10     1480.808468
11     1245.024255
12     2783.350387
13     5281.092048
14     5112.842680
15     2280.216673
16     4623.195672
17     3923.976336
18     1356.378372
19     3516.288817
20     3059.726800
21     2745.555640
22     4092.869927
23     2957.667101
24     1598.993730
25     4513.600406
26      900.873735
27     4713.284996
28     1385.670667
29     1954.449261
...            ...
18802  1748.639019
18803  1309.512216
18804  1732.916638
18805  1725.469765
18806  3132.512798
18807  1954.641918
18808  1563.888303
18809  3031.611657
18810  1711.491034
18811  3223.804242
18812  2835.054550
18813  1264.825652
18814  5099.290453
18815  2662.310196
18816  2028.729129
18817  2740.176971
18818  3019.536120
18819  3150.570727
18820  3631.424774
18821  2565.243330
18822  4653.

[Parallel(n_jobs=4)]: Done  20 out of  20 | elapsed:    0.2s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134)]
(20, 1232.4228383456134)
Training Random Forest Model...
30 Number of Trees...


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   14.7s finished
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.1s finished


                 0
0      1860.295337
1      1586.926124
2      2165.809621
3       995.229174
4      1396.881891
5      6774.653427
6      2095.489675
7       722.425213
8      1937.551957
9      2214.926283
10     1400.096302
11     1209.757344
12     2655.921603
13     4721.310986
14     4346.050096
15     2154.343163
16     4302.637654
17     3840.457565
18     1328.485991
19     3482.563925
20     3418.768513
21     2983.857576
22     4517.356964
23     2835.536328
24     1459.967168
25     4268.217845
26      816.083108
27     4945.747834
28     1466.031253
29     1931.707595
...            ...
18802  1903.954528
18803  1249.495255
18804  1783.898954
18805  1798.883035
18806  3027.586548
18807  1616.228663
18808  1695.511914
18809  2891.775849
18810  2108.470301
18811  3163.150648
18812  2803.790603
18813  1239.465022
18814  5104.501421
18815  2505.684740
18816  2010.302297
18817  2624.259590
18818  2832.311193
18819  3249.620880
18820  3361.360936
18821  2470.785897
18822  5301.

[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:    0.2s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041)]
(30, 1223.7479485529041)
Training Random Forest Model...
40 Number of Trees...


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   14.2s finished
[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    0.2s finished


                 0
0      2011.253727
1      1637.299977
2      2345.777542
3       954.700150
4      1398.980884
5      6717.011839
6      2195.095014
7       648.684148
8      1853.598339
9      2043.024751
10     1464.409033
11     1255.870127
12     2754.444439
13     4636.669315
14     4387.896629
15     2115.841897
16     4287.299682
17     3865.765708
18     1327.223001
19     3237.279133
20     4198.719972
21     2726.544841
22     4525.532741
23     2623.295615
24     1404.483844
25     4053.145216
26      784.034412
27     5402.636353
28     1551.126550
29     1879.483825
...            ...
18802  1829.576950
18803  1252.552671
18804  1769.939875
18805  1765.364110
18806  3140.279554
18807  1481.958583
18808  1675.305402
18809  2933.692318
18810  2119.547672
18811  2806.835933
18812  3099.196636
18813  1366.282581
18814  4773.472546
18815  2406.947915
18816  2105.428869
18817  2716.224458
18818  3089.030173
18819  3162.704931
18820  3218.978708
18821  2488.492190
18822  4953.

[Parallel(n_jobs=4)]: Done  40 out of  40 | elapsed:    0.2s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311)]
(40, 1218.9802203917311)
Training Random Forest Model...
50 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   19.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   23.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.3s finished


                 0
0      1971.047709
1      1584.628926
2      2506.318442
3       931.091278
4      1386.834450
5      6497.988087
6      2227.394405
7       628.452792
8      1841.568223
9      2052.084564
10     1390.708779
11     1360.137401
12     2669.161377
13     4571.603344
14     4532.501397
15     2111.970140
16     4350.557220
17     3882.717371
18     1398.908345
19     3278.765827
20     4175.014890
21     2792.350065
22     4555.603174
23     2922.393516
24     1446.122143
25     3897.187048
26      753.698721
27     5358.293068
28     1594.238963
29     1742.267854
...            ...
18802  1939.236462
18803  1272.849622
18804  1749.795793
18805  1792.328771
18806  3280.207542
18807  1541.320253
18808  1658.665308
18809  2856.828832
18810  2220.948072
18811  2743.668695
18812  3058.589831
18813  1429.453499
18814  4746.754887
18815  2535.408966
18816  2006.379247
18817  2621.202491
18818  2977.755958
18819  3300.430563
18820  3192.432733
18821  2594.231444
18822  4538.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  50 out of  50 | elapsed:    0.3s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693)]
(50, 1216.5838093306693)
Training Random Forest Model...
60 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.7s
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed:   23.7s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:    0.3s finished


                 0
0      1855.915425
1      1571.988381
2      2466.567915
3       902.469259
4      1339.144153
5      6456.402836
6      2234.728253
7       648.642841
8      1774.126583
9      2018.865005
10     1488.644491
11     1384.056570
12     2740.335641
13     4651.570982
14     4508.371103
15     2107.008993
16     4446.623130
17     3838.961965
18     1450.084629
19     3244.925359
20     3916.834677
21     2747.399958
22     4608.108277
23     3078.428495
24     1452.426998
25     3544.157467
26      767.373552
27     5489.240462
28     1483.172380
29     1718.367970
...            ...
18802  1917.015339
18803  1312.684948
18804  1759.026009
18805  1795.332624
18806  3367.367284
18807  1463.424943
18808  1591.102174
18809  2953.811892
18810  2180.314894
18811  2765.158782
18812  2925.738859
18813  1386.178868
18814  4678.221792
18815  2364.362878
18816  1934.818941
18817  2660.628667
18818  2999.824117
18819  3154.681183
18820  3108.015746
18821  2678.878637
18822  4446.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.3s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:    0.4s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693), (60, 1213.693331209957)]
(60, 1213.693331209957)
Training Random Forest Model...
70 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.8s
[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   23.6s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  70 out of  70 | elapsed:    0.4s finished


                 0
0      1920.523039
1      1588.658759
2      2530.388912
3       883.703479
4      1334.018101
5      6543.255296
6      2269.718250
7       648.145291
8      1761.746008
9      1992.474003
10     1429.142196
11     1503.041588
12     2758.632421
13     4643.359628
14     4420.316792
15     2104.485248
16     4383.533000
17     3858.958779
18     1405.132264
19     3276.869309
20     3786.794376
21     2838.222827
22     4451.958946
23     2970.304872
24     1484.209143
25     3453.690247
26      776.288952
27     5150.407401
28     1484.126455
29     1766.437022
...            ...
18802  1936.951606
18803  1302.156126
18804  1766.411778
18805  1784.939881
18806  3344.103987
18807  1438.792189
18808  1616.919220
18809  2918.091173
18810  2195.746250
18811  2829.564413
18812  2808.728093
18813  1339.255095
18814  4904.340497
18815  2281.920457
18816  1937.284594
18817  2747.821400
18818  3139.565396
18819  3110.297547
18820  3187.927922
18821  2692.558325
18822  4508.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  70 out of  70 | elapsed:    0.4s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693), (60, 1213.693331209957), (70, 1211.9008216961377)]
(70, 1211.9008216961377)
Training Random Forest Model...
80 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   16.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   30.1s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:    0.5s finished


                 0
0      1859.767409
1      1566.833987
2      2562.173278
3       875.745861
4      1320.698204
5      6461.517960
6      2252.471128
7       646.065752
8      1667.588215
9      1959.858056
10     1437.636683
11     1508.267460
12     2750.800200
13     4570.693049
14     4235.845956
15     2083.177138
16     4480.447936
17     3806.973699
18     1419.770445
19     3305.564788
20     3848.442329
21     2826.288941
22     4178.041774
23     2892.582171
24     1515.811527
25     3444.455891
26      822.497967
27     5139.860750
28     1420.013188
29     1693.788055
...            ...
18802  1899.050833
18803  1261.320470
18804  1770.508636
18805  1813.698444
18806  3340.622270
18807  1424.813209
18808  1617.204110
18809  2766.114793
18810  2176.010762
18811  2923.795338
18812  2672.361698
18813  1355.245843
18814  4924.854241
18815  2363.743471
18816  1940.739792
18817  2634.685634
18818  3040.613588
18819  3110.323677
18820  3045.474508
18821  2696.019728
18822  4412.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  80 out of  80 | elapsed:    0.5s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693), (60, 1213.693331209957), (70, 1211.9008216961377), (80, 1210.8823532415558)]
(80, 1210.8823532415558)
Training Random Forest Model...
90 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   17.4s
[Parallel(n_jobs=-1)]: Done  90 out of  90 | elapsed:   38.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    0.5s finished


                 0
0      1923.892688
1      1614.848140
2      2507.732073
3       883.599844
4      1322.889902
5      6399.272215
6      2304.961231
7       664.301480
8      1694.046220
9      1935.557500
10     1406.987357
11     1477.033618
12     2735.380717
13     4494.247812
14     4159.655120
15     2003.501373
16     4504.128026
17     3729.420033
18     1399.387505
19     3365.177448
20     4027.615699
21     2801.969681
22     4191.703734
23     2876.329843
24     1457.990815
25     3662.448527
26      813.800717
27     5007.370518
28     1368.531993
29     1612.262976
...            ...
18802  1868.747812
18803  1268.682347
18804  1784.190457
18805  1773.854348
18806  3500.327096
18807  1441.553841
18808  1613.076387
18809  2863.608231
18810  2124.307436
18811  2967.103199
18812  2814.703881
18813  1364.070294
18814  4957.088099
18815  2333.711626
18816  1898.189506
18817  2554.665657
18818  3038.783364
18819  3142.580724
18820  3135.704741
18821  2709.565330
18822  4412.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    0.4s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693), (60, 1213.693331209957), (70, 1211.9008216961377), (80, 1210.8823532415558), (90, 1210.3924708584934)]
(90, 1210.3924708584934)
Training Random Forest Model...
100 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.0s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   36.0s finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


                 0
0      1911.929954
1      1659.378933
2      2504.430868
3       874.746435
4      1340.393827
5      6424.830964
6      2195.458465
7       700.810707
8      1684.264012
9      1936.155636
10     1447.580255
11     1467.660467
12     2725.826562
13     4616.332652
14     4220.835577
15     2104.320423
16     4507.362860
17     3680.660325
18     1375.743044
19     3359.012744
20     4111.109523
21     2810.735070
22     4055.593395
23     2931.888442
24     1456.235879
25     3597.361202
26      798.578462
27     4898.949235
28     1420.657291
29     1599.451790
...            ...
18802  1814.518696
18803  1235.641750
18804  1783.169131
18805  1815.026029
18806  3627.641005
18807  1469.741407
18808  1653.081833
18809  2813.127628
18810  2120.914653
18811  3063.305525
18812  2890.007820
18813  1383.169497
18814  4966.492462
18815  2321.681223
18816  1925.101688
18817  2648.169623
18818  3039.041174
18819  3176.492935
18820  3203.430082
18821  2647.669915
18822  4302.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.5s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693), (60, 1213.693331209957), (70, 1211.9008216961377), (80, 1210.8823532415558), (90, 1210.3924708584934), (100, 1209.1758397649126)]
(100, 1209.1758397649126)
Training Random Forest Model...
200 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.6s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  1.1min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    3.6s finished


                 0
0      1962.992812
1      1667.208801
2      2423.261602
3       898.379891
4      1390.782489
5      6481.256139
6      2146.699576
7       690.437737
8      1684.077603
9      1927.712907
10     1478.381804
11     1701.710821
12     2788.123706
13     4748.627879
14     4148.264153
15     2124.983633
16     4317.309588
17     3734.699331
18     1367.984378
19     3315.820907
20     4535.666985
21     2571.940058
22     4050.569462
23     2926.271483
24     1504.339749
25     3187.103979
26      796.870583
27     4832.689103
28     1404.889174
29     1572.446755
...            ...
18802  1803.491887
18803  1209.303513
18804  1780.877113
18805  1799.574944
18806  3657.514298
18807  1532.831122
18808  1632.310676
18809  2952.561023
18810  2125.383922
18811  2998.286986
18812  2757.009592
18813  1405.463568
18814  4786.558402
18815  2451.691009
18816  1943.622424
18817  2640.027353
18818  2977.771603
18819  3109.535493
18820  3161.918351
18821  2726.739913
18822  4101.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    1.0s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:    1.0s finished


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693), (60, 1213.693331209957), (70, 1211.9008216961377), (80, 1210.8823532415558), (90, 1210.3924708584934), (100, 1209.1758397649126), (200, 1206.6705722102038)]
(200, 1206.6705722102038)
Training Random Forest Model...
300 Number of Trees...


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   15.5s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.8min finished
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.8s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.1s
[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    6.3s finished


                 0
0      1941.420347
1      1691.079461
2      2454.234295
3       876.724950
4      1447.007655
5      6557.705149
6      2208.362830
7       690.284767
8      1684.471457
9      1923.525702
10     1452.560624
11     1638.057969
12     2823.575356
13     4875.581306
14     4109.190970
15     2154.650390
16     4359.920074
17     3694.189355
18     1346.893955
19     3388.073801
20     4497.017327
21     2514.434282
22     3966.317647
23     3028.103837
24     1504.595061
25     3032.022506
26      787.486694
27     4715.149150
28     1393.551516
29     1612.452515
...            ...
18802  1841.502809
18803  1197.691072
18804  1807.892918
18805  1804.012688
18806  3726.871683
18807  1589.677336
18808  1629.929283
18809  2924.494527
18810  2068.124556
18811  3051.511748
18812  2711.005729
18813  1435.427490
18814  4901.144229
18815  2473.117232
18816  1977.622081
18817  2697.618113
18818  2934.989116
18819  3102.609460
18820  3138.825655
18821  2668.611036
18822  4130.

[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.9s
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed:    4.0s


[(1, 1852.3735536620445), (5, 1328.7001159845779), (10, 1269.4253326089677), (20, 1232.4228383456134), (30, 1223.7479485529041), (40, 1218.9802203917311), (50, 1216.5838093306693), (60, 1213.693331209957), (70, 1211.9008216961377), (80, 1210.8823532415558), (90, 1210.3924708584934), (100, 1209.1758397649126), (200, 1206.6705722102038), (300, 1205.305347249322)]
(300, 1205.305347249322)


[Parallel(n_jobs=4)]: Done 300 out of 300 | elapsed:    6.2s finished


In [43]:


model.predict(X_val)

result = mean_absolute_error(np.exp(Y_val), np.exp(model.predict(X_val)))
print result


1199.52252619


In [39]:
print(np.max(np.exp(Y_val)))
print(np.max(np.exp(model.predict(X_val))))

print(np.min(np.exp(Y_val)))
print(np.min(np.exp(model.predict(X_val))))

79823.52
74284.6499438
208.4
212.468673402


In [45]:
predictions = np.exp(model.predict(dataset_test.drop(['id'],1))) - shift
pd.DataFrame(test_results).to_csv("test_results.csv", index=False)

In [46]:
predictions = np.exp(model.predict(dataset_test.drop(['id'],1))) - shift
final = pd.DataFrame({"id": dataset_test['id'], "loss": predictions})
print final
final.to_csv("predictions.csv", index=False)

            id          loss
0            4   1564.566853
1            6   1867.538184
2            9  17609.892638
3           12   8705.106719
4           15    667.158580
5           17   4467.462705
6           21   1587.517670
7           28   1082.680701
8           32   3196.317715
9           43   8423.991231
10          46   3785.479402
11          50   1257.128050
12          54   1015.295290
13          62   1819.752481
14          70   2082.138470
15          71   9916.939520
16          75   3882.482637
17          77   2714.188691
18          81   5479.230856
19          83   2339.194765
20          87   2300.101534
21          97   1769.455781
22         103   1392.603501
23         119   1318.940507
24         120   1837.763600
25         127   1138.647078
26         138   3378.242777
27         141   2632.653634
28         148    952.278967
29         150   3253.852208
...        ...           ...
125516  587482   1396.866419
125517  587484   4719.304256
125518  587489