In [1]:
%matplotlib inline

In [2]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np

# machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn import cross_validation




In [3]:
# get training & test csv files as a DataFrame
train_df = pd.read_csv("../input/train.csv" )
test_df    = pd.read_csv("../input/test.csv")

In [4]:
# There are some columns with non-numerical values(i.e. dtype='object'),
# So, We will create a corresponding unique numerical value for each non-numerical value in a column of training and testing set.

from sklearn import preprocessing

for f in train_df.columns:
    if train_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.unique(list(train_df[f].values) + list(test_df[f].values)))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f]       = lbl.transform(list(test_df[f].values))

In [5]:
# define training and testing sets
# Remove Height and Weight. Rely on BMI

X_train = train_df.drop(["loss"],axis=1)
y_train = train_df["loss"]
X_test  = test_df.copy()

In [6]:
# Find the features that really matter in data set using Random Forest Classifier

feat_labels = X_train.columns
forest = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
importances

array([  2.95496331e-02,   7.91008736e-03,   1.82880263e-03,
         9.03982587e-04,   1.65624625e-03,   1.65357103e-03,
         1.62206520e-03,   3.78818177e-04,   7.56325773e-04,
         1.93106581e-03,   1.63027535e-03,   1.96111533e-03,
         2.88116506e-02,   1.73508543e-03,   8.70519571e-04,
         3.26023238e-06,   1.26747157e-03,   6.45941187e-04,
         3.58490649e-04,   6.05969491e-04,   1.48812399e-04,
         1.05754880e-04,   2.30066610e-05,   2.24165216e-03,
         8.74054433e-04,   1.70412920e-03,   2.10795757e-03,
         2.91704346e-03,   1.06756486e-03,   9.48192011e-04,
         5.66091287e-04,   7.99535070e-04,   3.49095152e-04,
         3.05881694e-04,   2.23683191e-04,   2.00770468e-04,
         2.89452923e-03,   2.24086634e-03,   2.73005337e-03,
         7.21479262e-04,   1.22991492e-03,   9.62865269e-04,
         4.56906946e-04,   6.84483654e-04,   2.18911098e-03,
         8.90302915e-04,   3.88936872e-04,   4.59858290e-04,
         1.17608519e-04,

In [7]:
# identify the list of top features

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

 1) cat80                          0.225551
 2) cont7                          0.091583
 3) cat57                          0.053105
 4) cont2                          0.044899
 5) cat79                          0.040723
 6) cont14                         0.039435
 7) id                             0.029550
 8) cat12                          0.028812
 9) cat101                         0.019782
10) cat81                          0.019300
11) cat100                         0.015906
12) cat112                         0.015752
13) cont8                          0.015448
14) cont3                          0.015147
15) cont12                         0.015029
16) cont6                          0.014454
17) cont5                          0.014134
18) cont1                          0.013307
19) cont11                         0.013122
20) cont4                          0.012804
21) cont13                         0.011889
22) cat103                         0.011300
23) cat110                      

In [8]:
# Use only top features
X_train = forest.transform(X_train, threshold=.005)
X_test = forest.transform(X_test, threshold=.005)



In [9]:
forest = RandomForestRegressor(n_estimators=500, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=-1, oob_score=False, random_state=0,
           verbose=0, warm_start=False)

In [10]:
y_pred = forest.predict(X_test)



In [11]:

output = pd.DataFrame({
        "ID": test_df["id"],
        "loss": y_pred[:]
    })
output.to_csv("../input/output.csv", index=False)