In [1]:
%matplotlib inline

In [2]:
# Imports

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np

# machine learning
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import cross_validation




In [3]:
# get training & test csv files as a DataFrame
train_df = pd.read_csv("../input/train.csv" )
test_df    = pd.read_csv("../input/test.csv")

In [4]:
# There are some columns with non-numerical values(i.e. dtype='object'),
# So, We will create a corresponding unique numerical value for each non-numerical value in a column of training and testing set.

from sklearn import preprocessing

for f in train_df.columns:
    if train_df[f].dtype == 'object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(np.unique(list(train_df[f].values) + list(test_df[f].values)))
        train_df[f] = lbl.transform(list(train_df[f].values))
        test_df[f]       = lbl.transform(list(test_df[f].values))

In [5]:
# define training and testing sets
# Remove Height and Weight. Rely on BMI

X_train = train_df.drop(["loss"],axis=1)
y_train = train_df["loss"]
X_test  = test_df.copy()

In [6]:
# Find the features that really matter in data set using Random Forest Classifier

feat_labels = X_train.columns
forest = ExtraTreesRegressor(n_estimators=500, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]
importances

array([  1.26396783e-02,   9.93803669e-03,   5.29652521e-03,
         1.81381179e-03,   4.11856326e-03,   3.93622154e-03,
         4.14290499e-03,   5.48915298e-04,   1.97181022e-03,
         3.43393528e-03,   4.29254370e-03,   4.64086970e-03,
         3.39900455e-02,   4.16958891e-03,   1.61670020e-03,
         9.74773099e-06,   2.29374987e-03,   1.34698778e-03,
         7.08815669e-04,   1.01390949e-03,   2.23909341e-04,
         2.39004561e-04,   3.25025958e-05,   4.41550212e-03,
         2.11667111e-03,   3.97726349e-03,   3.88520239e-03,
         4.34887967e-03,   2.67429829e-03,   1.67901645e-03,
         1.25529733e-03,   1.95633079e-03,   7.42808862e-04,
         6.65116724e-04,   4.77933279e-04,   3.75739082e-04,
         4.84910975e-03,   3.85531253e-03,   4.87391597e-03,
         1.56022926e-03,   2.79746123e-03,   2.35028160e-03,
         7.81323161e-04,   1.72576722e-03,   5.82809924e-03,
         2.02527111e-03,   6.13313087e-04,   7.68872719e-04,
         2.70279162e-04,

In [7]:
# identify the list of top features

for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

 1) cat80                          0.178627
 2) cat79                          0.079035
 3) cat57                          0.055112
 4) cont2                          0.040831
 5) cont7                          0.040695
 6) cat12                          0.033990
 7) cat81                          0.020051
 8) cont14                         0.016194
 9) cont12                         0.015983
10) cont11                         0.015601
11) cat105                         0.014260
12) cat87                          0.014109
13) cat100                         0.013211
14) id                             0.012640
15) cat72                          0.012600
16) cat112                         0.011810
17) cat101                         0.011163
18) cat106                         0.010927
19) cont3                          0.010300
20) cont5                          0.010244
21) cat113                         0.010038
22) cat1                           0.009938
23) cat110                      

In [8]:
# Use only top features
X_train = forest.transform(X_train, threshold=.005)
X_test = forest.transform(X_test, threshold=.005)



In [9]:
forest = ExtraTreesRegressor(n_estimators=500, random_state=0, n_jobs=-1)
forest.fit(X_train, y_train)

ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_split=1e-07, min_samples_leaf=1,
          min_samples_split=2, min_weight_fraction_leaf=0.0,
          n_estimators=500, n_jobs=-1, oob_score=False, random_state=0,
          verbose=0, warm_start=False)

In [10]:
y_pred = forest.predict(X_test)



In [11]:

output = pd.DataFrame({
        "ID": test_df["id"],
        "loss": y_pred[:]
    })
output.to_csv("../input/output-xtrees.csv", index=False)