In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_absolute_error
from itertools import product
import utils as ut
reload(ut)
%matplotlib inline

In [5]:
# Read the pre-split train / validation / test partitions; the first CSV
# column of each file becomes the row index of its frame.
train, val, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('data/ds3_train.csv', 'data/ds3_val.csv', 'data/ds3_test.csv')
)

In [6]:
# Separate target variables.
# Every column except the last three is used as a feature; the targets are
# selected by name.  `.ix` is deprecated (and removed in pandas 1.0), so use
# the explicit positional (`.iloc`) and label-based (`.loc`) indexers.
X, y_e, y_a = train.iloc[:, :-3], train.loc[:, 'elevation'], train.loc[:, 'azimuth']
X_val, y_val_e, y_val_a = val.iloc[:, :-3], val.loc[:, 'elevation'], val.loc[:, 'azimuth']
X_test, y_test_e, y_test_a = test.iloc[:, :-3], test.loc[:, 'elevation'], test.loc[:, 'azimuth']

In [7]:
# Sanity check: (rows, features) of each partition, space-separated on one line.
print('%s %s %s' % (X.shape, X_val.shape, X_test.shape))

(42500, 1448) (6250, 1448) (7500, 1448)


### Standardize features

In [8]:
# Fit the standardizer on the training features only.
scaler = preprocessing.StandardScaler()
scaler.fit(X)

In [9]:
# Feature normalization: push every partition through the fitted scaler.
# NOTE(review): X / X_val / X_test are rebound here, so running this cell a
# second time would transform already-transformed data.
X, X_val, X_test = (scaler.transform(part) for part in (X, X_val, X_test))

### Grid Search method

In [15]:
def grid_search(X, y, X_val, y_val, loss_func, model_grid):
    """Exhaustively evaluate every hyper-parameter combination of several models.

    For each entry of ``model_grid`` the cartesian product of its candidate
    parameter values is enumerated; a fresh model is built for every
    combination, fitted on ``(X, y)`` and scored on both the training data and
    ``(X_val, y_val)``.

    Parameters
    ----------
    X, y :
        Training features and targets, passed to ``model.fit``.
    X_val, y_val :
        Validation features and targets.
    loss_func : callable
        Called as ``loss_func(y_true, y_pred)``; its return value is stored
        as the loss.
    model_grid : sequence of dict
        Each entry has a ``'model'`` key (a callable accepting the parameters
        as keyword arguments and returning an object with ``fit`` and
        ``predict``) and a ``'params'`` key (dict mapping a parameter name to
        an iterable of candidate values).

    Returns
    -------
    dict
        ``{str(model): {str(params): [train_loss, val_loss]}}``.  Entries
        sharing the same ``str(model)`` are merged rather than overwritten.
    """
    res = {}
    for j, entry in enumerate(model_grid):
        skmodel = entry['model']
        params_to_test = entry['params']

        # Freeze the parameter names once so names and candidate values are
        # guaranteed to be paired in the same order.
        names = list(params_to_test.keys())
        candidates = [params_to_test[name] for name in names]

        tested_params = []
        loss_train_res = []
        loss_val_res = []
        for param_comb in product(*candidates):
            params = dict(zip(names, param_comb))
            print('Testing j: %s param_comb: %s' % (j, params))
            model = skmodel(**params)
            model.fit(X, y)
            y_pred = model.predict(X_val)
            y_pred_train = model.predict(X)
            # Ground truth goes first: this is the (y_true, y_pred) convention
            # of sklearn metrics and matters for any asymmetric loss.
            loss = loss_func(y_val, y_pred)
            loss_train = loss_func(y, y_pred_train)
            tested_params.append(str(params))
            loss_val_res.append(loss)
            loss_train_res.append(loss_train)

        # Materialise the triples once; they are both printed and stored.
        summary = list(zip(tested_params, loss_train_res, loss_val_res))
        print('%s %s %s' % (j, str(skmodel), summary))
        # setdefault + update keeps earlier results when the same model class
        # appears in more than one grid entry.
        res.setdefault(str(skmodel), {}).update(
            (p, [l_t, l_v]) for (p, l_t, l_v) in summary
        )
    return res

In [16]:
# Candidate estimators and, for each, the hyper-parameter values to sweep.
model_grid = [
    {
        "model": DecisionTreeRegressor,
        "params": {
            'max_depth': [10, 11],
            'min_samples_leaf': [1, 50, 100],
            'min_samples_split': [2, 50, 100],
        },
    },
    {
        "model": RandomForestRegressor,
        "params": {
            'n_estimators': [50, 100, 200],
            'max_features': ['sqrt', 'log2'],
        },
    },
    {
        "model": GradientBoostingRegressor,
        "params": {
            'loss': ['ls', 'lad'],
            'n_estimators': [100, 200],
            'subsample': [0.8, 1],
            'max_features': [None, 'sqrt', 'log2'],
        },
    },
    {
        "model": KNeighborsRegressor,
        "params": {
            'n_neighbors': [5, 10, 20],
            'weights': ['uniform', 'distance'],
            'n_jobs': [-1],
        },
    },
]

# Sweep every combination for the elevation target, scoring with MAE, and
# print the nested result dictionary returned by grid_search.
print(grid_search(X, y_e, X_val, y_val_e, mean_absolute_error, model_grid))

Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 10, 'min_samples_leaf': 1}
Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 10, 'min_samples_leaf': 10}
Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 10, 'min_samples_leaf': 20}
Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 10, 'min_samples_leaf': 40}
Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 11, 'min_samples_leaf': 1}
Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 11, 'min_samples_leaf': 10}
Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 11, 'min_samples_leaf': 20}
Tenting j: 0 param_comb: {'min_samples_split': 2, 'max_depth': 11, 'min_samples_leaf': 40}
Tenting j: 0 param_comb: {'min_samples_split': 20, 'max_depth': 10, 'min_samples_leaf': 1}
Tenting j: 0 param_comb: {'min_samples_split': 20, 'max_depth': 10, 'min_samples_leaf': 10}
Tenting j: 0 param_comb: {'min_samples_split': 20, 'max_depth': 10, 'min_samples_leaf': 20}

In [9]:
from sklearn.neighbors import KNeighborsRegressor

# Does it help KNN?
# Distance-weighted 5-nearest-neighbours regressor for elevation, using all
# available cores (n_jobs=-1) and scored with MAE on the validation split.
knn_regr = KNeighborsRegressor(weights='distance', n_neighbors=5, n_jobs=-1)
knn_regr.fit(X, y_e)

y_pred_e = knn_regr.predict(X_val)
loss = mean_absolute_error(y_val_e, y_pred_e)

print(loss)

18.8523180509


In [None]:
# Depth-limited regression tree for elevation, scored with MAE on the test set.
depth = 10
# The original `params = [max_depth=depth]` was a SyntaxError and the tree was
# then built with default settings, so `depth` was never applied.  Keep the
# parameters in a dict and forward them to the estimator.
params = {'max_depth': depth}
regr = DecisionTreeRegressor(**params)
regr.fit(X, y_e)
# Predict
y_pred_e = regr.predict(X_test)
loss = mean_absolute_error(y_test_e, y_pred_e)

print(loss)

In [None]:
# Build dataframe to compare real values with predictions on test set.
# Both series are aligned on the test index, then joined column-wise with the
# last three columns of the test frame and written to CSV.
y_test = pd.Series(y_test_e, index=test.index)
y_pred = pd.Series(y_pred_e, index=test.index)
# `.ix` is deprecated (removed in pandas 1.0); the slice is positional, so
# `.iloc` is the explicit equivalent.
new_df = pd.concat([y_test, y_pred, test.iloc[:, -3:]], axis=1)
new_df.to_csv('results/regtree15_all_features.csv')

In [None]:
# Regression Random Forest for elevation: 100 trees, sqrt(n_features)
# candidates per split, scored with MAE on the test set.
# NOTE(review): no random_state is set, so the loss varies between runs.
forest = RandomForestRegressor(max_features='sqrt', n_estimators=100).fit(X, y_e)

y_pred_e = forest.predict(X_test)
loss = mean_absolute_error(y_test_e, y_pred_e)

print(loss)

In [None]:
# Regression Random Forest for azimuth: same configuration as the elevation
# forest (100 trees, sqrt(n_features) candidates per split).
# NOTE(review): `loss_azimuth` is not defined in the visible cells -- confirm
# it is provided elsewhere (e.g. by the `utils` module) before running.
forest_azimuth = RandomForestRegressor(max_features='sqrt', n_estimators=100).fit(X, y_a)

y_pred_a = forest_azimuth.predict(X_test)
loss = loss_azimuth(y_test_a, y_pred_a)

print(loss)

27.9457466667

In [None]:
# The cell previously held a dangling `forest_azimuth.` (an unfinished
# attribute access, which is a SyntaxError).  Display the fitted estimator
# instead so the cell executes; replace with the intended attribute if needed.
forest_azimuth

In [None]:
# Label the tree's feature importances with the feature names.  `ds_3` is not
# defined anywhere in this notebook; the features are every column of `train`
# except the last three, so take the names from there.
importances = pd.Series(regr.feature_importances_, index=train.columns[:-3])

In [None]:
# Lasso regression (alpha=0.1) for elevation, scored with MAE on the test set.
clf = Lasso(alpha=0.1).fit(X, y_e)

y_pred_e = clf.predict(X_test)
loss = mean_absolute_error(y_test_e, y_pred_e)

print(loss)

In [None]:
# Keep the fitted Lasso coefficient vector under a short name; it is plotted
# in the following cell.
coef = clf.coef_

In [None]:
# Plot each Lasso coefficient against its position in the coefficient vector
# and print the training frame's column labels for reference.
plt.figure(figsize=(7, 7))
plt.plot(np.arange(len(coef)), coef)
print(train.columns)
plt.title('Coefficients')
plt.show()

In [None]:
# Build dataframe to compare real values with predictions on test set.
# Both series are aligned on the test index, then joined column-wise with the
# last three columns of the test frame and written to CSV.
y_test = pd.Series(y_test_e, index=test.index)
y_pred = pd.Series(y_pred_e, index=test.index)
# `.ix` is deprecated (removed in pandas 1.0); the slice is positional, so
# `.iloc` is the explicit equivalent.
new_df = pd.concat([y_test, y_pred, test.iloc[:, -3:]], axis=1)
new_df.to_csv('results/lasso_all_features.csv')