# Learning and Intelligent Systems - Spring 2015

## Project 1: Regression

In [12]:
import numpy as np
import csv
import datetime

In [13]:
def get_features(row):
    """Turn one raw CSV record into a numeric feature vector.

    Parameters
    ----------
    row : sequence
        ``row[0]`` is a timestamp string formatted as
        ``'%Y-%m-%d %H:%M:%S'``; ``row[1]`` .. ``row[6]`` are passed
        through unchanged as additional features.

    Returns
    -------
    numpy.ndarray
        1-D float array with 11 entries: year, month, day, weekday
        (Monday == 0), hour, followed by ``row[1]`` .. ``row[6]``.

    Raises
    ------
    ValueError
        If the timestamp does not match the expected format, or a
        pass-through column cannot be converted to ``float``.
    IndexError
        If ``row`` has fewer than seven entries.
    """
    t = datetime.datetime.strptime(row[0], '%Y-%m-%d %H:%M:%S')
    features = [t.year, t.month, t.day, t.weekday(), t.hour,
                row[1], row[2], row[3], row[4], row[5], row[6]]
    # Build the vector directly.  The previous implementation concatenated an
    # empty list with `features` along axis=1, which is out of bounds for 1-D
    # inputs (AxisError on current NumPy) and produced a string array that had
    # to be re-cast by every caller.
    return np.array([float(value) for value in features])

def read_data(inpath):
    """Load a comma-separated file and build one feature vector per record.

    Parameters
    ----------
    inpath : str
        Path of the CSV file to read.

    Returns
    -------
    numpy.ndarray
        The per-record outputs of ``get_features`` stacked and promoted to
        at least two dimensions by ``np.atleast_2d``.
    """
    with open(inpath, 'r') as handle:
        records = csv.reader(handle, delimiter=',')
        feature_rows = [get_features(record) for record in records]
    return np.atleast_2d(feature_rows)

In [14]:
# Load the training features and the matching targets.
X = read_data('train.csv')
Y = np.genfromtxt('train_y.csv', delimiter=',')

# Cast with the builtin `float`: the `np.float` alias (which was just the
# builtin) was deprecated in NumPy 1.20 and removed in NumPy 1.24.
X = np.array(X).astype(float)
Y = np.array(Y).astype(float)

# The model is trained on log(1 + Y); the prediction cells map the model
# output back with exp(.) - 1.
log_y = np.log(1+Y)

In [15]:
def logscore(gtruth, pred):
    """Root-mean-squared error between log(1 + gtruth) and log(1 + pred).

    Predictions are clipped to be non-negative before the logarithm is
    taken.

    Parameters
    ----------
    gtruth : numpy.ndarray
        Ground-truth values on the original scale.
    pred : array-like
        Predicted values on the original scale.

    Returns
    -------
    float
        The root of the mean squared log difference.
    """
    nonneg_pred = np.clip(pred, 0, np.inf)
    log_residual = np.log(1 + gtruth) - np.log(1 + nonneg_pred)
    mean_squared = np.mean(log_residual ** 2)
    return np.sqrt(mean_squared)


In [31]:
def logscore2(gtruth, pred):
    """Root-mean-squared error between two arrays, with no transformation.

    Parameters
    ----------
    gtruth : numpy.ndarray
        Reference values.
    pred : numpy.ndarray
        Predicted values, broadcast-compatible with ``gtruth``.

    Returns
    -------
    float
        ``sqrt(mean((gtruth - pred) ** 2))``.
    """
    residual = gtruth - pred
    mean_squared = np.mean(residual * residual)
    return np.sqrt(mean_squared)

In [27]:
from operator import itemgetter

def report(random_search, n_top=3):
    """Print the best-scoring candidates of a fitted parameter search.

    Parameters
    ----------
    random_search : object
        Fitted search object exposing ``grid_scores_``; each entry is
        indexed at position 1 for ranking and must provide the attributes
        ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters``.
    n_top : int, optional
        Number of candidates to print (default 3).

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Highest score first; position 1 of each entry is the ranking key.
    ranked = list(random_search.grid_scores_)
    ranked.sort(key=lambda entry: entry[1], reverse=True)

    for rank, entry in enumerate(ranked[:n_top], start=1):
        spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")


In [35]:
import sklearn.cross_validation as skcv
import sklearn.metrics as skmet
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.grid_search import RandomizedSearchCV
from scipy.stats import randint as sp_randint
from time import time


# Fixed seed for every stochastic component below, so that a fresh
# "Restart Kernel -> Run All" samples the same candidates and fits the
# same models.
SEED = 0

# NOTE(review): StratifiedKFold stratifies on class labels, while log_y looks
# like a continuous regression target -- a plain KFold is probably what is
# wanted here.  Left unchanged so the fold assignment stays the same; confirm.
cross_validation_object = skcv.StratifiedKFold(log_y, n_folds = 5)

# logscore2 is a loss (lower is better).  greater_is_better=False makes the
# scorer sign-flip it, so the search maximises the negated RMSE exactly as the
# former hand-written negating lambda did, without an unpicklable lambda.
neg_scorefun = skmet.make_scorer(logscore2, greater_is_better=False)

scaler = MinMaxScaler()
regressor_GBR = GradientBoostingRegressor(random_state=SEED)

# Scale the features, then fit the gradient-boosting regressor.
pipeline_object = Pipeline([('scaler', scaler),('model', regressor_GBR)])


n_features = X.shape[1]

# Search space; the "model__" prefix routes each entry to the pipeline's
# 'model' step.  max_depth is drawn uniformly from 1 .. n_features - 1.
param_dist = {"model__learning_rate": [ 0.03, 0.1],
              "model__max_depth": sp_randint(1, n_features),
              "model__n_estimators" : [300, 500]}

n_iter_search = 20
random_search_gbr = RandomizedSearchCV(pipeline_object, param_distributions=param_dist,
                                   n_iter=n_iter_search, cv = cross_validation_object,
                                   scoring=neg_scorefun, n_jobs=1, verbose=2,
                                   random_state=SEED)

# The model is fitted on the log-transformed target log_y.
start = time()
random_search_gbr.fit(X, log_y)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search_gbr)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV] model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 
[CV]  model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 -  11.4s
[CV] model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 
[CV]  model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 -  13.2s
[CV] model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 
[CV]  model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 -  15.2s
[CV] model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 
[CV]  model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 -  14.4s
[CV] model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 
[CV]  model__max_depth=8, model__learning_rate=0.1, model__n_estimators=300 -  14.9s
[CV] model__max_depth=10, model__learning_rate=0.1, model__n_estimators=500 
[CV]  model__max_depth=10, model__learning_rate=0.1, mod

[Parallel(n_jobs=1)]: Done   1 jobs       | elapsed:   11.4s
[Parallel(n_jobs=1)]: Done  41 jobs       | elapsed: 15.5min



[CV] model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 
[CV]  model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 -   5.7s
[CV] model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 
[CV]  model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 -   6.0s
[CV] model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 
[CV]  model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 -   5.9s
[CV] model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 
[CV]  model__max_depth=5, model__learning_rate=0.03, model__n_estimators=300 -   6.4s
[CV] model__max_depth=3, model__learning_rate=0.1, model__n_estimators=300 
[CV]  model__max_depth=3, model__learning_rate=0.1, model__n_estimators=300 -   2.6s
[CV] model__max_depth=3, model__learning_rate=0.1, model__n_estimators=300 
[CV]  model__max_depth=3, model__learning_rate=0.1, model__n_estimators=300 -   2.9s
[CV] model__max_depth=3, 

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 26.0min finished


In [36]:
# Build features for the validation set and predict with the best pipeline
# found by the randomized search.
Xval = read_data('validate.csv')
# Builtin `float` replaces the `np.float` alias, which was removed in
# NumPy 1.24 (it was only ever an alias of the builtin).
Xval = np.array(Xval).astype(float)
Ypred = random_search_gbr.best_estimator_.predict(Xval)
# The model was fitted on log(1 + Y); invert that to get back to the
# original target scale.
Ypred = np.exp(Ypred)-1
np.savetxt('result_validate-final.txt', Ypred)

In [38]:
# Build features for the test set and predict with the best pipeline found
# by the randomized search.
Xtest = read_data('test.csv')
# Builtin `float` replaces the `np.float` alias, which was removed in
# NumPy 1.24 (it was only ever an alias of the builtin).
Xtest = np.array(Xtest).astype(float)
Ytest = random_search_gbr.best_estimator_.predict(Xtest)
# The model was fitted on log(1 + Y); invert that to get back to the
# original target scale.
Ytest = np.exp(Ytest)-1
np.savetxt('result_test.txt', Ytest)