In [1]:
import subprocess
import numpy as np
import pandas as pd
import sys
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
# Load the public feature table from the working directory.
X_public = pd.read_csv('./X_public.csv')

In [3]:
# Per-column summary statistics (count, mean, std, quantiles) as a quick sanity check of the load.
X_public.describe()

Unnamed: 0,Id,feature1,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9,feature10
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,500000.5,4.998562,5.002102,4.998774,5.000081,5.005065,4.999457,4.998022,5.000269,5.002964,4.997661
std,288675.278933,2.888117,2.886425,2.884831,2.887275,2.884997,2.886547,2.887187,2.886937,2.88529,2.885937
min,1.0,3e-06,7e-06,3e-06,1.4e-05,4.8e-05,2.9e-05,1.5e-05,2e-06,2.5e-05,1.1e-05
25%,250000.75,2.49704,2.504117,2.503345,2.498478,2.509999,2.49998,2.495439,2.497291,2.505705,2.497706
50%,500000.5,4.997869,5.000204,4.996783,5.00106,5.006477,4.998965,4.995821,5.002918,5.005597,4.998946
75%,750000.25,7.500603,7.501698,7.493512,7.502291,7.500534,7.501806,7.502213,7.501117,7.498533,7.49539
max,1000000.0,10.0,9.999978,9.99999,9.999996,9.999988,9.999989,9.999997,9.99997,9.999985,9.999995


In [4]:
# Drop the first column (Id) and keep the remaining feature columns as a NumPy array.
# .values is used instead of DataFrame.as_matrix(), which is deprecated and
# removed in pandas 1.0; both return the same ndarray.
X_data = X_public.iloc[:,1:].values

In [6]:
def expert(X, oracle_cmd='./Oracle.static', warmup=500, batch=300):
    """Label every row of X by running the external oracle once per row.

    Each row is passed to the oracle as space-separated command-line
    arguments and the oracle's output is parsed as a single float.
    Processes are run concurrently in a sliding window: `warmup` processes
    are started up front, then for every further chunk of `batch` rows the
    chunk is launched and the `batch` oldest processes are collected.

    Parameters
    ----------
    X : sequence of rows (e.g. 2-D array); each row is an iterable of values.
    oracle_cmd : shell command that the row values are appended to.
    warmup : number of processes started before any result is collected.
    batch : number of rows launched / collected per step (must be >= 1).

    Returns
    -------
    numpy.ndarray with one float per row of X, in row order.

    Raises
    ------
    ValueError if the oracle prints something that is not a number.
    """
    def launch(row):
        # stderr is merged into stdout, so an oracle failure shows up in the
        # text that is parsed below instead of being lost.
        return subprocess.Popen(oracle_cmd + ' ' + ' '.join(map(str, row)),
                                stdout=subprocess.PIPE, stderr=subprocess.STDOUT,
                                shell=True, executable='/bin/bash')

    def collect(proc):
        raw = proc.communicate()[0]
        try:
            # strip() instead of dropping the last character: output without a
            # trailing newline must not lose its final digit.
            return float(raw.strip())
        except ValueError:
            raise ValueError('oracle returned non-numeric output: {!r}'.format(raw))

    print('Querying...')
    total = len(X)
    # Never start more warm-up processes than there are rows.
    cur = min(warmup, total)
    processes = [launch(X[i]) for i in range(cur)]
    answers = []
    m = total - cur
    n_batches = (m + batch - 1) // batch
    for k in range(n_batches):
        slic = X[cur:cur + batch]
        sys.stdout.write('\r{:.4f}:\t{:5}/{}'.format(float(k * batch + len(slic)) / m, k + 1, n_batches))
        processes.extend([launch(row) for row in slic])
        cur += len(slic)
        # Collect the oldest processes first so answers stay in row order.
        answers.extend([collect(p) for p in processes[:batch]])
        processes = processes[batch:]
    answers.extend([collect(p) for p in processes])
    print('\nAnswered.')
    return np.array(answers)

In [27]:
# Load previously saved target values; they are passed as Ytest to al.fit() below.
# NOTE(review): presumably the oracle's answers for every row of X_public, in row order — confirm.
answers = np.load('./answers_public_1KK.npy')

In [28]:
class ActiveLearning:
    """Committee-based active learner for regression.

    A committee of `n_models` regressors is trained on random bags of the
    labeled set. In every iteration the unlabeled rows on which the committee
    members disagree most (highest prediction variance) are sent to the
    expert for labeling and moved into the labeled set.

    Attributes
    ----------
    models : list of committee members, each built as `model(**kwargs)`.
    warm_start : if True, a later `fit` call keeps the labeled set gathered so far.
    metrics : RMSE scores recorded by `fit`, one per evaluated iteration.
    inited : whether an initial labeled set has been drawn.
    """
    def __init__(self, n_models = 10, warm_start = False, model = GradientBoostingRegressor, **kwargs):
        self.models = [model(**kwargs) for _ in range(n_models)]
        self.warm_start = warm_start
        self.metrics = []
        self.inited = False

    def rmse(self, y_pred, y_true):
        """Return the root mean squared error between predictions and targets."""
        # mean_squared_error is documented as (y_true, y_pred); the value is symmetric.
        return sqrt(mean_squared_error(y_true, y_pred))

    def _fit_bagged(self, model, bag):
        """Fit one committee member on a random bag (drawn with replacement) of the labeled set."""
        n_labeled = self.X.shape[0]
        # At least one row, so a tiny labeled set never produces an empty bag.
        rc = np.random.choice(n_labeled, max(1, int(bag * n_labeled)))
        model.fit(self.X[rc], self.y[rc])

    def fit(self, X_unlabeled, expert, iterations = 10, bag = 0.6, add_size = 10000, init_size = 10000,
            Xtest=None, Ytest=None):
        """Run the active-learning loop and leave the committee fitted on all gathered labels.

        Parameters
        ----------
        X_unlabeled : 2-D NumPy array of candidate rows.
        expert : callable taking an array of rows and returning their labels as a 1-D array.
        iterations : maximum number of query rounds.
        bag : fraction of the labeled set drawn for each committee member.
        add_size : number of rows queried per round.
        init_size : number of rows labeled up front when starting from scratch.
        Xtest, Ytest : optional evaluation set; when both are given, the RMSE of
            the committee is printed and appended to `self.metrics` every round.

        Notes
        -----
        With `warm_start=True`, a repeated call keeps `self.X` / `self.y` but uses
        the `X_unlabeled` passed in as-is; rows labeled in earlier calls are not
        removed from it.
        """
        if (not self.warm_start) or (not self.inited):
            self.metrics = []
            # Sample without replacement: duplicates would waste expert queries
            # and put repeated rows into the labeled set.
            n_init = min(init_size, X_unlabeled.shape[0])
            rc = np.random.choice(X_unlabeled.shape[0], n_init, replace=False)
            self.X = X_unlabeled[rc]
            self.y = expert(self.X)
            X_unlabeled = np.delete(X_unlabeled, rc, axis=0)
            self.inited = True
            self.iterations = 0
        for i in range(iterations):
            if X_unlabeled.shape[0] == 0:
                # Nothing left to query; predicting on an empty array would fail.
                print('Unlabeled pool exhausted, stopping early.')
                break
            predictions = []
            for k, model in enumerate(self.models):
                print('Fitting {} of {}...'.format(k+1, len(self.models)))
                self._fit_bagged(model, bag)
                predictions.append(model.predict(X_unlabeled))
            # Committee disagreement per candidate row; query the most contested ones.
            var = np.var(predictions, axis=0)
            srt = var.argsort()[::-1][:add_size]
            self.X = np.vstack([self.X, X_unlabeled[srt]])
            self.y = np.hstack([self.y, expert(X_unlabeled[srt])])
            X_unlabeled = np.delete(X_unlabeled, srt, axis=0)
            if not ((Xtest is None) or (Ytest is None)):
                # Scored with the committee as fitted before this round's new labels.
                score = self.rmse(self.predict(Xtest), Ytest)
                self.metrics.append(score)
                print('Iteration {0:3}: score {1:.4f}'.format(self.iterations, score))
            self.iterations += 1
        # Final refit so the committee also sees the labels added in the last round.
        for k, model in enumerate(self.models):
            print('Final fitting {} of {}...'.format(k+1, len(self.models)))
            self._fit_bagged(model, bag)

    def predict(self, X):
        """Return the mean of the committee members' predictions for X."""
        predictions = [model.predict(X) for model in self.models]
        return np.mean(predictions, axis = 0)

In [29]:
# Committee of 25 models; n_estimators and max_depth are forwarded to each model's constructor.
# warm_start=True makes later al.fit() calls reuse the labeled set gathered so far.
al = ActiveLearning(n_models = 25, warm_start=True, n_estimators=1000, max_depth=3)

In [None]:
# Run the active-learning loop; each iteration prints the RMSE of the committee on (X_data, answers).
# NOTE(review): init_size + iterations*add_size = 1,010,000 requested labels, which exceeds the
# 1,000,000 rows reported by X_public.describe() — the unlabeled pool runs out near the last iteration.
al.fit(X_data, expert, iterations=100, add_size=10000, bag=0.6, init_size=10000, Xtest=X_data, Ytest=answers)

Querying...
1.0000:	        32/        32

Answered.
Fitting 1 of 25...
Fitting 2 of 25...
Fitting 3 of 25...
Fitting 4 of 25...
Fitting 5 of 25...
Fitting 6 of 25...
Fitting 7 of 25...
Fitting 8 of 25...
Fitting 9 of 25...
Fitting 10 of 25...
Fitting 11 of 25...
Fitting 12 of 25...
Fitting 13 of 25...
Fitting 14 of 25...
Fitting 15 of 25...
Fitting 16 of 25...
Fitting 17 of 25...
Fitting 18 of 25...
Fitting 19 of 25...
Fitting 20 of 25...
Fitting 21 of 25...
Fitting 22 of 25...
Fitting 23 of 25...
Fitting 24 of 25...
Fitting 25 of 25...
Querying...
1.0000:	        32/        32

Answered.
Iteration   0: score 876859542760283410624902193132543127672512839680.0000
Fitting 1 of 25...
Fitting 2 of 25...
Fitting 3 of 25...
Fitting 4 of 25...
Fitting 5 of 25...
Fitting 6 of 25...
Fitting 7 of 25...
Fitting 8 of 25...
Fitting 9 of 25...
Fitting 10 of 25...
Fitting 11 of 25...
Fitting 12 of 25...
Fitting 13 of 25...
Fitting 14 of 25...
Fitting 15 of 25...
Fitting 16 of 25...
Fitting 17 of 25.

In [None]:
# NOTE(review): looks like a leftover scratch/debug cell — consider removing it.
print 1

In [None]:
# Load the private feature table that the submission is predicted for.
X_private = pd.read_csv('./X_private.csv')

In [None]:
# Drop the first column and keep the rest as a NumPy array.
# NOTE(review): presumably the first column is Id, as in X_public — confirm.
# .values is used instead of DataFrame.as_matrix(), which is deprecated and
# removed in pandas 1.0; both return the same ndarray.
X_data_private = X_private.iloc[:,1:].values

In [None]:
# Committee-mean prediction for every private row.
predicted = al.predict(X_data_private)

In [None]:
# Write the submission: a header line, then one "Id,Target" row per
# prediction, with Ids counted from 1 in prediction order.
with open('./submission4', 'w') as f:
    f.write('Id,Target\n')
    f.writelines('{},{}\n'.format(row_id, target)
                 for row_id, target in enumerate(predicted, 1))