In [1]:
import numpy as np
import pandas as pd
from __future__ import division
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

### Functions

In [2]:
def mape(y,y_pred):
    """Return the mean absolute percentage error of ``y_pred`` against ``y``.

    Each element contributes ``|(y - y_pred) / y|``; the result is the plain
    mean of those ratios (a fraction, not multiplied by 100).  Zero entries
    in ``y`` are not special-cased.
    """
    relative_errors = (y - y_pred) / y
    return np.mean(np.abs(relative_errors))

def max_outlier(y,y_pred):
    """Locate the entries where the prediction is furthest from the target.

    Returns the tuple of index arrays produced by ``np.where`` for every
    position whose absolute deviation ``|y - y_pred|`` equals the largest
    deviation; tied positions are all reported.
    """
    abs_errors = np.abs(y - y_pred)
    worst_error = abs_errors.max()
    is_worst = abs_errors == worst_error
    return np.where(is_worst)

def get_clf(X,y,r2_threshold=0.99996):
    """Fit a linear regression, iteratively discarding the worst-fitted rows.

    The model is fitted on copies of ``X`` and ``y`` (the inputs are never
    modified).  While the R^2 score on the retained rows is not above
    ``r2_threshold``, the row(s) with the largest absolute residual are
    removed and the model is refitted.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target values, one per row of ``X``.
    r2_threshold : float, optional
        Trimming stops as soon as the training R^2 exceeds this value.
        Defaults to 0.99996, the value previously hard-coded here.

    Returns
    -------
    LinearRegression
        The estimator fitted on the rows that survived trimming.
    """
    nX = np.copy(X)
    ny = np.copy(y)
    clf = LinearRegression()
    clf.fit(nX,ny)
    # Predict once per fit and reuse the result for both the score and the
    # outlier search.
    fitted = clf.predict(nX)
    while r2_score(ny,fitted) <= r2_threshold:
        # First index array of np.where == row positions of the worst residuals.
        rows_for_delete = np.unique(max_outlier(ny,fitted)[0])
        # Stop instead of looping forever (nothing flagged) or fitting on an
        # empty data set (every remaining row ties for the largest residual).
        if rows_for_delete.size == 0 or rows_for_delete.size >= ny.shape[0]:
            break
        nX = np.delete(nX, rows_for_delete, axis=0)
        ny = np.delete(ny, rows_for_delete, axis=0)
        clf.fit(nX,ny)
        fitted = clf.predict(nX)
    return clf

### Main

In [3]:
# Read the training features, training targets and test features.
X_train = pd.read_csv('input/x_train.csv')
y_train = pd.read_csv('input/y_train.csv')
X_test  = pd.read_csv('input/x_test.csv')

# Per-row product m * k * n, taken while the frames still hold every column.
train_product = X_train['m'] * X_train['k'] * X_train['n']
test_product  = X_test['m'] * X_test['k'] * X_test['n']
train_mkn = np.array(train_product)
test_mkn  = np.array(test_product)

# The regression target is the measured time divided by m * k * n.
y_train = y_train['time'].values / train_mkn

# Keep only the columns at positions 76..249 and continue with plain arrays.
X_train = np.array(X_train.iloc[:, 76:250])
X_test  = np.array(X_test.iloc[:, 76:250])

# Quartile boundaries of m * k * n, taken from the training rows only.
p1, p2, p3 = (np.percentile(train_mkn, q) for q in (25, 50, 75))

# Output buffer: one prediction per test row, filled in bin by bin later.
predict = np.empty(X_test.shape[0])

# Row positions of the training samples falling in each quartile range.
train_a = np.flatnonzero(train_mkn < p1)
train_b = np.flatnonzero((train_mkn >= p1) & (train_mkn < p2))
train_c = np.flatnonzero((train_mkn >= p2) & (train_mkn < p3))
train_d = np.flatnonzero(train_mkn >= p3)

# Test rows are binned with the same (training-derived) boundaries.
test_a  = np.flatnonzero(test_mkn < p1)
test_b  = np.flatnonzero((test_mkn >= p1) & (test_mkn < p2))
test_c  = np.flatnonzero((test_mkn >= p2) & (test_mkn < p3))
test_d  = np.flatnonzero(test_mkn >= p3)

# Pair each training bin with the test bin covering the same m * k * n range.
# The index arrays are used directly; no eval() lookup by variable name.
train_list = [train_a, train_b, train_c, train_d]
test_list  = [test_a, test_b, test_c, test_d]

for (train_i,test_i) in zip(train_list,test_list):
    if test_i.size == 0:
        # No test rows fall in this bin, so there is nothing to fit or predict.
        continue
    clf = get_clf(X_train[train_i],y_train[train_i])
    # The model predicts time / (m * k * n); multiply back to get the time.
    # Only this bin's rows are predicted instead of the full test matrix.
    predict[test_i] = clf.predict(X_test[test_i]) * test_mkn[test_i]
    
# Overwrite every prediction whose magnitude exceeds 1000 with the value 1.
exceeds_limit = np.abs(predict) > 1000
predict[exceeds_limit] = 1

# Write one prediction per line in fixed-point notation.
np.savetxt('submission.csv', predict, fmt="%f")