## Approach:

* Prepare a naive submission based on the last observed value.
* Assign target variable to be difference between actual value of security at 5:00 P.M and last observed value.
* Only include those securities which are actually predictive in nature.
* Make sure first value for each of the days is 0.0
* If the change in the value of a security is greater than 40%, consider it a data error and divide by 100.

In [140]:
import pandas as pd
import numpy as np
import os

from __future__ import division

from sklearn.feature_selection import f_regression, SelectKBest
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
from scipy import stats

import warnings
warnings.filterwarnings('ignore')

PATH = '../data/data/'

In [104]:
# collect the paths of the per-day CSV files 1.csv ... 510.csv under PATH
files = []
for file_id in range(1, 511):
    files.append(os.path.join(PATH, str(file_id) + '.csv'))

# read every file lazily and stack the frames row-wise, in file order
frames = (pd.read_csv(csv_path) for csv_path in files)
data_df = pd.concat(frames)

In [105]:
# load target labels, indexed by FileId, and preview the first rows
target = pd.read_csv('../data/trainLabels.csv', index_col='FileId')
target.head()

Unnamed: 0_level_0,O1,O2,O3,O4,O5,O6,O7,O8,O9,O10,...,O189,O190,O191,O192,O193,O194,O195,O196,O197,O198
FileId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,2.53,1.03,0.12,0.01,0.07,0.88,1.3,1.82,2.13,1.34,...,-0.65,1.58,1.75,-0.1,0.04,0.6,2.04,3.69,3.56,2.03
2,-4.95,0.18,-0.24,-0.04,-0.49,0.68,0.8,0.64,0.92,0.58,...,0.14,0.64,0.47,0.56,0.47,1.34,1.06,1.22,-0.04,0.38
3,0.16,0.0,0.2,-0.2,-0.11,0.76,0.88,0.18,0.45,0.26,...,0.33,0.84,1.22,0.25,1.5,0.7,1.79,-0.04,1.3,1.61
4,4.43,0.06,-0.2,-0.45,-0.62,0.13,4.35,1.47,0.36,2.65,...,-0.27,-0.49,-0.99,0.75,0.3,0.47,-0.02,-0.79,-0.43,-0.53
5,0.76,0.11,0.12,0.08,-0.25,0.12,-0.44,0.6,0.11,0.1,...,0.2,0.48,0.24,-0.11,0.08,0.4,-0.51,1.85,-0.53,-0.57


In [106]:
# load the sample submission file; it is filled in with predictions further down
sub = pd.read_csv('../data/sampleSubmission.csv')

In [107]:
# first 200 days are training examples and the remaining 310 days form the test set
# 200 * 55 = 11,000 rows go into the training set and the rest (310 * 55 = 17,050) into the test set

# .copy() gives each split its own data, so the in-place edits made by the
# preprocessing helpers never depend on (or write through to) a view of data_df
train = data_df.iloc[:11000].copy()
test = data_df.iloc[11000:].copy()

In [108]:
# sanity-check the split sizes: 200 * 55 training rows and 310 * 55 test rows
assert len(train) == 11000
assert len(test) == 17050

## Error Metric

In [75]:
def mae(y_true, y_pred):
    """
    Mean absolute error between two 2-D collections of the same shape.

    The absolute errors are averaged along axis 1 (per row) first and the
    per-row averages are then averaged into a single number.
    """
    abs_errors = abs(y_true - y_pred)
    per_row_error = abs_errors.mean(axis=1)
    return per_row_error.mean()

## Preprocessing

* Make sure first value of each security every day should be zero.
* If the change is greater than 40%, consider it to be a data error and divide it by 100.

In [109]:
def sanity_check(df, rows_per_day=55, n_securities=198):
    """
    Assigns zero to the first value of every security for each day.

    Parameters:
    df - Data Frame whose rows are consecutive days of `rows_per_day`
         observations each; it is modified in place
    rows_per_day - number of rows that make up one day (default 55)
    n_securities - number of leading columns to reset (default 198)

    Returns a DataFrame built from the modified `df`.
    """
    # positions of the first row of each day: 0, rows_per_day, 2 * rows_per_day, ...
    first_rows = np.arange(0, len(df), rows_per_day)

    # a scalar broadcasts over the selection, so frames with fewer than
    # `n_securities` columns are handled as well
    df.iloc[first_rows, 0:n_securities] = 0.0

    return pd.DataFrame(df)

def correct_data(df, target):
    """
    Look for relative change in values of securities and see if the respective change is more than
    40%, if so then consider this is data error and divide it by 100

    Parameters:
    df - Data Frame of observations; only its first 198 columns are inspected
    target - Data Frame of labels with a column for every security found in `df`

    A security is flagged when any of its values in `df` has an absolute
    value above 40. For every flagged security the whole column is divided by
    100 in both `df` and `target`. Both frames are modified in place and
    returned as a `(df, target)` tuple.
    """
    # inspect the frame that was passed in, not a module-level global
    mask = df.iloc[:, 0:198].abs() > 40
    per_security = mask.any()
    securities_with_error = per_security[per_security].index

    for security in securities_with_error:
        df[security] /= 100
        target[security] /= 100

    return df, target

In [110]:
# zero out the first row of every day in both splits
train = sanity_check(train)
test = sanity_check(test)

# rescale securities flagged as data errors; only the training frame and its labels are passed here
train, target = correct_data(train, target)

## Naive Submission - Last observed value

In [115]:
class NaiveSubmission(object):
    """
    Last observed value

    Baseline predictor: for every day, the prediction for a security is the
    mean of its last `n` observations of that day (with the default n=1 this
    is simply the last observed value).

    The frame is read as consecutive blocks of 55 rows, one block per day, and
    only the first 198 columns are used.
    """

    # layout of the input frame
    _ROWS_PER_DAY = 55
    _NUM_SECURITIES = 198

    def __init__(self, df, n=1):
        """
        Parameters:
        df - Data Frame
        n - number of observations to consider
        """
        # work on a private copy so the caller's frame is never touched
        self._df = df.copy()
        self._n = n

    @property
    def opt(self):
        """
        Number of trailing observations per day used by `predict`
        """
        return self._n

    @opt.setter
    def opt(self, n):
        """
        Set the number of observations to consider

        Raises ValueError when `n` is None.
        """

        if n is None:
            raise ValueError('Number of observations must be defined')

        self._n = n

    def predict(self):
        """
        Return predictions based on the number of observations

        Shape of the predictions is (number of complete days, 198),
        e.g. (200, 198) for the 11,000-row training frame. Trailing rows that
        do not fill a whole day are ignored.
        """

        rows_per_day = self._ROWS_PER_DAY
        securities = self._df.iloc[:, 0:self._NUM_SECURITIES]
        n = self._n
        num_days = len(securities) // rows_per_day
        predictions = []

        for day in range(num_days):
            start_index = day * rows_per_day
            end_index = start_index + rows_per_day
            day_activity = securities.iloc[start_index:end_index]

            # column-wise mean over the last n rows of the day
            pred_for_day = day_activity.iloc[-n:].mean()
            predictions.append(pred_for_day)

        return np.array(predictions)

In [116]:
# baseline on the training days with the default n=1 (last observation only)
ns = NaiveSubmission(train)

In [117]:
predictions = ns.predict()
# parenthesised single-argument form: prints the same text on Python 2 and is valid Python 3
print('Mean Absolute Error %f ' % mae(target, predictions))

Mean Absolute Error 0.297656 


### Difference between actual value and last observed value

In [118]:
# what the naive baseline misses: true label minus last observed value
diff_in_predictions = target - predictions

## Predictive abilities of the securities

In [25]:
# pairwise correlation between the first 198 columns, skipping only the very first row
train.iloc[1:, 0:198].corr()

Unnamed: 0,O1,O2,O3,O4,O5,O6,O7,O8,O9,O10,...,O189,O190,O191,O192,O193,O194,O195,O196,O197,O198
O1,1.000000,0.104715,0.208214,-0.059294,0.604999,0.285506,0.236073,0.443441,0.364449,0.374152,...,0.114176,0.294439,0.250906,0.220975,0.149453,0.238251,0.245113,0.536275,0.446256,0.245203
O2,0.104715,1.000000,0.343565,-0.204911,0.391549,0.521593,0.352200,0.487506,0.514865,0.390155,...,0.186550,0.485682,0.512819,0.418700,0.449778,0.470558,0.502948,0.436793,0.555067,0.464348
O3,0.208214,0.343565,1.000000,-0.127909,0.282714,0.427040,0.259615,0.435510,0.443529,0.393838,...,0.111215,0.396221,0.377650,0.342393,0.245453,0.395566,0.393303,0.375277,0.375593,0.364567
O4,-0.059294,-0.204911,-0.127909,1.000000,-0.248608,-0.474834,-0.329986,-0.368577,-0.441037,-0.284966,...,-0.173623,-0.471931,-0.489094,-0.334746,-0.217183,-0.437315,-0.459440,-0.371453,-0.384011,-0.444860
O5,0.604999,0.391549,0.282714,-0.248608,1.000000,0.606353,0.404825,0.697434,0.630881,0.601869,...,0.284006,0.627026,0.567554,0.409822,0.306361,0.503357,0.535747,0.736329,0.757698,0.503094
O6,0.285506,0.521593,0.427040,-0.474834,0.606353,1.000000,0.573285,0.828873,0.871899,0.654874,...,0.312995,0.942340,0.903097,0.820692,0.656518,0.892085,0.902489,0.724335,0.783556,0.814745
O7,0.236073,0.352200,0.259615,-0.329986,0.404825,0.573285,1.000000,0.601552,0.663474,0.519754,...,0.244215,0.550476,0.524330,0.515567,0.367187,0.537080,0.535205,0.505030,0.506728,0.468910
O8,0.443441,0.487506,0.435510,-0.368577,0.697434,0.828873,0.601552,1.000000,0.886114,0.785153,...,0.377129,0.850931,0.787725,0.608076,0.498718,0.731950,0.767328,0.816282,0.803117,0.695538
O9,0.364449,0.514865,0.443529,-0.441037,0.630881,0.871899,0.663474,0.886114,1.000000,0.710301,...,0.349030,0.864199,0.817174,0.693618,0.537461,0.791674,0.821761,0.773844,0.793718,0.754145
O10,0.374152,0.390155,0.393838,-0.284966,0.601869,0.654874,0.519754,0.785153,0.710301,1.000000,...,0.347423,0.651969,0.609986,0.518117,0.393282,0.572797,0.617040,0.639512,0.633670,0.546116


**Some of the securities are highly correlated**

In [209]:
# features: the naive (last observed) predictions on the training days, one column per security
X = pd.DataFrame(predictions)
# regression target: the part of the label the naive prediction missed
y = diff_in_predictions

In [210]:
def get_p_value(X, y_true, y_pred, coef_):
    """
    Two-sided p-values of regression coefficients from a t-test.

    Parameters:
    X - feature matrix of shape (n_samples, n_features)
    y_true - observed targets, one per row of X
    y_pred - fitted values for the same rows
    coef_ - fitted coefficients, one per column of X

    Returns an array with one p-value per coefficient.

    NOTE(review): the standard errors come from inv(X'X) of X exactly as
    passed in and the degrees of freedom are n_samples - n_features, i.e. no
    intercept column is accounted for - confirm this matches how the model
    was fitted.
    """
    dof = X.shape[0] - X.shape[1]

    # residual variance estimate: sum of squared residuals / degrees of freedom
    residual_variance = np.sum((y_pred - y_true) ** 2, axis=0) / float(dof)
    se = np.sqrt(np.diagonal(residual_variance * np.linalg.inv(np.dot(X.T, X))))

    t = coef_ / se
    # survival function instead of 1 - cdf: keeps precision for large |t|,
    # where 1 - cdf rounds to exactly 0.0
    p = 2 * stats.t.sf(np.abs(t), dof)
    return p

In [313]:
def predict_(X, y, naive_submission, sub):
    """
    Adjust the naive predictions of the securities whose regression is significant.

    For every security a one-feature linear regression of `y` on the matching
    column of `X` is fitted. When the p-value of its coefficient is below 0.04
    the security's column in `sub` is overwritten with
    naive + 0.5 * coefficient * naive; all other columns are left untouched.

    Parameters:

    X - feature matrix, columns labelled 0 .. n-1 (one per security)
    y - target, columns labelled 'O1' .. 'On'
    naive_submission - last observed values
    sub - submission data frame which will hold final predictions; column 0 is
          skipped and security i is written to column i. Modified in place.

    Returns `sub`.
    """

    for i in range(1, X.shape[1] + 1):
        security = 'O' + str(i)

        # go through the underlying ndarray: pandas Series has no reshape()
        x = X[i - 1].values.reshape(-1, 1)

        model = LinearRegression()
        model.fit(x, y[security])
        preds = model.predict(x)

        pvalues = get_p_value(x, y[security].values, preds, model.coef_)

        if pvalues[0] < 0.04:
            last_observed = naive_submission.iloc[:, (i - 1)]
            # only half of the fitted slope correction is applied
            sub.iloc[:, i] = last_observed + (model.coef_[0] * last_observed) * 0.5

    return sub

In [314]:
# naive (last observed value) predictions for the test days
ns_test = NaiveSubmission(test)
test_preds = ns_test.predict()

In [315]:
# use naive prediction ( last observed value ) as baseline
# columns 1..198 hold the securities; column 0 of the submission frame is left as loaded
sub.iloc[:, 1:199] = test_preds

In [316]:
# adjust the baseline where the fit is significant; predict_ writes into `sub` and returns it
submission = predict_(X, y, pd.DataFrame(test_preds), sub)

In [357]:
# scores 0.42540 on the leaderboard
# write the final predictions to disk without the row index
submission.to_csv('../submissions/elegant.csv', index=False)

**Looking forward to comments and suggestions**