In [1]:
# Basic imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime as dt
import scipy.optimize as spo
import sys

%matplotlib inline

# Set the default figure size through the already-imported `plt`.
# (`%pylab inline` is avoided: it star-imports numpy and matplotlib into the
# interactive namespace, which hides where names come from.)
plt.rcParams['figure.figsize'] = (20.0, 10.0)

%load_ext autoreload
%autoreload 2

sys.path.append('../../')

Populating the interactive namespace from numpy and matplotlib


In [2]:
data_df = pd.read_pickle('../../data/data_train_val_df.pkl')
print(data_df.shape)
data_df.head()

(5520, 2415)


feature,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1993-01-29,43.94,24.5,6.88,,,,,2.59,18.75,,...,,87800.0,7633602.0,1745196.0,,,,,33600.0,
1993-02-01,44.25,24.69,6.88,,,,,2.72,19.12,,...,,72400.0,3001200.0,3574800.0,,,,,32000.0,
1993-02-02,44.34,24.72,6.53,,,,,2.84,20.25,,...,,242200.0,1388598.0,2652396.0,,,,,251600.0,
1993-02-03,44.81,25.19,6.91,,,,,2.7,20.5,,...,,272200.0,1228200.0,5040396.0,,,,,254800.0,
1993-02-04,45.0,26.06,6.84,,,,,2.73,20.12,,...,,162800.0,1675602.0,7033200.0,,,,,317200.0,


## The first objective of this notebook is to implement the following function, which extracts sample intervals from the total period.

In [3]:
def generate_train_intervals(data_df, train_time, base_time, step, days_ahead, today):
    """Placeholder for the sample-interval extraction function; it does nothing yet."""

Let's define the parameters as constants, just to do some scratch work.

In [4]:
# Naming convention: every variable that is measured in "market days"
# carries "days" in its name. Variables that are measured in real
# (calendar) time are named more freely.

train_time = 365 # In real time days
base_days = 7 # In market days
step_days = 7 # market days
ahead_days = 1 # market days
today = data_df.index[-1] # Real date (the last date available in data_df)

In [5]:
today

Timestamp('2014-12-31 00:00:00')

### The number of samples to be generated would be approximately (train_time - base_time) * num_companies / step. There are days_ahead market days left at the end, used only for target values, so the total "used" period is train_time + days_ahead.
### Whether to train with all, one, or some companies can be decided by the user when passing in the data (just filter data_df to keep the companies you want). That said, one interesting option would be to allow training with multiple companies while targeting only one. That would multiply the features by the number of available companies, but would reduce the number of samples a lot. For now, I want to keep the complexity low, so I won't implement that idea yet. A many-to-many approach could also be implemented (the target would be the vector with all the companies' data). I will start with the simple "one to one" approach.

In [6]:
data_df.index[data_df.index <= today][-(ahead_days + 1)]

Timestamp('2014-12-30 00:00:00')

In [7]:
def add_market_days(base, delta, data_df):
    """
    Shift a date by a number of market days.

    base: a date (real time) that must be present in data_df.index.
    delta: shift expressed in market days, i.e. in positions of
           data_df.index (it may be negative).

    Returns the index label found `delta` positions away from `base`. When
    the shift runs past either end of the index, the last (or first) label
    is returned instead. Raises an Exception if `base` is not in the index.
    """
    calendar = data_df.index
    if base not in calendar:
        raise Exception('The base date is not in the market days list.')
    shifted = calendar.tolist().index(base) + delta
    # Clamp to the available range instead of running off either end.
    if shifted >= len(calendar):
        return calendar[-1]
    return calendar[0] if shifted < 0 else calendar[shifted]

In [8]:
# Remember the last target days are not used for training, but that is a "market days" period.
end_of_training_date = add_market_days(today, -ahead_days, data_df)
start_date = end_of_training_date - dt.timedelta(train_time) 
print('Start date: %s.  End of training date: %s.' % (start_date, end_of_training_date))

Start date: 2013-12-30 00:00:00.  End of training date: 2014-12-30 00:00:00.


In [9]:
TARGET_FEATURE = 'Close'

### One important thing to note: the base time is in "market days", that means that it doesn't represent a period of "real" time (the real time may vary with each base interval).

In [10]:
def print_period(data_df):
    """Print the first and the last label of data_df's index."""
    first_label = data_df.index[0]
    last_label = data_df.index[-1]
    print('Period: %s  to  %s.' % (first_label, last_label))

In [11]:
data_train_df = data_df[start_date:end_of_training_date]

print_period(data_train_df)
data_train_df.shape

Period: 2013-12-30 00:00:00  to  2014-12-30 00:00:00.


(253, 2415)

In [12]:
start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)
data_target_df = data_df.loc[start_target_date: today,TARGET_FEATURE]

print_period(data_target_df)
data_target_df.shape

Period: 2014-01-09 00:00:00  to  2014-12-31 00:00:00.


(247, 483)

Is that initial date correct?

In [13]:
data_train_df.index[:10]

DatetimeIndex(['2013-12-30', '2013-12-31', '2014-01-02', '2014-01-03',
               '2014-01-06', '2014-01-07', '2014-01-08', '2014-01-09',
               '2014-01-10', '2014-01-13'],
              dtype='datetime64[ns]', name='date', freq=None)

Ok, it looks so.

### Let's split now!

I should allow for different feature extraction functions to be used, after the time divisions.

In [14]:
date_base_ini = start_date
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
date_target = add_market_days(date_base_end, ahead_days, data_df)

sample_blob = (data_train_df[date_base_ini: date_base_end], pd.DataFrame(data_target_df.loc[date_target]))
sample_blob[0]

feature,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2013-12-30,183.82,139.42,38.41,53.01,82.06,17.8,109.71,59.55,3.85,109.63,...,1950055.0,1867094.0,3846744.0,1049165.0,561674.0,8678033.0,1953272.0,623173.0,895272.0,1803579.0
2013-12-31,184.69,140.25,38.33,52.81,82.22,17.83,109.32,59.88,3.87,110.68,...,731522.0,1752814.0,5359541.0,1215365.0,557969.0,8292761.0,2132916.0,649985.0,1077417.0,2270418.0
2014-01-02,182.92,138.13,38.23,51.98,81.13,18.07,107.65,59.29,3.95,109.74,...,3056065.0,3192314.0,10481405.0,3437019.0,765141.0,21514650.0,1956285.0,868763.0,1356738.0,2576112.0
2014-01-03,182.88,138.45,38.64,52.3,81.4,18.29,108.15,59.16,4.0,112.88,...,1169540.0,2939378.0,7282652.0,1982702.0,454495.0,15761243.0,1457058.0,1288207.0,1122452.0,2524947.0
2014-01-06,182.36,137.63,39.15,50.39,80.54,18.08,106.28,58.12,4.13,111.8,...,1289126.0,3382267.0,14906758.0,1970805.0,849360.0,12472724.0,2940835.0,1414955.0,1988180.0,2763350.0
2014-01-07,183.48,137.65,38.85,50.49,81.52,18.32,109.44,58.97,4.18,113.18,...,1688085.0,3481465.0,15383264.0,1581167.0,611127.0,14141112.0,3625927.0,1852572.0,1343169.0,2338176.0
2014-01-08,183.52,136.63,39.2,50.36,82.15,18.34,110.03,58.9,4.18,112.3,...,1406668.0,3563670.0,7833434.0,2318930.0,1234973.0,18657195.0,4448753.0,1880549.0,2034692.0,3965882.0


In [15]:
target = sample_blob[1].T
target

Unnamed: 0,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
2014-01-09,183.64,136.45,39.27,51.22,82.95,18.3,123.5,59.09,4.09,113.55,...,204.77,27.82,12.05,45.78,34.63,40.92,75.05,96.47,30.22,31.96


### Let's define a function that takes a "sample blob" and produces one sample per symbol, only for the "Close" feature (looks like the easiest to do first). The dates in the base period should be substituted by an index, and the symbols shuffled later (along with their labels).

In [16]:
feat_close = sample_blob[0][TARGET_FEATURE]
feat_close.index = np.arange(feat_close.shape[0])
feat_close

Unnamed: 0,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
0,183.82,139.42,38.41,53.01,82.06,17.8,109.71,59.55,3.85,109.63,...,193.29,27.92,12.15,45.64,34.58,40.2,74.92,93.2,29.85,32.67
1,184.69,140.25,38.33,52.81,82.22,17.83,109.32,59.88,3.87,110.68,...,194.21,27.94,12.17,45.92,34.6,40.44,75.61,93.19,29.96,32.69
2,182.92,138.13,38.23,51.98,81.13,18.07,107.65,59.29,3.95,109.74,...,197.94,27.51,11.91,45.97,34.16,39.59,75.09,92.24,29.65,32.36
3,182.88,138.45,38.64,52.3,81.4,18.29,108.15,59.16,4.0,112.88,...,196.0,27.5,11.99,45.62,34.47,40.12,75.56,92.64,29.86,32.05
4,182.36,137.63,39.15,50.39,80.54,18.08,106.28,58.12,4.13,111.8,...,195.86,27.35,12.09,45.42,34.41,39.93,75.5,93.24,29.65,31.98
5,183.48,137.65,38.85,50.49,81.52,18.32,109.44,58.97,4.18,113.18,...,201.51,27.82,12.19,45.52,34.51,40.92,76.56,95.1,29.74,32.1
6,183.52,136.63,39.2,50.36,82.15,18.34,110.03,58.9,4.18,112.3,...,205.29,27.71,12.08,45.91,34.49,41.02,76.53,97.43,30.0,31.74


In [17]:
target.index = ['target']
target

Unnamed: 0,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
target,183.64,136.45,39.27,51.22,82.95,18.3,123.5,59.09,4.09,113.55,...,204.77,27.82,12.05,45.78,34.63,40.92,75.05,96.47,30.22,31.96


In [18]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the target row
# under the feature rows in the same way.
x_y_samples = pd.concat([feat_close, target])
x_y_samples

Unnamed: 0,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
0,183.82,139.42,38.41,53.01,82.06,17.8,109.71,59.55,3.85,109.63,...,193.29,27.92,12.15,45.64,34.58,40.2,74.92,93.2,29.85,32.67
1,184.69,140.25,38.33,52.81,82.22,17.83,109.32,59.88,3.87,110.68,...,194.21,27.94,12.17,45.92,34.6,40.44,75.61,93.19,29.96,32.69
2,182.92,138.13,38.23,51.98,81.13,18.07,107.65,59.29,3.95,109.74,...,197.94,27.51,11.91,45.97,34.16,39.59,75.09,92.24,29.65,32.36
3,182.88,138.45,38.64,52.3,81.4,18.29,108.15,59.16,4.0,112.88,...,196.0,27.5,11.99,45.62,34.47,40.12,75.56,92.64,29.86,32.05
4,182.36,137.63,39.15,50.39,80.54,18.08,106.28,58.12,4.13,111.8,...,195.86,27.35,12.09,45.42,34.41,39.93,75.5,93.24,29.65,31.98
5,183.48,137.65,38.85,50.49,81.52,18.32,109.44,58.97,4.18,113.18,...,201.51,27.82,12.19,45.52,34.51,40.92,76.56,95.1,29.74,32.1
6,183.52,136.63,39.2,50.36,82.15,18.34,110.03,58.9,4.18,112.3,...,205.29,27.71,12.08,45.91,34.49,41.02,76.53,97.43,30.0,31.74
target,183.64,136.45,39.27,51.22,82.95,18.3,123.5,59.09,4.09,113.55,...,204.77,27.82,12.05,45.78,34.63,40.92,75.05,96.47,30.22,31.96


In [19]:
x_y_samples_shuffled = x_y_samples.T.sample(frac=1).reset_index(drop=True)
x_y_samples_shuffled.head()

Unnamed: 0,0,1,2,3,4,5,6,target
0,41.09,41.31,40.66,40.46,40.27,40.39,39.94,39.73
1,94.55,95.1,93.85,93.52,93.8,95.55,94.79,94.84
2,47.01,47.27,47.0,46.83,46.58,46.35,46.94,46.13
3,83.36,83.77,82.56,83.3,83.17,83.87,83.92,83.29
4,36.14,36.16,35.53,35.4,35.76,36.22,35.94,36.04


### It is important to take care of the NaN values. The sample_blob level is possibly a good point to do so: just discard the samples that contain missing values.

In [20]:
x_y_samples_shuffled.isnull().sum()

0         10
1         10
2         10
3         10
4         10
5         10
6         10
target    10
dtype: int64

In [21]:
x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
print(x_y_samples_filtered.shape)
x_y_samples_filtered.isnull().sum()

(473, 8)


0         0
1         0
2         0
3         0
4         0
5         0
6         0
target    0
dtype: int64

In [22]:
# At some point I will have to standardize those values... (not now, but just as a reminder...)
# NOTE(review): each row is divided by its own mean taken over ALL columns,
# which includes the 'target' column - TODO confirm that this is intended
# before reusing this normalization.

std_samples = x_y_samples_shuffled.apply(lambda x: x / np.mean(x), axis=1)
std_samples.head()

Unnamed: 0,0,1,2,3,4,5,6,target
0,1.015038,1.020472,1.004416,0.999475,0.994782,0.997746,0.98663,0.981442
1,1.000529,1.006349,0.993122,0.98963,0.992593,1.011111,1.003069,1.003598
2,1.005266,1.010826,1.005052,1.001417,0.996071,0.991152,1.003769,0.986448
3,0.99946,1.004376,0.989869,0.998741,0.997182,1.005575,1.006175,0.998621
4,1.00672,1.007277,0.989728,0.986107,0.996135,1.008949,1.001149,1.003935


In [23]:
features = std_samples.iloc[:,:-1]
features.head()

Unnamed: 0,0,1,2,3,4,5,6
0,1.015038,1.020472,1.004416,0.999475,0.994782,0.997746,0.98663
1,1.000529,1.006349,0.993122,0.98963,0.992593,1.011111,1.003069
2,1.005266,1.010826,1.005052,1.001417,0.996071,0.991152,1.003769
3,0.99946,1.004376,0.989869,0.998741,0.997182,1.005575,1.006175
4,1.00672,1.007277,0.989728,0.986107,0.996135,1.008949,1.001149


In [24]:
target = pd.DataFrame(std_samples.iloc[:,-1])
target.head()

Unnamed: 0,target
0,0.981442
1,1.003598
2,0.986448
3,0.998621
4,1.003935


### Let's create the samples divider function

In [25]:
TARGET_FEATURE = 'Close'


def feature_close_one_to_one(sample_blob, random_state=None):
    """
    Turn one "sample blob" into one (features, target) row per symbol, using
    only the TARGET_FEATURE values.

    sample_blob: tuple (base_df, target_df).
        base_df: data of the base period; selecting TARGET_FEATURE on its
                 columns must give a frame with one column per symbol.
        target_df: one-column frame, indexed by symbol, with the target
                   values.
    random_state: optional seed passed to DataFrame.sample, to make the
                  shuffle reproducible (default: not seeded).

    Returns a DataFrame with one row per symbol. Columns 0..n-1 hold the
    base period values (the dates are replaced by their position) and the
    last column, 'target', holds the target value. The rows are shuffled,
    re-indexed, and then every row that contains a NaN is dropped.
    """
    # reset_index returns a new frame, so the caller's data is never touched.
    feat_close = sample_blob[0][TARGET_FEATURE].reset_index(drop=True)
    target = sample_blob[1].T
    target.index = ['target']
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    x_y_samples = pd.concat([feat_close, target])
    x_y_samples_shuffled = (x_y_samples.T
                            .sample(frac=1, random_state=random_state)
                            .reset_index(drop=True))
    x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')

    return x_y_samples_filtered

In [26]:
print(feature_close_one_to_one(sample_blob).shape)
feature_close_one_to_one(sample_blob).head()

(473, 8)


Unnamed: 0,0,1,2,3,4,5,6,target
0,48.21,48.36,47.72,47.85,47.6,47.94,48.15,48.55
1,77.31,77.3,76.6,77.29,76.62,77.3,77.15,77.48
2,177.42,178.36,175.48,176.12,174.5,175.68,176.66,178.84
3,41.79,41.93,41.26,41.06,40.36,39.99,39.07,39.22
4,22.32,22.35,21.94,22.19,21.66,21.92,22.08,22.07


In [27]:
date_base_ini = start_date
date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
date_target = add_market_days(date_base_end, ahead_days, data_df)
# Collect the per-interval samples and concatenate them once after the loop:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a
# loop copies all the accumulated rows on every iteration.
feat_tgt_blobs = []

while date_base_end < end_of_training_date:
    sample_blob = (data_train_df[date_base_ini: date_base_end],
                   pd.DataFrame(data_target_df.loc[date_target]))
    feat_tgt_blob = feature_close_one_to_one(sample_blob) # TODO: Change for a generic function
    feat_tgt_blobs.append(feat_tgt_blob)
    
    date_base_ini = add_market_days(date_base_ini, step_days, data_df)
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    date_target = add_market_days(date_base_end, ahead_days, data_df)
    # print('Start: %s,  End:%s' % (date_base_ini, date_base_end))

feat_tgt_df = pd.concat(feat_tgt_blobs, ignore_index=True) if feat_tgt_blobs else pd.DataFrame()
feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)

X_df = feat_tgt_df.iloc[:,:-1]
y_df = pd.DataFrame(feat_tgt_df.iloc[:,-1])

In [28]:
print(X_df.shape)
X_df.head()

(17102, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,67.41,66.66,65.48,66.76,66.53,67.3,67.3
1,76.1,76.76,76.87,76.23,76.14,76.01,76.43
2,20.6,20.85,21.0,21.23,21.1,21.04,20.92
3,31.11,31.01,30.98,30.66,30.53,30.84,30.98
4,15.45,15.64,15.49,15.47,15.41,15.33,15.37


In [29]:
print(y_df.shape)
y_df.head()

(17102, 1)


Unnamed: 0,target
0,69.33
1,76.77
2,20.78
3,31.59
4,15.6


## So, I have everything to define the final function of this notebook

In [30]:
def generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, blob_fun):
    """
    Build shuffled training samples (X, y) from rolling base intervals.

    data_df: frame indexed by market date; selecting TARGET_FEATURE on its
             columns must give the values used as targets.
    train_time: length of the training period, in real (calendar) days.
    base_days: length of each base interval, in market days.
    step_days: shift between consecutive base intervals, in market days
               (must be at least 1).
    ahead_days: distance from the end of a base interval to its target
                date, in market days.
    today: last date to use; it must be present in data_df.index.
    blob_fun: callable that receives a (base_period_df, target_df) tuple and
              returns a frame of samples whose last column is the target.

    Returns (X_df, y_df): all the columns but the last one, and the last
    column as a one-column frame. The rows are shuffled and re-indexed.

    Raises ValueError if step_days < 1 (the interval would never advance).
    """
    if step_days < 1:
        raise ValueError('step_days must be at least 1.')

    end_of_training_date = add_market_days(today, -ahead_days, data_df)
    start_date = end_of_training_date - dt.timedelta(train_time)
    if start_date not in data_df.index:
        # Subtracting calendar days may land on a weekend or holiday, and
        # add_market_days rejects dates that are not market days: snap forward
        # to the first market day inside the training window.
        start_date = data_df.index[data_df.index >= start_date].min()
    start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)
    
    data_train_df = data_df[start_date:end_of_training_date]
    data_target_df = data_df.loc[start_target_date: today,TARGET_FEATURE]
    
    date_base_ini = start_date
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    date_target = add_market_days(date_base_end, ahead_days, data_df)
    # Collected and concatenated once: DataFrame.append was removed in
    # pandas 2.0 and copied all the accumulated rows on every iteration.
    feat_tgt_blobs = []

    # NOTE(review): the strict '<' skips a base interval that ends exactly on
    # end_of_training_date - confirm that this is intended.
    while date_base_end < end_of_training_date:
        sample_blob = (data_train_df[date_base_ini: date_base_end],
                       pd.DataFrame(data_target_df.loc[date_target]))
        feat_tgt_blobs.append(blob_fun(sample_blob))
        
        date_base_ini = add_market_days(date_base_ini, step_days, data_df)
        date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
        date_target = add_market_days(date_base_end, ahead_days, data_df)
    
    if feat_tgt_blobs:
        feat_tgt_df = pd.concat(feat_tgt_blobs, ignore_index=True)
    else:
        feat_tgt_df = pd.DataFrame()
    feat_tgt_df = feat_tgt_df.sample(frac=1).reset_index(drop=True)
    
    X_df = feat_tgt_df.iloc[:,:-1]
    y_df = pd.DataFrame(feat_tgt_df.iloc[:,-1])
    
    return X_df, y_df

In [31]:
train_time = 365 # In real time days
base_days = 7 # In market days
step_days = 7 # market days
ahead_days = 1 # market days
today = data_df.index[-1] # Real date

X, y = generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, feature_close_one_to_one)

In [32]:
print(X.shape)
X.head()

(17102, 7)


Unnamed: 0,0,1,2,3,4,5,6
0,45.34,44.59,43.41,44.52,44.53,44.74,45.18
1,173.89,173.29,175.06,174.57,175.77,175.9,176.58
2,16.95,17.03,17.05,17.18,16.85,17.03,17.01
3,15.25,14.96,14.55,14.87,14.73,14.85,14.97
4,36.5,36.76,36.89,37.47,37.73,38.34,38.29


In [33]:
print(y.shape)
y.head()

(17102, 1)


Unnamed: 0,target
0,43.26
1,177.82
2,17.05
3,14.84
4,39.17


In [34]:
%pwd

'/home/miguel/udacity/Machine Learning Nanodegree/projects/capstone/capstone/notebooks/dev'

In [35]:
sys.path.append('../../')
import predictor.feature_extraction as fe

## Let's try the function as it was saved in the package:

In [36]:
X, y = fe.generate_train_intervals(data_df, 
                                   train_time, 
                                   base_days, 
                                   step_days, 
                                   ahead_days, 
                                   today, 
                                   feature_close_one_to_one)

In [37]:
print(X.shape)
X.head()

(24606, 7)


Unnamed: 0,0,1,2,3,4,5,6
440,61.14,60.92,61.14,60.48,60.96,60.88,61.82
296,38.98,39.03,38.81,39.19,39.57,39.11,38.9
163,83.2,83.2,83.14,83.38,82.94,82.91,83.81
80,77.83,76.09,71.75,71.35,67.36,66.21,65.38
288,40.94,40.79,40.29,40.32,41.21,41.48,41.27


In [38]:
print(y.shape)
y.head()

(24606, 1)


Unnamed: 0,target
440,62.49
296,38.94
163,84.42
80,62.6
288,41.11


### Looks good

## Sometimes, it may be useful to keep the dates information...

In [39]:
x_y_samples

Unnamed: 0,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
0,183.82,139.42,38.41,53.01,82.06,17.8,109.71,59.55,3.85,109.63,...,193.29,27.92,12.15,45.64,34.58,40.2,74.92,93.2,29.85,32.67
1,184.69,140.25,38.33,52.81,82.22,17.83,109.32,59.88,3.87,110.68,...,194.21,27.94,12.17,45.92,34.6,40.44,75.61,93.19,29.96,32.69
2,182.92,138.13,38.23,51.98,81.13,18.07,107.65,59.29,3.95,109.74,...,197.94,27.51,11.91,45.97,34.16,39.59,75.09,92.24,29.65,32.36
3,182.88,138.45,38.64,52.3,81.4,18.29,108.15,59.16,4.0,112.88,...,196.0,27.5,11.99,45.62,34.47,40.12,75.56,92.64,29.86,32.05
4,182.36,137.63,39.15,50.39,80.54,18.08,106.28,58.12,4.13,111.8,...,195.86,27.35,12.09,45.42,34.41,39.93,75.5,93.24,29.65,31.98
5,183.48,137.65,38.85,50.49,81.52,18.32,109.44,58.97,4.18,113.18,...,201.51,27.82,12.19,45.52,34.51,40.92,76.56,95.1,29.74,32.1
6,183.52,136.63,39.2,50.36,82.15,18.34,110.03,58.9,4.18,112.3,...,205.29,27.71,12.08,45.91,34.49,41.02,76.53,97.43,30.0,31.74
target,183.64,136.45,39.27,51.22,82.95,18.3,123.5,59.09,4.09,113.55,...,204.77,27.82,12.05,45.78,34.63,40.92,75.05,96.47,30.22,31.96


In [40]:
target = sample_blob[1].T
feat_close = sample_blob[0][TARGET_FEATURE]
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the date index
# of both frames in the same way.
x_y_samples = pd.concat([feat_close, target])
x_y_samples

Unnamed: 0,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
2014-12-18,206.78,165.3,45.77,67.92,89.74,20.04,137.17,74.89,2.55,160.74,...,145.1,35.35,13.89,43.71,38.1,50.91,71.74,114.94,28.2,43.15
2014-12-19,206.52,165.48,46.05,67.71,90.51,20.25,137.38,74.45,2.57,159.85,...,150.11,35.4,13.96,43.0,38.59,50.88,71.17,114.91,28.39,43.51
2014-12-22,207.47,167.27,46.37,66.97,91.18,20.3,140.05,74.5,2.66,159.44,...,147.48,35.65,13.97,43.87,38.74,51.15,72.46,115.05,28.18,43.41
2014-12-23,207.75,166.87,45.72,64.35,91.42,20.23,139.48,74.38,2.67,161.19,...,151.69,35.43,14.07,43.91,38.72,50.02,72.68,113.47,28.53,42.97
2014-12-24,207.77,166.96,45.7,66.21,91.32,20.35,139.94,74.74,2.65,160.23,...,150.9,36.23,14.08,44.08,38.76,50.65,72.79,114.11,28.52,43.84
2014-12-26,208.44,166.26,45.85,66.98,91.26,20.42,139.66,74.67,2.65,160.0,...,150.37,36.58,14.14,43.92,38.95,50.86,73.14,114.17,28.56,44.2
2014-12-29,208.72,166.71,45.6,67.14,90.52,20.36,139.88,74.13,2.66,161.22,...,153.0,37.25,14.14,43.79,38.86,50.53,73.56,113.5,28.72,44.01
2014-12-30,207.6,165.84,45.69,66.3,90.19,20.24,141.2,73.08,2.63,160.03,...,151.2,36.38,14.01,43.55,38.43,51.22,73.28,114.57,28.67,43.35


In [41]:
x_y_samples.index = pd.MultiIndex.from_product([[x_y_samples.index[0]], np.arange(x_y_samples.shape[0])])
x_y_samples

Unnamed: 0,Unnamed: 1,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
2014-12-18,0,206.78,165.3,45.77,67.92,89.74,20.04,137.17,74.89,2.55,160.74,...,145.1,35.35,13.89,43.71,38.1,50.91,71.74,114.94,28.2,43.15
2014-12-18,1,206.52,165.48,46.05,67.71,90.51,20.25,137.38,74.45,2.57,159.85,...,150.11,35.4,13.96,43.0,38.59,50.88,71.17,114.91,28.39,43.51
2014-12-18,2,207.47,167.27,46.37,66.97,91.18,20.3,140.05,74.5,2.66,159.44,...,147.48,35.65,13.97,43.87,38.74,51.15,72.46,115.05,28.18,43.41
2014-12-18,3,207.75,166.87,45.72,64.35,91.42,20.23,139.48,74.38,2.67,161.19,...,151.69,35.43,14.07,43.91,38.72,50.02,72.68,113.47,28.53,42.97
2014-12-18,4,207.77,166.96,45.7,66.21,91.32,20.35,139.94,74.74,2.65,160.23,...,150.9,36.23,14.08,44.08,38.76,50.65,72.79,114.11,28.52,43.84
2014-12-18,5,208.44,166.26,45.85,66.98,91.26,20.42,139.66,74.67,2.65,160.0,...,150.37,36.58,14.14,43.92,38.95,50.86,73.14,114.17,28.56,44.2
2014-12-18,6,208.72,166.71,45.6,67.14,90.52,20.36,139.88,74.13,2.66,161.22,...,153.0,37.25,14.14,43.79,38.86,50.53,73.56,113.5,28.72,44.01
2014-12-18,7,207.6,165.84,45.69,66.3,90.19,20.24,141.2,73.08,2.63,160.03,...,151.2,36.38,14.01,43.55,38.43,51.22,73.28,114.57,28.67,43.35


### That would be the way to go: the timestamp of the first day of the base period works as a global timestamp for the base period.

In [42]:
x_y_samples.unstack().stack(0).sample(frac=1).reset_index(level=1, drop=True).head()

Unnamed: 0,0,1,2,3,4,5,6,7
2014-12-18,95.41,94.8,96.9,99.14,100.09,99.94,99.22,98.84
2014-12-18,139.82,139.83,141.4,141.31,141.54,141.39,140.67,139.63
2014-12-18,49.0,49.02,49.36,49.17,49.91,50.47,50.88,49.77
2014-12-18,54.1,54.01,53.96,54.42,54.54,54.43,54.73,54.84
2014-12-18,95.36,95.44,96.62,97.36,97.09,97.05,96.73,95.96


### Let's try the whole function, with shuffle (it's better to do it early, so that I won't forget later and get some artificial results), but keeping the index.

In [43]:
TARGET_FEATURE = 'Close'


def feature_close_one_to_one(sample_blob, random_state=None):
    """
    Turn one "sample blob" into one (features, target) row per symbol, using
    only the TARGET_FEATURE values and keeping the date information.

    sample_blob: tuple (base_df, target_df).
        base_df: data of the base period, indexed by date; selecting
                 TARGET_FEATURE on its columns must give a frame with one
                 column per symbol.
        target_df: one-column frame, indexed by symbol, whose column label
                   is the target date.
    random_state: optional seed passed to DataFrame.sample, to make the
                  shuffle reproducible (default: not seeded).

    Returns a DataFrame with one row per symbol. Every row is labelled with
    the first date of the base period, which works as a timestamp for the
    whole interval. Columns 0..n-1 hold the base period values and column n
    (the last one) holds the target value. The rows are shuffled and every
    row that contains a NaN is dropped.
    """
    target = sample_blob[1].T
    feat_close = sample_blob[0][TARGET_FEATURE]
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    x_y_samples = pd.concat([feat_close, target])
    first_date = x_y_samples.index[0]

    # One row per symbol: the dates become positional columns and the first
    # date of the interval becomes the (repeated) row label.
    samples = x_y_samples.T
    samples.columns = np.arange(samples.shape[1])
    samples.index = pd.Index([first_date] * samples.shape[0])

    x_y_samples_shuffled = samples.sample(frac=1, random_state=random_state)
    x_y_samples_filtered = x_y_samples_shuffled.dropna(axis=0, how='any')
    
    return x_y_samples_filtered

In [44]:
print(feature_close_one_to_one(sample_blob).shape)
feature_close_one_to_one(sample_blob).head()

(479, 8)


Unnamed: 0,0,1,2,3,4,5,6,7
2014-12-18,26.02,26.14,26.48,26.62,26.54,26.5,26.59,26.52
2014-12-18,511.1,516.35,524.87,530.59,528.77,534.03,530.33,530.42
2014-12-18,252.23,257.81,258.22,257.8,256.98,257.09,258.86,257.49
2014-12-18,89.54,90.1,90.24,90.85,90.26,90.24,90.14,89.46
2014-12-18,58.98,59.58,58.96,57.21,57.46,57.78,57.73,57.65


In [45]:
def generate_train_intervals(data_df, train_time, base_days, step_days, ahead_days, today, blob_fun):
    """
    Build shuffled training samples (X, y) from rolling base intervals,
    keeping the index that blob_fun gives to every sample.

    data_df: frame indexed by market date; selecting TARGET_FEATURE on its
             columns must give the values used as targets.
    train_time: length of the training period, in real (calendar) days.
    base_days: length of each base interval, in market days.
    step_days: shift between consecutive base intervals, in market days
               (must be at least 1).
    ahead_days: distance from the end of a base interval to its target
                date, in market days.
    today: last date to use; it must be present in data_df.index.
    blob_fun: callable that receives a (base_period_df, target_df) tuple and
              returns a frame of samples whose last column is the target.

    Returns (X_df, y_df): all the columns but the last one, and the last
    column as a one-column frame named 'target'. The rows are shuffled.

    Raises ValueError if step_days < 1 (the interval would never advance).
    """
    if step_days < 1:
        raise ValueError('step_days must be at least 1.')

    end_of_training_date = add_market_days(today, -ahead_days, data_df)
    start_date = end_of_training_date - dt.timedelta(train_time)
    if start_date not in data_df.index:
        # Subtracting calendar days may land on a weekend or holiday, and
        # add_market_days rejects dates that are not market days: snap forward
        # to the first market day inside the training window.
        start_date = data_df.index[data_df.index >= start_date].min()
    start_target_date = add_market_days(start_date, base_days + ahead_days - 1, data_df)
    
    data_train_df = data_df[start_date:end_of_training_date]
    data_target_df = data_df.loc[start_target_date: today, TARGET_FEATURE]
    
    date_base_ini = start_date
    date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
    date_target = add_market_days(date_base_end, ahead_days, data_df)
    # Collected and concatenated once: DataFrame.append was removed in
    # pandas 2.0 and copied all the accumulated rows on every iteration.
    feat_tgt_blobs = []

    # NOTE(review): the strict '<' skips a base interval that ends exactly on
    # end_of_training_date - confirm that this is intended.
    while date_base_end < end_of_training_date:
        sample_blob = (data_train_df[date_base_ini: date_base_end],
                       pd.DataFrame(data_target_df.loc[date_target]))
        feat_tgt_blobs.append(blob_fun(sample_blob))
        
        date_base_ini = add_market_days(date_base_ini, step_days, data_df)
        date_base_end = add_market_days(date_base_ini, base_days - 1, data_df)
        date_target = add_market_days(date_base_end, ahead_days, data_df)
    
    feat_tgt_df = pd.concat(feat_tgt_blobs) if feat_tgt_blobs else pd.DataFrame()
    feat_tgt_df = feat_tgt_df.sample(frac=1)
    
    X_df = feat_tgt_df.iloc[:,:-1]
    # Rename whatever label the last column has (it was hard-coded as 7,
    # which is only right when base_days == 7).
    y_df = pd.DataFrame(feat_tgt_df.iloc[:,-1]).rename(columns={feat_tgt_df.columns[-1]: 'target'})
    
    return X_df, y_df

In [46]:
from time import time

tic = time()
X, y = generate_train_intervals(data_df, 
                                train_time, 
                                base_days, 
                                step_days, 
                                ahead_days, 
                                today, 
                                feature_close_one_to_one)
toc = time()
print('Elapsed time: %i seconds.' % (toc-tic))

Elapsed time: 2 seconds.


In [47]:
print(X.shape)
X.head(10)

(17102, 7)


Unnamed: 0,0,1,2,3,4,5,6
2014-02-20,80.76,81.48,81.74,81.86,82.45,82.23,82.5
2014-02-20,35.85,36.04,35.97,36.1,36.01,35.95,36.02
2014-05-12,29.04,29.11,29.25,29.23,29.27,28.88,28.95
2014-04-10,28.26,28.38,28.55,28.95,29.17,28.86,28.73
2014-08-11,99.77,99.81,99.99,99.97,100.46,100.6,101.8
2014-01-09,84.64,85.19,84.02,84.41,85.08,83.98,83.92
2014-11-28,63.23,63.6,64.84,65.33,65.86,65.9,66.22
2014-07-22,121.11,122.01,122.26,122.84,122.65,123.31,130.01
2014-09-10,48.75,49.01,48.4,48.56,48.96,49.21,49.69
2014-11-18,83.79,84.99,84.58,84.65,85.4,84.95,84.98


In [48]:
print(y.shape)
y.head(10)

(17102, 1)


Unnamed: 0,target
2014-02-20,81.84
2014-02-20,36.0
2014-05-12,28.87
2014-04-10,28.78
2014-08-11,101.74
2014-01-09,84.06
2014-11-28,65.99
2014-07-22,127.39
2014-09-10,50.35
2014-11-18,87.54


### Let's test the "final" (you never know...) function in its module

In [49]:
sys.path.append('../../')
import predictor.feature_extraction as fe

X, y = fe.generate_train_intervals(data_df, 
                                   train_time, 
                                   base_days, 
                                   step_days, 
                                   ahead_days, 
                                   today, 
                                   feature_close_one_to_one)

In [50]:
print(X.shape)
X.head(10)

(24606, 7)


Unnamed: 0,0,1,2,3,4,5,6
2014-05-09,21.1,21.77,21.46,20.73,19.76,20.06,20.07
2014-03-31,60.21,60.26,61.69,61.45,59.98,59.03,59.1
2014-10-17,79.77,79.64,81.16,81.02,81.56,82.52,82.86
2013-07-19,67.39,67.0,66.85,65.94,66.56,64.73,63.44
2014-11-17,41.16,41.26,40.92,40.92,41.44,41.43,41.15
2013-07-30,52.09,52.04,53.48,53.93,54.25,53.76,52.89
2013-11-06,83.0,81.04,83.38,83.3,84.52,84.84,85.22
2014-05-09,49.27,50.12,50.05,50.2,48.61,48.98,49.34
2014-12-08,18.27,18.34,17.86,17.78,17.02,17.08,16.95
2014-05-09,43.59,43.02,43.2,43.78,43.63,43.64,43.52


In [51]:
print(y.shape)
y.head(10)

(24606, 1)


Unnamed: 0,target
2014-05-09,19.77
2014-03-31,60.23
2014-10-17,84.09
2013-07-19,63.14
2014-11-17,41.87
2013-07-30,52.79
2013-11-06,85.11
2014-05-09,49.74
2014-12-08,17.47
2014-05-09,43.63


## Nice!

## I will try to modify the add_market_days function so that it returns a shift in real market days instead of an index shift. That way it takes into account possible duplicate dates in the index, which are very common in some of the approaches I will follow.

In [55]:
data_df

feature,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,SPY,MMM,ABT,ABBV,ACN,ATVI,AYI,ADBE,AMD,AAP,...,WYNN,XEL,XRX,XLNX,XYL,YHOO,YUM,ZBH,ZION,ZTS
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1993-01-29,43.94,24.50,6.88,,,,,2.59,18.75,,...,,87800.0,7633602.0,1745196.0,,,,,33600.0,
1993-02-01,44.25,24.69,6.88,,,,,2.72,19.12,,...,,72400.0,3001200.0,3574800.0,,,,,32000.0,
1993-02-02,44.34,24.72,6.53,,,,,2.84,20.25,,...,,242200.0,1388598.0,2652396.0,,,,,251600.0,
1993-02-03,44.81,25.19,6.91,,,,,2.70,20.50,,...,,272200.0,1228200.0,5040396.0,,,,,254800.0,
1993-02-04,45.00,26.06,6.84,,,,,2.73,20.12,,...,,162800.0,1675602.0,7033200.0,,,,,317200.0,
1993-02-05,44.97,27.19,6.88,,,,,2.60,19.62,,...,,73600.0,3104598.0,7197600.0,,,,,292400.0,
1993-02-08,44.97,27.25,6.66,,,,,2.62,19.75,,...,,43400.0,3567600.0,5848800.0,,,,,202000.0,
1993-02-09,44.66,27.25,6.84,,,,,2.59,19.75,,...,,58400.0,1281600.0,2401200.0,,,,,111600.0,
1993-02-10,44.72,27.31,6.88,,,,,2.56,19.75,,...,,174200.0,2275602.0,3268404.0,,,,,38800.0,
1993-02-11,44.94,26.94,6.94,,,,,2.62,21.00,,...,,87400.0,343002.0,1974804.0,,,,,18400.0,


In [57]:
base = data_df.index[0]
delta = 252

In [63]:
market_days = np.unique(data_df.sort_index().index)
len(market_days)

5520

In [None]:
def add_market_days(base, delta, data_df):
    """
    Shift a date by a number of market days.

    base is in real time, and must be one of the dates in data_df.index.
    delta is in market days (it may be negative).

    The shift is counted over the distinct, sorted dates of data_df.index,
    so dates that are repeated in the index count as a single market day.

    Returns the market day found `delta` days away from `base`. When the
    shift runs past either end of the available dates, the last (or first)
    date is returned instead. Raises an Exception if `base` is not one of
    the market days.
    """
    market_days = data_df.index.unique().sort_values()
    if base not in market_days:
        raise Exception('The base date is not in the market days list.')
    # The index is unique here, so get_loc gives a single integer position.
    base_index = market_days.get_loc(base)
    # Clamp to the available range instead of running off either end.
    shifted_index = min(max(base_index + delta, 0), len(market_days) - 1)
    return market_days[shifted_index]