In [1]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import math
from timeit import default_timer as timer
from datetime import datetime, timedelta
import numba
from ensemble_processing import load_data, load, save

# Column-name constant; presumably the prediction target (cf. the Future*WeekReturn
# columns in the raw data) — TODO confirm. Not referenced by any cell visible in this notebook.
LABEL_COLUMN = "Future8WeekReturn"

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Raise pandas' display limits so wide/long frames are not elided when rendered.
for _option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_option, 500)

In [3]:
# Load the unlabelled dataset from its gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
unlabelled_df = pd.read_pickle(
    '../data/ml-20180922-unlabelled.pkl.gz',
    compression='gzip',
)

In [4]:
# Week 1 slice: rows whose index falls in the window
# (reference date - 8 weeks, reference date - 7 weeks].
reference_date = '2018-09-08'
converted_ref_date = datetime.strptime(reference_date, '%Y-%m-%d')

# Window bounds, expressed as offsets back from the reference date.
weeks_delta_1 = timedelta(weeks=8)
weeks_delta_2 = timedelta(weeks=7)
comparison_1_date = converted_ref_date - weeks_delta_1
comparison_2_date = converted_ref_date - weeks_delta_2
print('Calculated comparison 1 date:', comparison_1_date)
print('Calculated comparison 2 date:', comparison_2_date)

# Lower bound is exclusive, upper bound is inclusive.
_after_start = unlabelled_df.index > comparison_1_date
_up_to_end = unlabelled_df.index <= comparison_2_date
week1_df = unlabelled_df.loc[_after_start & _up_to_end]
print('Retaining', len(week1_df), 'week 1 records')

Calculated comparison 1 date: 2018-07-14 00:00:00
Calculated comparison 2 date: 2018-07-21 00:00:00
Retaining 7297 week 1 records


In [None]:
# Preview only the first rows: rendering the whole frame (thousands of rows, with
# display.max_rows raised to 500) floods the notebook output.
week1_df.head()

In [5]:
# Week 2 slice: rows whose index falls in the window
# (reference date - 7 weeks, reference date - 6 weeks].
reference_date = '2018-09-08'
converted_ref_date = datetime.strptime(reference_date, '%Y-%m-%d')

# Window bounds, expressed as offsets back from the reference date.
weeks_delta_1 = timedelta(weeks=7)
weeks_delta_2 = timedelta(weeks=6)
comparison_1_date = converted_ref_date - weeks_delta_1
comparison_2_date = converted_ref_date - weeks_delta_2
print('Calculated comparison 1 date:', comparison_1_date)
print('Calculated comparison 2 date:', comparison_2_date)

# Lower bound is exclusive, upper bound is inclusive.
_after_start = unlabelled_df.index > comparison_1_date
_up_to_end = unlabelled_df.index <= comparison_2_date
week2_df = unlabelled_df.loc[_after_start & _up_to_end]
print('Retaining', len(week2_df), 'week 2 records')

Calculated comparison 1 date: 2018-07-21 00:00:00
Calculated comparison 2 date: 2018-07-28 00:00:00
Retaining 7321 week 2 records


In [None]:
# Preview only the first rows: rendering the whole frame (thousands of rows, with
# display.max_rows raised to 500) floods the notebook output.
week2_df.head()

In [6]:
# Persist both weekly slices as gzip-compressed pickles (week 1 first, then week 2).
for _out_path, _frame in (
    ('../data/ml-20180922-unlabelled-week1.pkl.gz', week1_df),
    ('../data/ml-20180922-unlabelled-week2.pkl.gz', week2_df),
):
    _frame.to_pickle(_out_path, compression='gzip')

In [None]:
# Load the labelled dataset from its gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
labelled_df = pd.read_pickle(
    '../data/ml-20180922-labelled.pkl.gz',
    compression='gzip',
)

In [None]:
# Get new values since the previous model: keep rows whose index is strictly
# after (reference date - 8 weeks).
reference_date = '2018-07-14'
converted_ref_date = datetime.strptime(reference_date, '%Y-%m-%d')

weeks_delta_1 = timedelta(weeks=8)
comparison_1_date = converted_ref_date - weeks_delta_1
print('Calculated comparison 1 date:', comparison_1_date)

_is_new = labelled_df.index > comparison_1_date
new_df = labelled_df.loc[_is_new]
print('Retaining', len(new_df), 'new records')

In [None]:
# Preview only the first rows rather than rendering the whole frame
# (display.max_rows is raised to 500, so a bare display would dump hundreds of rows).
new_df.head()

In [None]:
# Persist the post-cut-off labelled rows as a gzip-compressed pickle.
new_df.to_pickle(
    '../data/ml-20180922-labelled-new.pkl.gz',
    compression='gzip',
)

In [2]:
# Load the labelled dataset from its gzip-compressed pickle.
# NOTE(review): an earlier cell reads this same file into the same name; on a
# top-to-bottom run this second load is redundant — consider removing one of them.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
labelled_df = pd.read_pickle(
    '../data/ml-20180922-labelled.pkl.gz',
    compression='gzip',
)

In [3]:
# Build the "to FY1718" extract: labelled rows whose index is strictly before 2018-07-01.
reference_date = '2018-07-01'
converted_ref_date = datetime.strptime(reference_date, '%Y-%m-%d')

_before_cutoff = labelled_df.index < converted_ref_date
fy0708_1718_df = labelled_df.loc[_before_cutoff]
print('Retaining', len(fy0708_1718_df), 'records')

Retaining 3384231 records


In [4]:
# Persist the labelled pre-cut-off extract as a gzip-compressed pickle.
fy0708_1718_df.to_pickle(
    '../data/ml-fy0708_1718_df-labelled.pkl.gz',
    compression='gzip',
)

In [6]:
# Drop the references to the large labelled frames before the raw data is loaded.
# Note: re-running this cell raises NameError because the names no longer exist.
del labelled_df, fy0708_1718_df

In [5]:
# Load the raw dataset from its gzip-compressed pickle.
# NOTE: unpickling can execute arbitrary code — only load files from a trusted source.
raw_df = pd.read_pickle(
    '../data/ml-20180922-data.pkl.gz',
    compression='gzip',
)

In [9]:
# Parse quoteDate into datetimes so it can be compared against datetime cut-offs.
# errors='coerce' turns unparseable values into NaT instead of raising; NaT never
# satisfies a `<` comparison, so such rows would silently drop out of later date
# filters. Report how many values were coerced so that loss is visible.
_missing_before = raw_df['quoteDate'].isna().sum()
raw_df['quoteDate'] = pd.to_datetime(raw_df['quoteDate'], errors='coerce')
_coerced = raw_df['quoteDate'].isna().sum() - _missing_before
if _coerced:
    print('Warning:', _coerced, 'quoteDate values could not be parsed and were set to NaT')

In [10]:
# Preview the first ten rows instead of rendering the whole (multi-million-row) frame.
raw_df.head(10)

Unnamed: 0,symbol,quoteDate,lastTradePriceOnly,adjustedPrice,volume,daysHigh,daysLow,previousClose,change,changeInPercent,...,Future12WeekReturn,Future12WeekRiskAdjustedReturn,Future26WeekDividend,Future26WeekPrice,Future26WeekReturn,Future26WeekRiskAdjustedReturn,Future52WeekDividend,Future52WeekPrice,Future52WeekReturn,Future52WeekRiskAdjustedReturn
0,CG1,2018-05-14,0.430,0.430,27759,0.430,0.430,0.430,,,...,,,,,,,,,,
1,CG1,2018-05-15,0.425,0.425,45633,0.430,0.425,0.430,-0.005,-0.011628,...,,,,,,,,,,
2,CG1,2018-05-17,0.420,0.420,287372,0.430,0.415,0.425,-0.005,-0.011765,...,,,,,,,,,,
3,CG1,2018-05-18,0.465,0.465,42373,0.465,0.420,0.420,0.045,0.107143,...,,,,,,,,,,
4,CG1,2018-05-21,0.510,0.510,229346,0.510,0.470,0.465,0.045,0.096774,...,,,,,,,,,,
5,CG1,2018-05-22,0.500,0.500,2378,0.500,0.500,0.510,-0.010,-0.019608,...,,,,,,,,,,
6,CG1,2018-05-23,0.490,0.490,92622,0.500,0.490,0.500,-0.010,-0.020000,...,,,,,,,,,,
7,CG1,2018-05-24,0.485,0.485,2922,0.485,0.485,0.490,-0.005,-0.010204,...,,,,,,,,,,
8,CG1,2018-05-25,0.470,0.470,20404,0.470,0.465,0.485,-0.015,-0.030928,...,,,,,,,,,,
9,CG1,2018-05-28,0.465,0.465,8805,0.465,0.465,0.470,-0.005,-0.010638,...,,,,,,,,,,


In [11]:
# Build the "to FY1718" extract from the raw data: rows whose quoteDate is
# strictly before 2018-07-01.
reference_date = '2018-07-01'
converted_ref_date = datetime.strptime(reference_date, '%Y-%m-%d')

_before_cutoff = raw_df['quoteDate'] < converted_ref_date
fy0708_1718_df = raw_df.loc[_before_cutoff]
print('Retaining', len(fy0708_1718_df), 'records')

Retaining 3384231 records


In [12]:
# Persist the raw pre-cut-off extract as a gzip-compressed pickle.
fy0708_1718_df.to_pickle(
    '../data/ml-fy0708_1718_df-data.pkl.gz',
    compression='gzip',
)