-
Notifications
You must be signed in to change notification settings - Fork 40
/
data_reader.py
168 lines (122 loc) · 4.98 KB
/
data_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import os
import sys
from glob import glob

import numpy as np
import pandas as pd
# Debug-friendly display settings: never truncate printed arrays/frames.
# BUG FIX: `threshold=np.nan` was never a valid value and raises in modern
# numpy — the documented way to disable summarization is `sys.maxsize`.
np.set_printoptions(threshold=sys.maxsize)
# BUG FIX: the 'display.height' option was removed from pandas and now raises
# an OptionError, so it is dropped; 'display.width' below covers the intent.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
def read_sp_500():
    """Read ``trends/SP500.csv`` and derive returns and a volatility proxy.

    Returns a DataFrame indexed by date with exactly two columns:
      - ``returns``: daily log-return, log(Adj Close_t) - log(Adj Close_{t-1})
      - ``sigma``: intraday range-based volatility estimator built from the
        log-ratios u = log(High/Open), d = log(Low/Open), c = log(Close/Open)

    Raises whatever ``pd.read_csv`` raises if the file is missing/malformed.
    """
    spy = pd.read_csv('trends/SP500.csv', parse_dates=True, index_col=0)
    spy['Adj Close Log'] = spy['Adj Close'].apply(np.log)
    spy['Adj Close Log Shift 1'] = spy['Adj Close Log'].shift(1)
    spy['returns'] = spy['Adj Close Log'] - spy['Adj Close Log Shift 1']
    print(spy.head())
    spy.dropna(inplace=True)  # first row has no previous close, so its return is NaN
    # BUG FIX: DataFrame.drop no longer accepts the axis positionally
    # (removed in pandas 2.0); use the explicit `columns=` keyword instead.
    spy.drop(columns=['Adj Close Log', 'Adj Close Log Shift 1'], inplace=True)
    print('')
    print(spy.head())
    spy['u'] = (spy['High'] / spy['Open']).apply(np.log)
    spy['d'] = (spy['Low'] / spy['Open']).apply(np.log)
    spy['c'] = (spy['Close'] / spy['Open']).apply(np.log)
    print('')
    print(spy.head())
    # correct.
    # 0.511*(0.006651--0.001358)^2-0.019*(0.006651*(0.006651+-0.001358)-2*0.006651*(-0.001358))-0.383*0.006651^2
    # ~ 0.00001482322429 [first value]
    spy['sigma'] = 0.511 * (spy['u'] - spy['d']) ** 2 - 0.019 * (
        spy['c'] * (spy['u'] + spy['d']) - 2 * spy['u'] * spy['d']) - 0.383 * spy['c'] ** 2
    print('')
    print(spy.head())
    spy.drop(columns=['u', 'd', 'c'], inplace=True)
    spy = spy[['returns', 'sigma']]  # only keep returns and volatility
    print('')
    print(spy.head())
    # import matplotlib.pyplot as plt
    # spy['sigma'].plot()
    # plt.show()
    return spy
def process_trend(trend):
    """Load one domestic-trend CSV and keep only its ``Close`` column.

    The column is renamed to ``Trend <NAME>`` where ``<NAME>`` is the
    upper-cased file stem, so the trends can later be concatenated side
    by side without column clashes.

    :param trend: path to the trend CSV (e.g. ``trends/advert.csv``).
    :return: single-column DataFrame indexed by the CSV's date column.
    """
    # BUG FIX: derive the name from the file stem instead of splitting on '/',
    # which broke on Windows separators and on paths with extra directories.
    trend_name = os.path.splitext(os.path.basename(trend))[0]
    t = pd.read_csv(trend, parse_dates=True, index_col=0)
    t = t[['Close']]
    t.columns = ['Trend {}'.format(trend_name.upper())]
    print('Trend [{0}] processed.'.format(trend_name))
    # print(t.head())
    return t
def read_trends():
    """Read every non-SP500 CSV under ``trends/`` and join them column-wise.

    Gaps are forward-filled (and back-filled at the very start) so the
    concatenated frame contains no NaN.

    :return: DataFrame with one ``Trend *`` column per trend file.
    :raises AssertionError: if the expected 27 trend files are not present.
    """
    trends = glob('trends/*.csv')
    # Exclude the index itself; keep only the Google domestic-trend series.
    trends = [x for x in trends if 'SP500' not in x and 'spy' not in x]
    trends = sorted(trends)  # reproducibility.
    assert len(trends) == 27, 'You should have 27 trends. Check here https://finance.google.com/finance/domestic_trends'
    trends_df_list = []
    for i, trend in enumerate(trends):
        if 'DEBUG' in os.environ and i > 2:
            print('DEBUG! WE TRUNCATE TO ONLY THREE TRENDS FOR SPEED')
            break
        trends_df_list.append(process_trend(trend))
    full_trend_df = pd.DataFrame(pd.concat(trends_df_list, axis=1))
    if full_trend_df.isnull().values.any():
        print(full_trend_df[full_trend_df.isnull().any(axis=1)])
    # BUG FIX: fillna(method='ffill'/'bfill') is deprecated (removed in
    # pandas 3.0); use the dedicated ffill()/bfill() methods instead.
    full_trend_df.ffill(inplace=True)
    if full_trend_df.isnull().values.any():
        full_trend_df.bfill(inplace=True)  # we cheat a very bit at the beginning.
    # we fill gap values by the last values. If we remove lines, we will have gaps in our data
    # and it's not going to be cool.
    assert not full_trend_df.isnull().values.any()
    return full_trend_df
def read_all():
    """Join the S&P 500 returns/volatility frame with all trend columns.

    Rows missing on either side of the outer join are dropped afterwards,
    so the result only covers dates present in both data sets.
    """
    trend_df = read_trends()
    sp500_df = read_sp_500()
    merged = sp500_df.join(trend_df, how='outer')  # correct
    separator = '-' * 80
    print(separator)
    print(merged.tail())
    print(separator)
    merged.dropna(inplace=True)  # correct
    print(separator)
    print(merged.tail())
    print(separator)
    # print(read_sp_500().join(process_trend(trend), how='outer'))
    return merged
def split_training_test(df):
    """Split the date-indexed frame chronologically into train/test parts.

    NOTE(review): the original comment cites a 2012 split (the paper's
    dates) while the code splits in April 2015 — behavior kept as coded;
    confirm which date is intended.
    """
    # 19-Oct-2004 to 9-Apr-2012 while the test set ranges from 12-Apr-2012 to 24-Jul-2015
    train = df.loc[:'9-Apr-2015']
    test = df.loc['12-Apr-2015':]
    return train, test
def z_score(x, mean, std):
    """Standardize ``x`` against per-column ``mean``/``std`` Series."""
    centered = x - mean.values
    return centered / std.values  # testing set.
def z_score_inv(x, mean, std):
    """Invert a z-score transform: rescale by ``std`` and re-add ``mean``."""
    return mean + std * x
def apply_z_score_to_data_frame(df, mean, std):
    """Z-score every row of ``df`` against per-column ``mean``/``std`` Series."""
    standardized = df.apply(lambda row: z_score(row, mean, std), axis=1)
    # Tips: Use this to debug.
    # print(df.head())
    # print(df.apply(lambda row: z_score(row, mean, std), axis=1).head())
    # print(mean)
    # print(std)
    return standardized
def apply_delta_t_to_data_frame(df, delta_t=3):
    """Aggregate the frame over a rolling window of ``delta_t`` rows.

    Per the paper's scheme:
      - trend columns: rolling MEAN
      - ``returns``: rolling SUM (log-returns add over the window)
      - ``sigma``: SQRT of the rolling sum of SQUARES (quadrature)

    Leading rows without a full window are dropped.

    :param df: frame containing ``returns``, ``sigma`` and trend columns.
    :param delta_t: window length in rows; 3 is the paper's value (kept as
        the default for backward compatibility).
    :return: aggregated frame with ``len(df) - delta_t + 1`` rows.
    """
    # for trends (MEAN)
    out = df.rolling(window=delta_t, center=False).mean()  # correct.
    # for returns (SUM) -- built-in rolling .sum() replaces the deprecated
    # and slower rolling(...).apply(np.sum); results are identical.
    out['returns'] = df['returns'].rolling(delta_t, center=False).sum()  # correct
    # for volatility (SQUARE - SUM - SQRT)
    out['sigma'] = np.sqrt(df['sigma'].pow(2).rolling(delta_t, center=False).sum())  # correct
    out.dropna(inplace=True)  # drop the first delta_t - 1 partial windows
    return out
def get_data():
    """Build the full normalized dataset.

    Returns ``(train_df, test_df, sigma_mean, sigma_std)`` where both
    frames are z-scored and the sigma statistics allow inverting the
    transform on model predictions.
    """
    full_df = read_all()
    full_df = apply_delta_t_to_data_frame(full_df)  # try to apply z-score before and after.
    col_mean = np.mean(full_df)
    col_std = np.std(full_df)
    full_df = apply_z_score_to_data_frame(full_df, col_mean, col_std)
    train_df, test_df = split_training_test(full_df)  # we cheat a bit but very little, no problem.
    return train_df, test_df, col_mean['sigma'], col_std['sigma']
if __name__ == '__main__':
    # Script entry point: build the dataset end to end (return value unused
    # here; side effect is the debug printing done by the pipeline).
    get_data()