-
Notifications
You must be signed in to change notification settings - Fork 3
/
hist_fx_data_nfp_hive.txt
130 lines (113 loc) · 4.41 KB
/
hist_fx_data_nfp_hive.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
hdfs dfs -mkdir -p /user/cloudera/rawdata/hist_fx_nfp/nfp_txt
hdfs dfs -mkdir -p /user/cloudera/rawdata/hist_fx_nfp/gaps/eurusd
hdfs dfs -mkdir -p /user/cloudera/rawdata/hist_fx_nfp/gaps/gbpusd
hdfs dfs -mkdir -p /user/cloudera/rawdata/hist_fx_nfp/gaps/usdjpy
hdfs dfs -moveFromLocal non_farm_payroll_dataset.txt /user/cloudera/rawdata/hist_fx_nfp/nfp_txt
hdfs dfs -moveFromLocal eurusd/ /user/cloudera/rawdata/hist_fx_nfp
hdfs dfs -moveFromLocal gbpusd/ /user/cloudera/rawdata/hist_fx_nfp
hdfs dfs -moveFromLocal usdjpy/ /user/cloudera/rawdata/hist_fx_nfp
hdfs dfs -moveFromLocal eurusd_gaps/* /user/cloudera/rawdata/hist_fx_nfp/gaps/eurusd
hdfs dfs -moveFromLocal gbpusd_gaps/* /user/cloudera/rawdata/hist_fx_nfp/gaps/gbpusd
hdfs dfs -moveFromLocal usdjpy_gaps/* /user/cloudera/rawdata/hist_fx_nfp/gaps/usdjpy
create schema fx_nfp;
use fx_nfp;
create external table tick_data_raw (
trade_dt string,
bid decimal(12,6),
ask decimal(12,6),
vol tinyint
)
partitioned by (curr_pair string)
row format delimited
fields terminated by ',';
alter table tick_data_raw add partition (curr_pair='EURUSD') location '/user/cloudera/rawdata/hist_fx_nfp/eurusd';
alter table tick_data_raw add partition (curr_pair='GBPUSD') location '/user/cloudera/rawdata/hist_fx_nfp/gbpusd';
alter table tick_data_raw add partition (curr_pair='USDJPY') location '/user/cloudera/rawdata/hist_fx_nfp/usdjpy';
create external table nfp_hist_raw(
release_date string,
release_time string,
actual string,
forecast string,
previous string
)
row format delimited
fields terminated by '|'
location '/user/cloudera/rawdata/hist_fx_nfp/nfp_txt';
create external table nfp_hist(
release_date string,
release_time string,
actual int,
forecast int,
previoius int,
trading_day string,
trading_hour int,
trading_min int
)
stored as parquet;
insert overwrite table nfp_hist
select release_date,
release_time,
cast(regexp_replace(actual, 'K', '') as int) actual,
cast(regexp_replace(forecast, 'K', '') as int) forecast,
cast(regexp_replace(previous, 'K', '') as int) previous,
concat(substr(release_date, 9, 4),case substr(release_date, 1, 3)
when 'Jan' then '01'
when 'Feb' then '02'
when 'Mar' then '03'
when 'Apr' then '04'
when 'May' then '05'
when 'Jun' then '06'
when 'Jul' then '07'
when 'Aug' then '08'
when 'Sep' then '09'
when 'Oct' then '10'
when 'Nov' then '11'
when 'Dec' then '12'
else 'NA' end, substr(release_date, 5, 2)) trading_day,
cast(substr(release_time, 1, 2) as int) trading_hour,
cast(substr(release_time, 4, 2) as int) trading_min
from nfp_hist_raw
where release_time <> 'Time';
-- do this in impala
create external table tick_data (
trade_dt string, day string, hour int, min int, sec int, milli int,
bid decimal(12,6),
ask decimal(12,6)
)
partitioned by (curr_pair string)
stored as parquet;
alter table tick_data add partition (curr_pair='EURUSD');
alter table tick_data add partition (curr_pair='GBPUSD');
alter table tick_data add partition (curr_pair='USDJPY');
insert overwrite table tick_data partition (curr_pair='EURUSD')
select trade_dt,
substr(trade_dt, 1, 8) trading_day,
cast(substr(trade_dt, 10, 2) as int) trading_hour,
cast(substr(trade_dt, 12, 2) as int) trading_min,
cast(substr(trade_dt, 14, 2) as int) trading_sec,
cast(substr(trade_dt, 16, 3) as int) trading_milli, ask, bid
from tick_data_raw
where curr_pair = 'EURUSD';
insert overwrite table tick_data partition (curr_pair='GBPUSD')
select trade_dt,
substr(trade_dt, 1, 8) trading_day,
cast(substr(trade_dt, 10, 2) as int) trading_hour,
cast(substr(trade_dt, 12, 2) as int) trading_min,
cast(substr(trade_dt, 14, 2) as int) trading_sec,
cast(substr(trade_dt, 16, 3) as int) trading_milli, ask, bid
from tick_data_raw
where curr_pair = 'GBPUSD';
insert overwrite table tick_data partition (curr_pair='USDJPY')
select trade_dt,
substr(trade_dt, 1, 8) trading_day,
cast(substr(trade_dt, 10, 2) as int) trading_hour,
cast(substr(trade_dt, 12, 2) as int) trading_min,
cast(substr(trade_dt, 14, 2) as int) trading_sec,
cast(substr(trade_dt, 16, 3) as int) trading_milli, ask, bid
from tick_data_raw
where curr_pair = 'USDJPY';
-- for space sake, delete the files
hdfs dfs -rmr -skipTrash /user/cloudera/rawdata/hist_fx_nfp/eurusd
hdfs dfs -rmr -skipTrash /user/cloudera/rawdata/hist_fx_nfp/gbpusd
hdfs dfs -rmr -skipTrash /user/cloudera/rawdata/hist_fx_nfp/usdjpy
drop table tick_data_raw;