# Extract News Events

In [None]:
import pandas as pd
import os
os.getcwd()

## Reuters

In [None]:
# Root folder of the Reuters news dump; the next cell walks it with os.walk.
# NOTE(review): `dir` shadows the built-in of the same name. Renaming it needs
# a matching change in the walk cell that reads it.
dir = 'ReutersNews106521'

In [None]:
# Collect the path of every file found anywhere below `dir`.
filepaths = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(dir)
    for filename in filenames
]

Extract the headline and timestamp of each article into `newslist` and `timelist`

In [None]:
# First line of every readable file (later used as the news title) and its
# third line (later used as the timestamp). The two lists stay index-aligned
# because both are appended only after all reads for a file succeeded.
newslist = []
timelist = []
# Paths that could not be opened or decoded as UTF-8.
exceptlist = []

for path in filepaths:
    try:
        # The context manager closes the handle even if decoding fails
        # part-way through the reads.
        with open(path, encoding='utf8') as f:
            first_line = f.readline()
            f.readline()  # the second line is not used
            third_line = f.readline()
    except (OSError, UnicodeError):
        # Best effort: remember the file and carry on with the next one.
        exceptlist.append(path)
        continue
    newslist.append(first_line)
    timelist.append(third_line)

In [None]:
# Trim every raw line: drop its first three characters and its last one
# (presumably a leading marker and the trailing newline — TODO confirm against
# the raw files). Headlines additionally get a closing period.
newslist = [f"{raw[3:-1]}." for raw in newslist]
timelist = [raw[3:-1] for raw in timelist]

In [None]:
# Assemble the Reuters frame, one row per article.
# NOTE(review): `time_list` is only assigned by the reload cell further down
# (read from ./timelist); the extraction cells above build `timelist` instead.
# NOTE(review): `linelist` is not assigned anywhere in the notebook as shown —
# confirm where it comes from, otherwise this line raises NameError.
news_df = pd.DataFrame({'datetime': time_list, 'news_title': newslist, 'line_num': linelist})    

In [None]:
# Parse the raw timestamp strings into pandas datetimes.
# `infer_datetime_format` is deprecated since pandas 2.0 (format inference is
# the default there), so the argument is no longer passed.
news_df['datetime'] = pd.to_datetime(news_df['datetime'])

In [None]:
# Tag every row with its origin so the sources stay distinguishable after the
# frames are concatenated.
news_df['source'] = 'Reuters'

In [None]:
# Keep the last three characters of each raw timestamp string as a zone label
# (a later cell compares it against 'EDT').
news_df['tz'] = [stamp[-3:] for stamp in time_list]

Timezone conversion to UTC

In [None]:
# Boolean Series: True where the row's zone label is 'EDT'.
tz_bool = news_df['tz'].eq('EDT')

In [None]:
# Attach the US/Eastern zone to the parsed timestamps. `ambiguous` receives the
# per-row EDT flags so that wall-clock times repeated at the autumn DST change
# are resolved as daylight (True) or standard (False) time.
news_df['datetime'] = news_df['datetime'].dt.tz_localize(tz='US/Eastern', ambiguous=tz_bool)

In [None]:
news_df

In [None]:
# Re-express the zone-aware Eastern timestamps in UTC.
news_df['datetime'] = news_df['datetime'].dt.tz_convert('UTC')

In [None]:
# The helper column is no longer needed once the timestamps carry their zone.
news_df.drop(columns=['tz'], inplace=True)

Cache the cleaned headlines and timestamps on disk (`./newslist`, `./timelist`) and reload them

In [None]:
# with open('./newslist', 'w', encoding='utf8') as f:
#     for item in newslist:
#         f.write("%s\n" % item)

In [None]:
# with open('./timelist', 'w', encoding='utf8') as f:
#     for item in time_list:
#         f.write("%s\n" % item)

In [None]:
# Reload the cached headlines, one per line, without the trailing newline.
# The context manager closes the file as soon as the list has been built.
with open('./newslist', encoding='utf8') as cached_titles:
    newslist = [line.rstrip('\n') for line in cached_titles]

In [None]:
# Reload the cached timestamp strings, one per line, without the trailing
# newline. The encoding literal is now properly terminated and the context
# manager closes the file as soon as the list has been built.
with open('./timelist', encoding='utf8') as cached_times:
    time_list = [line.rstrip('\n') for line in cached_times]

In [None]:
# datetime_list = [datetime.strptime(x[:-4], '%a %b %d, %Y %I:%M%p') for x in time_list]

## Bloomberg

In [None]:
# Root folder of the Bloomberg news dump; the next cell walks it with os.walk.
# NOTE(review): `dir` shadows the built-in of the same name. Renaming it needs
# a matching change in the walk cell that reads it.
dir = '20061020_20131126_bloomberg_news'

In [None]:
# Rebuild `filepaths` with the path of every file found anywhere below `dir`.
filepaths = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(dir)
    for filename in filenames
]

In [None]:
# newslist2 = []
# exceptlist2 = []
# time_list2 = []

# # Turn extracted events into dataframe
# o1_list = []
# p_list = []
# o2_list = []

# for event in event_list:
#     o1_list.append(event[2])
#     p_list.append(event[3])
#     o2_list.append(event[4])
    
# event_ext_df = pd.DataFrame({'line_num': line_num_list, 'o1': o1_list, 'p': p_list, 'o2': o2_list})

In [None]:
# Trim each raw Bloomberg line: drop its first three characters and its last
# one; headlines additionally get a closing period.
# NOTE(review): `newslist2` / `time_list2` are not assigned by any active cell
# shown here (the cells that would build or reload them are commented out) —
# confirm where they come from.
newslist2 = [f"{raw[3:-1]}." for raw in newslist2]
time_list2 = [raw[3:-1] for raw in time_list2]

In [None]:
# with open('./newslist_b', 'w', encoding='utf8') as f:
#     for item in newslist2:
#         f.write("%s\n" % item)

In [None]:
# with open('./time_list_b', 'w', encoding='utf8') as f:
#     for item in time_list2:
#         f.write("%s\n" % item)

In [None]:
# newslist2 = [line.rstrip('\n') for line in open('./newslist_b', encoding='utf8')]
# time_list2 = [line.rstrip('\n') for line in open('./time_list_b', encoding='utf8')]

In [None]:
import pandas as pd

In [None]:
# Assemble the Bloomberg frame, one row per article, with the same
# `datetime` / `news_title` column names the Reuters frame uses.
news2_df = pd.DataFrame(
    data={
        'datetime': time_list2,
        'news_title': newslist2,
    },
)

In [None]:
# Data cleanup: keep only rows whose timestamp string is exactly 20 characters
# long. This also removes empty strings, so no separate '' filter is needed.
# NOTE(review): 20 characters presumably matches an ISO stamp such as
# 'YYYY-MM-DDTHH:MM:SSZ' — confirm against the raw files.
has_full_timestamp = news2_df['datetime'].str.len() == 20
# .copy() makes the result an independent frame, so later column assignments
# do not raise SettingWithCopyWarning. The index is renumbered only after all
# filtering, so it ends up contiguous.
news2_df = news2_df[has_full_timestamp].copy()
news2_df.reset_index(inplace=True, drop=True)

In [None]:
# Parse the timestamp strings into datetimes. Plain column assignment replaces
# the column together with its dtype. Under pandas >= 2.0,
# `.loc[:, 'datetime'] = ...` writes into the existing object column in place,
# which leaves the parsed values stored with object dtype and breaks the later
# `.dt` accessor calls.
news2_df['datetime'] = pd.to_datetime(news2_df['datetime'])

In [None]:
# Tag every row with its origin so the sources stay distinguishable after the
# frames are concatenated.
news2_df['source'] = 'Bloomberg'

## Combine news sources

In [None]:
# Stack the Reuters and Bloomberg frames row-wise. Each frame keeps its own
# index labels here; the index is renumbered in a later cell.
df = pd.concat([news_df, news2_df])

In [None]:
# Discard rows that have no timestamp.
df = df.dropna(subset=['datetime'])

In [None]:
# Order the combined news chronologically (ascending by `datetime`).
df = df.sort_values(by='datetime')

In [None]:
# Rebind `newslist` to the headlines of the combined frame, in the frame's
# current (chronological) row order.
newslist = df['news_title'].tolist()

In [None]:
# Renumber the rows 0..n-1; the old index labels are discarded.
df = df.reset_index(drop=True)

In [None]:
# Express the timestamps in US/Eastern. The column is replaced by plain
# assignment: under pandas >= 2.0, `df.loc[:, 'datetime'] = ...` writes into the
# existing zone-aware column in place, which keeps that column's own time zone
# and so silently discards the conversion.
df['datetime'] = df['datetime'].dt.tz_convert('US/Eastern')

In [None]:
# Checkpoint the combined news frame to disk.
df.to_pickle("./news.pkl")

In [None]:
# Restore the combined news frame from the checkpoint written above, so the
# cells below can be run without repeating the extraction.
# NOTE: pickle files must come from a trusted source.
df = pd.read_pickle("./news.pkl")

Join stock movements

In [None]:
df.tail()

In [None]:
# Load the stock data frame; the cells below read its `Date` and `stock`
# columns. NOTE(review): presumably Dow Jones index data built elsewhere —
# its construction is not shown in this notebook.
dji = pd.read_pickle("./dji.pkl")

In [None]:
# For every news row, attach the most recent `dji` row whose `Date` is at or
# before the news `datetime` (direction='backward').
# NOTE(review): merge_asof needs both key columns sorted and of the same dtype
# (including time zone). `df` was sorted by `datetime` earlier; the ordering
# and dtype of dji['Date'] are not visible here — confirm.
df = pd.merge_asof(left=df, right=dji[['Date', 'stock']], left_on='datetime', right_on='Date', direction='backward')

In [None]:
# Left-join from the full list of `dji` dates so that every `Date` appears in
# the result, including dates that matched no news row (their news columns
# are left empty).
df = pd.merge(left=dji[['Date']], right=df, on='Date', how='left')

In [None]:
# Renumber the rows 0..n-1, then copy that numbering into a regular `index`
# column.
df = df.reset_index(drop=True).reset_index()

In [None]:
# Chronological split: rows up to and including the cut-off go to training,
# later rows to testing. Rows whose `datetime` is missing fail both
# comparisons and therefore end up in neither frame.
split_point = '2012-11-26'
train_df = df.loc[df['datetime'] <= split_point]
test_df = df.loc[df['datetime'] > split_point]

In [None]:
# Checkpoint both halves of the split to disk.
train_df.to_pickle('./train_df.pkl')
test_df.to_pickle('./test_df.pkl')

In [None]:
# Restore both halves of the split from the checkpoints written above.
# NOTE: pickle files must come from a trusted source.
train_df = pd.read_pickle('./train_df.pkl')
test_df = pd.read_pickle('./test_df.pkl')

In [None]:
train_df.head()

In [None]:
# with open('./train_news', 'w', encoding='utf8') as f:
#     for item in train_df['news_title']:
#         f.write("%s\n" % item)

In [None]:
# with open('./test_news', 'w', encoding='utf8') as f:
#     for item in test_df['news_title']:
#         f.write("%s\n" % item)

In [None]:
# Shell command (notebook `!` magic): run the ReVerb jar on the `train_news`
# file and redirect its output to ./train_events.
# NOTE(review): `train_news` is written by the cell above, which is currently
# commented out — the file must already exist on disk.
!java -Xmx512m -jar ./reverb-latest.jar -N train_news > ./train_events

# Reverb


## All events

Extract all events

In [None]:
# Run in command line

# cd /C/Users/Yonge/Stock_Price_Prediction

# find ReutersNews106521/ -type f -path "ReutersNews106521/2006*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters06.txt
# find ReutersNews106521/ -type f -path "ReutersNews106521/2007*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters07.txt
# find ReutersNews106521/ -type f -path "ReutersNews106521/2008*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters08.txt
# find ReutersNews106521/ -type f -path "ReutersNews106521/2009*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters09.txt
# find ReutersNews106521/ -type f -path "ReutersNews106521/2010*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters10.txt
# find ReutersNews106521/ -type f -path "ReutersNews106521/2011*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters11.txt
# find ReutersNews106521/ -type f -path "ReutersNews106521/2012*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters12.txt
# find ReutersNews106521/ -type f -path "ReutersNews106521/2013*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./reuters13.txt

# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2006*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg06.txt
# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2007*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg07.txt
# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2008*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg08.txt
# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2009*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg09.txt
# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2010*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg10.txt
# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2011*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg11.txt
# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2012*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg12.txt
# find 20061020_20131126_bloomberg_news/ -type f -path "20061020_20131126_bloomberg_news/2013*" -type f | java -Xmx512m -jar ./reverb-latest.jar -f > ./bloomberg13.txt

In [None]:
# files = []
# for i in range(6, 13):
#     files.append('reuters' + str(i).zfill(2) + '.txt')

In [None]:
# # Extract all Reuters events
# o1_list = []
# p_list = []
# o2_list = []

# for file in files:
#     for line in open(file, encoding='utf-8'):
#         line = line.rstrip('\n').split('\t')
#         o1_list.append(line[2])
#         p_list.append(line[3])
#         o2_list.append(line[4])
    
# reuters_event_df = pd.DataFrame({'o1': o1_list, 'p': p_list, 'o2': o2_list})

In [None]:
# reuters_event_df.to_pickle("./reuters_event_df.pkl")

In [None]:
# files = []
# for i in range(6, 13):
#     files.append('bloomberg' + str(i).zfill(2) + '.txt')

In [None]:
# # Extract all Bloomberg events
# o1_list = []
# p_list = []
# o2_list = []

# for file in files:
#     print(file)            
#     for line in open(file, encoding='utf-8', errors='replace'):
#         line = line.rstrip('\n').split('\t')
#         o1_list.append(line[2])
#         p_list.append(line[3])
#         o2_list.append(line[4])


    
# bloomberg_event_df = pd.DataFrame({'o1': o1_list, 'p': p_list, 'o2': o2_list})

In [None]:
# bloomberg_event_df.to_pickle("./bloomberg_event_df.pkl")

In [None]:
# reuters_event_df = pd.read_pickle("./reuters_event_df.pkl")
# bloomberg_event_df = pd.read_pickle("./bloomberg_event_df.pkl")

In [None]:
# combined_event_df = pd.concat([reuters_event_df, bloomberg_event_df])

In [None]:
# combined_event_df.to_pickle("./combined_event_df.pkl")

In [None]:
# Preview the first rows of the combined event frame. The commented-out cells
# above that build and pickle it call it `combined_event_df`, so the same name
# is used here.
combined_event_df.head()

In [None]:
# Preview the last rows of the combined event frame. The commented-out cells
# above that build and pickle it call it `combined_event_df`, so the same name
# is used here.
combined_event_df.tail()