In [206]:
import pandas as pd
from functools import reduce

In [207]:
# Directories the CSV loaders read from, plus the directory the merged
# feature tables are written to.
twitter_dir, news_dir = 'data/bucket/twitter/', 'data/bucket/news/'
stock_dir, output_dir = 'data/out/', 'data/features/'

In [208]:
# CSV file stems (the loaders append '.csv'). The text before the first
# underscore is what the loaders use as the display name / column prefix.
influencers = [
    'BarackObama_sentiment',
    'cnnbrk_sentiment',
    'KimKardashian_sentiment',
    'Trump_sentiment',
]
sources = [
    'New York Times_sentiment',
    'CNN_sentiment',
    'Washington Post_sentiment',
    'NYT_ALL_sentiment',
]
indices = [
    'VIX_Predict',
    'SPY_Predict',
]
stocks = [
    'AAPL_Predict',
    'GOOG_Predict',
    'MSFT_Predict',
]

In [255]:
# Columns kept from each kind of input file.
twitter_important_columns = ['Date', 'sentiment_score', 'heuristic_score']
news_important_columns = ['Date', 'sentiment_score', 'polar_score']

# Date/Change/Direction followed by a Change and Direction column for each
# of the lags 1, 2 and 5 (in that order).
stock_important_columns = ['Date', 'Change', 'Direction'] + [
    'Lag %d %s' % (lag, field)
    for lag in (1, 2, 5)
    for field in ('Change', 'Direction')
]

# Reduced column set used for the individual stocks.
stock_simple_columns = ['Date', 'Change', 'Lag 2 Significant']

In [256]:
# Date window applied by every loader. The loaders compare with strict
# `<` / `>`, so rows dated exactly on either bound are excluded.
# ISO 8601 literals are used because '5/1/2019' can be read as either
# May 1st or January 5th; these parse to the same timestamps as before
# (month-first parsing): 1 Jan 2015 and 1 May 2019.
first_date = pd.to_datetime('2015-01-01')
last_date = pd.to_datetime('2019-05-01')

In [257]:
# Build one frame per influencer: read its CSV, keep the rows dated strictly
# between first_date and last_date, keep only the Twitter columns of interest,
# and prefix the two score columns with the account name so the frames can be
# merged side by side later.
dfs_twitter = []
for influencer in influencers:
    account = influencer.partition('_')[0]
    tweets = pd.read_csv(twitter_dir + influencer + '.csv', parse_dates=['Date'])
    in_window = (tweets['Date'] < last_date) & (tweets['Date'] > first_date)
    dfs_twitter.append(
        tweets.loc[in_window, twitter_important_columns].rename(columns={
            'sentiment_score': account + '_sentiment_score',
            'heuristic_score': account + '_heuristic_score',
        })
    )

In [258]:
# Build one frame per news source: read its CSV, keep the rows dated strictly
# between first_date and last_date, and prefix the two score columns with the
# source name.
#
# The frame is restricted to news_important_columns, matching what the
# Twitter and stock loaders do with their own column lists. Without this, any
# extra column present in several news CSVs would be carried into the outer
# merge on Date and come out as clashing `_x` / `_y` columns.
dfs_news = []
for source in sources:
    source_name = source.split('_')[0]
    df = pd.read_csv(news_dir+source+'.csv', parse_dates=['Date'])
    df = df[df['Date']<last_date]
    df = df[df['Date']>first_date]
    df = df[news_important_columns]
    df = df.rename(columns={'sentiment_score':source_name+'_sentiment_score','polar_score':source_name+'_polar_score'})
    dfs_news.append(df)

In [259]:
# Build one frame per market index: read its CSV, strip the time-of-day from
# Date, keep the rows dated strictly between first_date and last_date, and
# keep only stock_important_columns. Column names are left as-is here.
dfs_index = []
for index_file in indices:
    quotes = pd.read_csv(stock_dir + index_file + '.csv', parse_dates=['Date'])
    quotes['Date'] = pd.to_datetime(quotes['Date'].dt.normalize())
    in_window = (quotes['Date'] < last_date) & (quotes['Date'] > first_date)
    dfs_index.append(quotes.loc[in_window, stock_important_columns])

In [260]:
# Build one frame per stock: read its CSV, strip the time-of-day from Date,
# keep the rows dated strictly between first_date and last_date, keep only
# stock_simple_columns, and prefix the two value columns with the ticker.
dfs_stock = []
for stock in stocks:
    ticker = stock.partition('_')[0]
    prices = pd.read_csv(stock_dir + stock + '.csv', parse_dates=['Date'])
    prices['Date'] = pd.to_datetime(prices['Date'].dt.normalize())
    in_window = (prices['Date'] < last_date) & (prices['Date'] > first_date)
    labelled = (
        prices.loc[in_window, stock_simple_columns]
        .rename(columns={
            'Change': ticker + '_Change',
            'Lag 2 Significant': ticker + '_Lag 2 Significant',
        })
        # Constant column holding the ticker for every row of this frame.
        .assign(stock=ticker)
    )
    dfs_stock.append(labelled)

In [261]:
# Spot-check one loaded frame: position 2 follows the order of `influencers`.
dfs_twitter[2]

Unnamed: 0,Date,KimKardashian_sentiment_score,KimKardashian_heuristic_score
0,2015-01-06,4.0653,37.652221
1,2015-01-07,0.9154,8.877281
2,2015-01-08,1.7150,16.294419
3,2015-01-09,4.6104,34.428163
4,2015-01-12,0.6966,4.554270
...,...,...,...
362,2017-06-01,1.3789,11.152696
363,2017-08-01,0.7215,7.021072
364,2017-09-29,0.0000,0.000000
365,2017-11-01,0.3612,4.032091


In [262]:
# Fold the per-influencer frames into one wide table: an outer join on Date
# keeps every date that appears in at least one frame.
df_twitter = reduce(
    lambda merged, frame: merged.merge(frame, how='outer', on=['Date']),
    dfs_twitter,
)

In [263]:
# Fold the per-source frames into one wide table: an outer join on Date
# keeps every date that appears in at least one frame.
df_news = reduce(
    lambda merged, frame: merged.merge(frame, how='outer', on=['Date']),
    dfs_news,
)

In [264]:
# Fold the per-stock frames into one wide table with an outer join on Date.
# NOTE(review): every frame also carries a column literally named 'stock', so
# pandas' default suffixes turn these into stock_x / stock_y / stock in the
# result. Confirm whether downstream code needs them before dropping them.
df_stock = reduce(
    lambda merged, frame: merged.merge(frame, how='outer', on=['Date']),
    dfs_stock,
)

In [265]:
# Tally how many rows share each Date in the merged stock table.
df_stock['Date'].value_counts()

2016-09-08    1
2016-08-01    1
2018-03-15    1
2017-03-15    1
2018-02-12    1
             ..
2018-08-08    1
2016-01-08    1
2018-08-29    1
2016-08-05    1
2018-12-21    1
Name: Date, Length: 1088, dtype: int64

In [266]:
# Spot-check the merged Twitter table for a single date.
df_twitter[df_twitter['Date']=='2016-07-08']

Unnamed: 0,Date,BarackObama_sentiment_score,BarackObama_heuristic_score,cnnbrk_sentiment_score,cnnbrk_heuristic_score,KimKardashian_sentiment_score,KimKardashian_heuristic_score,Trump_sentiment_score,Trump_heuristic_score
363,2016-07-08,3.6317,30.071745,-9.6273,-58.698247,1.0561,8.794363,-2.6855,-29.170788


In [267]:
# One column-name suffix per index: a space followed by the index name
# (the text before the first underscore of the file stem).
index_name = [' ' + stem.partition('_')[0] for stem in indices]

In [268]:
# Merge the per-index frames into one wide table with an outer join on Date.
#
# All index frames share the same column names, so each non-Date column is
# first tagged with its index name ("<column> <index>", e.g. 'Change VIX').
# Tagging per frame, rather than relying on the two-element `suffixes`
# argument of merge, produces the same column names and order for two indices
# and keeps working when more are added to `indices`. With `suffixes`, a third
# frame's columns would stay untagged because they no longer collide.
tagged_index_frames = []
for index_stem, index_frame in zip(indices, dfs_index):
    tag = index_stem.split('_')[0]
    tagged_index_frames.append(
        index_frame.rename(columns={
            col: col + ' ' + tag
            for col in index_frame.columns
            if col != 'Date'  # Date is the join key and must keep its name
        })
    )

df_index = reduce(
    lambda left, right: pd.merge(left, right, on=['Date'], how='outer'),
    tagged_index_frames,
)

In [269]:
# Handle Weekends

In [270]:
# The Date column of the merged stock table; the Twitter and news tables are
# later left-joined onto these dates.
predict_dates = df_stock['Date']

In [271]:
# Number of dates taken from the stock table.
predict_dates.shape

(1088,)

In [272]:
# For now the only weekend handling is to drop those rows: left-joining onto
# the stock dates keeps exactly the dates present in df_stock, and a date with
# no tweets/news simply gets missing values in the feature columns.
df_twitter = predict_dates.to_frame().merge(df_twitter, how='left', on=['Date'])
df_news = predict_dates.to_frame().merge(df_news, how='left', on=['Date'])

In [273]:
# (rows, columns) of the Twitter table after aligning it to the stock dates.
df_twitter.shape

(1088, 9)

In [274]:
# (rows, columns) of the news table after aligning it to the stock dates.
df_news.shape

(1088, 9)

In [275]:
# (rows, columns) of the merged index table.
df_index.shape

(1088, 17)

In [276]:
# (rows, columns) of the merged stock table.
df_stock.shape

(1088, 10)

In [277]:
# Write each feature table to output_dir as CSV, without the row index.
for csv_name, feature_table in (
    ('twitter_features.csv', df_twitter),
    ('news_features.csv', df_news),
    ('index_features.csv', df_index),
    ('stock_features.csv', df_stock),
):
    feature_table.to_csv(output_dir + csv_name, index=False)