In [1]:
import pandas as pd
from functools import reduce

In [2]:
# Input folders: per-account tweet sentiment, per-outlet news sentiment, and
# the per-ticker / per-index price files.
twitter_dir, news_dir = 'data/bucket/twitter/', 'data/bucket/news/'
stock_dir = 'data/out/'

# Folder the feature CSVs are written to at the end of the notebook.
output_dir = 'data/features/'

In [97]:
# Each entry is a CSV file stem (the loaders append '.csv'); the text before
# the first underscore is used as the short name of the account/outlet/ticker.

# Tweet-sentiment files, read from twitter_dir.
influencers = [
    'BarackObama_sentiment',
    'cnnbrk_sentiment',
    'KimKardashian_sentiment',
]

# News-sentiment files, read from news_dir.
sources = [
    'New York Times_sentiment',
    'CNN_sentiment',
    'Washington Post_sentiment',
]

# Index and single-stock files, both read from stock_dir.
indices = [
    'VIX_Predict',
    'SPY_Predict',
]
stocks = [
    'AAPL_Predict',
    'GOOG_Predict',
    'MSFT_Predict',
]

In [4]:
# Columns kept from every tweet-sentiment file.
twitter_important_columns = [
    'Date',
    'sentiment_score',
    'heuristic_score',
]

# Columns kept from every index / stock file.
stock_important_columns = [
    'Date',
    'Change',
    'Lag 2 Change',
    'Lag 2 Significant',
]

In [5]:
# Analysis window. The loaders compare with strict '<' / '>', so rows dated
# exactly on either bound are excluded.
first_date, last_date = (
    pd.to_datetime(bound) for bound in ('1/1/2015', '1/1/2017')
)

In [6]:
# Build one frame per influencer: read its CSV, keep rows strictly inside the
# (first_date, last_date) window, keep only the selected columns, and prefix
# the two score columns with the account name so the frames can be merged
# side by side on 'Date'.
dfs_twitter = []
for influencer in influencers:
    handle = influencer.split('_')[0]  # text before the first underscore
    tweets = pd.read_csv(twitter_dir + influencer + '.csv', parse_dates=['Date'])
    in_window = (tweets['Date'] > first_date) & (tweets['Date'] < last_date)
    tweets = tweets.loc[in_window, twitter_important_columns].rename(
        columns={
            'sentiment_score': handle + '_sentiment_score',
            'heuristic_score': handle + '_heuristic_score',
        }
    )
    dfs_twitter.append(tweets)

In [7]:
# Build one frame per news outlet: read its CSV, keep rows strictly inside the
# (first_date, last_date) window, and prefix 'sentiment_score' with the outlet
# name. Unlike the Twitter loader, every column of the file is kept.
dfs_news = []
for source in sources:
    outlet = source.split('_')[0]  # text before the first underscore
    articles = pd.read_csv(news_dir + source + '.csv', parse_dates=['Date'])
    in_window = (articles['Date'] > first_date) & (articles['Date'] < last_date)
    articles = articles.loc[in_window].rename(
        columns={'sentiment_score': outlet + '_sentiment_score'}
    )
    dfs_news.append(articles)

In [98]:
# Build one frame per market index: read its CSV, truncate 'Date' to midnight,
# keep rows strictly inside the (first_date, last_date) window, and keep only
# the selected stock columns.
dfs_index = []
for index in indices:
    quotes = pd.read_csv(stock_dir + index + '.csv', parse_dates=['Date'])
    # Drop the time-of-day component so dates compare equal across files.
    quotes = quotes.assign(Date=pd.to_datetime(quotes['Date'].dt.normalize()))
    in_window = (quotes['Date'] > first_date) & (quotes['Date'] < last_date)
    dfs_index.append(quotes.loc[in_window, stock_important_columns])

In [101]:
# Build one frame per stock: read its CSV, truncate 'Date' to midnight, keep
# rows strictly inside the (first_date, last_date) window, keep only the
# selected stock columns, and tag every row with the ticker so the frames can
# be stacked into one long table.
dfs_stock = []
for stock in stocks:
    stock_name = stock.split('_')[0]  # text before the first underscore
    df = pd.read_csv(stock_dir+stock+'.csv', parse_dates=['Date'])
    # Drop the time-of-day component so dates compare equal across files.
    df['Date'] = pd.to_datetime(df['Date'].dt.normalize())
    df = df[df['Date']<last_date]
    df = df[df['Date']>first_date]
    # Add the ticker with .assign(), which returns a new frame, instead of
    # setting a column on a filtered/column-sliced frame; the latter triggers
    # pandas' SettingWithCopyWarning. The resulting columns are the same.
    df = df[stock_important_columns].assign(stock=stock_name)
    dfs_stock.append(df)

In [102]:
# Inspect the frame built for the first entry of `stocks` (AAPL with the current list).
dfs_stock[0]

Unnamed: 0,Date,Change,Lag 2 Change,Lag 2 Significant,stock
3793,2015-01-02,-0.012675,-0.043541,-1,AAPL
3794,2015-01-05,-0.027830,-0.010066,-1,AAPL
3795,2015-01-06,-0.016160,0.025249,1,AAPL
3796,2015-01-07,0.006195,0.051026,1,AAPL
3797,2015-01-08,0.018937,0.030852,1,AAPL
...,...,...,...,...,...
4292,2016-12-23,-0.006532,0.016697,1,AAPL
4293,2016-12-27,0.008046,-0.000601,0,AAPL
4294,2016-12-28,0.008582,-0.007403,0,AAPL
4295,2016-12-29,-0.009105,-0.005582,0,AAPL


In [9]:
# Fold the per-influencer frames into one wide frame keyed on 'Date'.
# The outer join keeps a date if any influencer has a row for it.
df_twitter = reduce(
    lambda merged, nxt: merged.merge(nxt, on=['Date'], how='outer'),
    dfs_twitter,
)

In [10]:
# Fold the per-outlet frames into one wide frame keyed on 'Date'.
# The outer join keeps a date if any outlet has a row for it.
df_news = reduce(
    lambda merged, nxt: merged.merge(nxt, on=['Date'], how='outer'),
    dfs_news,
)

In [106]:
# Stack the per-stock frames vertically into one long table (the 'stock'
# column tells the rows apart) and renumber the rows from 0.
df_stock = pd.concat(objs=dfs_stock, axis=0, ignore_index=True)

In [107]:
# Inspect the stacked long-format stock table (one block of rows per ticker).
df_stock

Unnamed: 0,Date,Change,Lag 2 Change,Lag 2 Significant,stock
0,2015-01-02,-0.012675,-0.043541,-1,AAPL
1,2015-01-05,-0.027830,-0.010066,-1,AAPL
2,2015-01-06,-0.016160,0.025249,1,AAPL
3,2015-01-07,0.006195,0.051026,1,AAPL
4,2015-01-08,0.018937,0.030852,1,AAPL
...,...,...,...,...,...
1507,2016-12-23,-0.006109,-0.000788,0,MSFT
1508,2016-12-27,-0.003783,-0.005537,0,MSFT
1509,2016-12-28,0.003006,-0.006940,0,MSFT
1510,2016-12-29,-0.008517,-0.001114,0,MSFT


In [104]:
# Column suffix for each index: a space followed by the text before the first
# underscore of the file stem (e.g. ' VIX').
index_name = [f" {stem.partition('_')[0]}" for stem in indices]

In [105]:
# Combine the per-index frames into one wide frame keyed on 'Date'.
#
# pd.merge's `suffixes` is a single (left, right) pair, so handing it the whole
# `index_name` list only works while there are exactly two indices. Instead,
# tag every non-'Date' column of each frame with its own index suffix before
# merging. For the current two indices the resulting columns are identical
# ('Change VIX', ..., 'Change SPY', ...), and the cell keeps working when more
# entries are added to `indices`.
dfs_index_labelled = [
    frame.rename(columns=lambda col, tag=suffix: col if col == 'Date' else col + tag)
    for frame, suffix in zip(dfs_index, index_name)
]
df_index = reduce(
    lambda left, right: pd.merge(left, right, on=['Date'], how='outer'),
    dfs_index_labelled,
)

In [13]:
# Handle weekends: restrict the Twitter and news features to dates that appear in the stock data

In [14]:
# Dates for which stock data exists. df_stock stacks one block of rows per
# ticker, so every date occurs once per stock; keep each date only once so the
# left-merges onto the Twitter/news frames produce one row per date (and stay
# the same if that cell is re-run) instead of repeating every row per ticker.
predict_dates = (
    df_stock['Date']
    .drop_duplicates()
    .sort_values()
    .reset_index(drop=True)
)

In [15]:
# For now, only drop the weekends: left-join onto the stock dates so that
# tweet/news rows on dates without stock data are discarded, and stock dates
# without tweets/news are kept with missing values.
df_twitter = pd.merge(left=predict_dates, right=df_twitter, how='left', on=["Date"])
df_news = pd.merge(left=predict_dates, right=df_news, how='left', on=["Date"])

In [108]:
# Write each feature table to output_dir as CSV, without the row index.
for filename, frame in (
    ('twitter_features.csv', df_twitter),
    ('news_features.csv', df_news),
    ('index_features.csv', df_index),
    ('stock_features.csv', df_stock),
):
    frame.to_csv(output_dir + filename, index=False)