In [1]:
import os
from functools import reduce

import pandas as pd

In [2]:
# Input folders: tweet-sentiment CSVs, news-sentiment CSVs and stock CSVs.
twitter_dir, news_dir, stock_dir = (
    'data/bucket/twitter/',
    'data/bucket/news/',
    'data/out/',
)
# Folder the merged feature CSVs are written to.
output_dir = 'data/features/'

In [3]:
# Base names (without the '.csv' extension) of the input files to load.
# The text before the first '_' is later used as the per-source column prefix.
influencers = [
    'BarackObama_sentiment',
    'cnnbrk_sentiment',
    'KimKardashian_sentiment',
]
sources = [
    'New York Times_sentiment',
    'CNN_sentiment',
    'Washington Post_sentiment',
]
stocks = [
    'VIX_Predict',
    'SPY_Predict',
]

In [4]:
# Columns kept from each tweet CSV; 'Date' is the merge key.
twitter_important_columns = [
    'Date',
    'sentiment_score',
    'heuristic_score',
]
# Columns kept from each stock CSV; 'Date' is the merge key.
stock_important_columns = [
    'Date',
    'Change',
    'Lag 2 Change',
    'Lag 2 Significant',
]

In [5]:
# Analysis window. Both bounds are used with strict comparisons below,
# so rows dated exactly on either bound are excluded.
first_date, last_date = map(pd.to_datetime, ('1/1/2015', '1/1/2017'))

In [6]:
# Load one frame per Twitter account, restricted to the analysis window and
# to the columns of interest, with score columns prefixed by the account name
# so the frames can later be merged side by side on 'Date'.
dfs_twitter = []
for influencer in influencers:
    influencer_name = influencer.split('_')[0]
    df = pd.read_csv(f'{twitter_dir}{influencer}.csv', parse_dates=['Date'])
    # Strictly inside (first_date, last_date): both bounds are excluded.
    in_window = (df['Date'] > first_date) & (df['Date'] < last_date)
    df = df.loc[in_window, twitter_important_columns]
    df = df.rename(columns={
        'sentiment_score': influencer_name + '_sentiment_score',
        'heuristic_score': influencer_name + '_heuristic_score',
    })
    dfs_twitter.append(df)

In [7]:
# Load one frame per news outlet, restricted to the analysis window, with the
# sentiment column prefixed by the outlet name.
# NOTE(review): unlike the Twitter and stock cells, no column subset is taken
# here, so every column in the news CSVs is carried into the merge. Confirm
# that the files only contain 'Date' and 'sentiment_score', or add a subset.
dfs_news = []
for source in sources:
    source_name = source.split('_')[0]
    df = pd.read_csv(f'{news_dir}{source}.csv', parse_dates=['Date'])
    # Strictly inside (first_date, last_date): both bounds are excluded.
    in_window = (df['Date'] > first_date) & (df['Date'] < last_date)
    df = df.loc[in_window].rename(
        columns={'sentiment_score': source_name + '_sentiment_score'}
    )
    dfs_news.append(df)

In [8]:
# Load one frame per stock, restricted to the analysis window and to the
# columns of interest.
dfs_stock = []
for stock in stocks:
    df = pd.read_csv(f'{stock_dir}{stock}.csv', parse_dates=['Date'])
    # Reset the time-of-day to midnight so dates compare and merge per day.
    df['Date'] = df['Date'].dt.normalize()
    # Strictly inside (first_date, last_date): both bounds are excluded.
    in_window = (df['Date'] > first_date) & (df['Date'] < last_date)
    df = df.loc[in_window, stock_important_columns]
    dfs_stock.append(df)

In [9]:
# Fold the per-account frames into one wide frame, outer-joined on 'Date' so
# a date present for any account is kept (missing scores become NaN).
df_twitter = reduce(
    lambda merged, nxt: pd.merge(merged, nxt, how='outer', on=['Date']),
    dfs_twitter,
)

In [10]:
# Fold the per-outlet frames into one wide frame, outer-joined on 'Date' so
# a date present for any outlet is kept (missing scores become NaN).
df_news = reduce(
    lambda merged, nxt: pd.merge(merged, nxt, how='outer', on=['Date']),
    dfs_news,
)

In [11]:
# Column suffix per stock: a space followed by the text before the first '_'
# of the file base name (e.g. 'VIX_Predict' -> ' VIX').
stocks_name = [' ' + stock.partition('_')[0] for stock in stocks]

In [12]:
# Combine the per-stock frames into one wide frame keyed on 'Date'.
#
# Every stock frame carries the same column names, so each frame's non-key
# columns are tagged with that stock's suffix *before* merging. Passing the
# whole `stocks_name` list as `suffixes=` only works when there are exactly
# two stocks, because pandas expects a (left, right) pair; tagging up front
# yields the same column names for two stocks and keeps working for any
# number of them.
dfs_stock_tagged = [
    stock_df.rename(
        columns={col: col + suffix for col in stock_df.columns if col != 'Date'}
    )
    for stock_df, suffix in zip(dfs_stock, stocks_name)
]
df_stock = reduce(
    lambda merged, nxt: pd.merge(merged, nxt, on=['Date'], how='outer'),
    dfs_stock_tagged,
)

In [13]:
# Handle weekends: align the tweet and news rows to the dates present in the stock data

In [14]:
# Dates that have stock data; used as the left side of the merges that align
# the tweet and news features to those dates.
predict_dates = df_stock.loc[:, 'Date']

In [15]:
# For now, dates without stock data (e.g. weekends) are simply dropped:
# left-joining onto the stock dates keeps exactly one side's dates and leaves
# NaN where no tweet/news row exists for a stock date.
stock_dates = predict_dates.to_frame()
df_twitter = stock_dates.merge(df_twitter, how='left', on=["Date"])
df_news = stock_dates.merge(df_news, how='left', on=["Date"])

In [16]:
# Write the three feature tables as CSV files (without the index column).
# `to_csv` does not create missing parent directories, so make sure the
# output folder exists first; this is a no-op when it is already there.
os.makedirs(output_dir, exist_ok=True)
df_twitter.to_csv(output_dir+'twitter_features.csv', index=False)
df_news.to_csv(output_dir+'news_features.csv', index=False)
df_stock.to_csv(output_dir+'stock_features.csv', index=False)