In [18]:
import numpy as np
import pandas as pd
import datetime

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# 2.1 Retrieve Reddit World News

In [2]:
# Load the raw Reddit World News headlines from the local CSV.
news_data = pd.read_csv(filepath_or_buffer='./data/RedditNews.csv', encoding='utf-8')
news_data.head()
# The final expression is the one the notebook displays: (rows, columns).
news_data.shape

(73608, 2)

### Strip byte-string markers and newline/tab characters from the news data, and keep only news dated after 2010-01-04

In [3]:
# Use a lower-case `date` column name so the news table can be joined on the
# same key as the stock tables.
news_data = news_data.rename(columns={'Date': 'date'})
# The dates are YYYY-MM-DD strings, so a plain string comparison orders them chronologically.
# NOTE(review): the strict `>` also drops 2010-01-04 itself - confirm that is intended.
news_data = news_data[news_data['date'] > '2010-01-04']
# Remove the bytes-literal prefix (b' or b") only where it starts a value. The
# previous unanchored patterns also matched inside ordinary text (for example
# the "b'" in "Arab's") and silently corrupted headlines. Newlines, tabs and
# double quotes are still removed everywhere, as before.
news_data = news_data.replace({"^b'": "", '^b"': '', '\n': '', '\t': '', '"': ''}, regex=True)

## 2.1.1 Combine news articles by date

In [4]:
# Collapse all headlines that share a date into one comma-separated string,
# leaving one row per date.
news_data = (
    news_data
    .groupby(['date'])['News']
    .apply(', '.join)
    .reset_index()
)
news_data.head()

Unnamed: 0,date,News
0,2010-01-05,These images depict the untouched stomach cont...
1,2010-01-06,Three Americans go to Uganda and teach thousan...
2,2010-01-07,23-year-old British woman on holiday in Dubai ...
3,2010-01-08,Top Imams affiliated with the Islamic Supreme ...
4,2010-01-09,"Chevron could face damages of $27.3bn, the big..."


# 2.2 Retrieve and Combine Stocks and Securities

In [5]:
# Load the split-adjusted daily prices (one row per date and ticker symbol).
pdata = pd.read_csv(
    filepath_or_buffer='./data/nyse/prices-split-adjusted.csv',
    encoding='utf-8',
)
pdata.head()

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05,WLTW,123.43,125.839996,122.309998,126.25,2163600.0
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0
3,2016-01-08,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0


In [7]:
# Load the securities reference table and expose the ticker under the name
# `symbol`, the column the price table uses, so the two can be joined.
secdata = (
    pd.read_csv('./data/nyse/securities.csv', encoding='utf-8')
    .rename(columns={'Ticker symbol': 'symbol'})
)
secdata.head()

Unnamed: 0,symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,66740
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152
3,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373
4,ATVI,Activision Blizzard,reports,Information Technology,Home Entertainment Software,"Santa Monica, California",2015-08-31,718877


In [116]:
# Attach company name and GICS classification to every price row, matching on symbol.
stock_data = pdata.merge(
    secdata[['symbol','Security','GICS Sector', 'GICS Sub Industry']],
    on='symbol',
)
# Intraday move as a percentage of the opening price.
stock_data['diff'] = (
    stock_data['close']
    .sub(stock_data['open'])
    .div(stock_data['open'])
    .mul(100)
)
stock_data.head()

Unnamed: 0,date,symbol,open,close,low,high,volume,Security,GICS Sector,GICS Sub Industry,diff
0,2016-01-05,WLTW,123.43,125.839996,122.309998,126.25,2163600.0,Willis Towers Watson,Financials,Insurance Brokers,1.95252
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0,Willis Towers Watson,Financials,Insurance Brokers,-4.199932
2,2016-01-07,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0,Willis Towers Watson,Financials,Insurance Brokers,-1.228733
3,2016-01-08,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0,Willis Towers Watson,Financials,Insurance Brokers,0.987184
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0,Willis Towers Watson,Financials,Insurance Brokers,-1.743442


### Evaluate the data and find out how many stocks are in the Energy sector

In [9]:
# Summary statistics for the Energy-sector rows on a single trading day; the
# `count` row shows how many Energy stocks have a price on that date.
stock_data.loc[
    stock_data['date'].eq('2016-01-05') & stock_data['GICS Sector'].eq('Energy')
].describe()

Unnamed: 0,open,close,low,high,volume
count,36.0,36.0,36.0,36.0,36.0
mean,47.807778,47.86,46.929166,48.365556,7044572.0
std,29.533748,29.666613,29.209861,29.905739,6586727.0
min,4.89,5.01,4.84,5.09,1448800.0
25%,25.6025,25.595,24.917499,25.817501,2777550.0
50%,45.084999,44.724998,43.85,45.774999,5129350.0
75%,69.519997,69.657499,68.209997,70.014997,8344725.0
max,124.349998,125.169998,122.839996,126.089996,29113900.0


#### Display 10 random stocks in the Energy sector to observe whether there is correlation

In [32]:
# Distinct sub-industries that appear within the Energy sector.
stock_data.loc[stock_data['GICS Sector'] == 'Energy', 'GICS Sub Industry'].unique()

array(['Oil & Gas Exploration & Production',
       'Oil & Gas Equipment & Services', 'Integrated Oil & Gas',
       'Oil & Gas Drilling',
       'Oil & Gas Refining & Marketing & Transportation'], dtype=object)

In [31]:
# Plot the closing-price history of up to ten distinct, randomly chosen Energy tickers.
energy_symbols = stock_data.loc[stock_data['GICS Sector'] == 'Energy', 'symbol'].unique()
plt_data = []
# np.random.choice samples WITH replacement by default, which could draw the
# same ticker more than once; replace=False guarantees distinct symbols. The
# size is capped so the draw cannot fail if fewer than ten symbols exist.
for stock in np.random.choice(energy_symbols, min(10, len(energy_symbols)), replace=False):
    energy_df = stock_data[stock_data['symbol'] == stock]

    # One line trace per ticker: date on x, closing price on y.
    plt_data.append(go.Scatter(
        x = energy_df['date'].values,
        y = energy_df['close'].values,
        name = stock
    ))

# The x values are daily dates, so the axis is labelled 'Date' rather than 'Month'.
layout = go.Layout(dict(title = 'Closing prices of 10 energy stocks',
                       xaxis = dict(title = 'Date'),
                       yaxis = dict(title = 'Price'),
                       ), legend = dict(orientation = 'h'))
py.iplot(dict(data=plt_data, layout=layout), filename='basic-line')

## 2.3 Group by sector and date

In [95]:
# Aggregate per (sector, date): how many securities traded, and the summed
# open/close prices. The sums are turned into per-security averages in a
# later cell.
# Aggregations are named by string: passing the Python builtin `sum` to
# `.agg` is deprecated in recent pandas releases, whereas "sum" always maps
# to the optimised groupby sum.
grp_data = stock_data.groupby(['GICS Sector', 'date']).agg({
    'Security': "count",
    "open": "sum",
    "close": "sum"
})
grp_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Security,open,close
GICS Sector,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Consumer Discretionary,2010-01-04,77,2672.473763,2671.436019
Consumer Discretionary,2010-01-05,78,2705.571271,2729.757742
Consumer Discretionary,2010-01-06,78,2724.797536,2721.494277
Consumer Discretionary,2010-01-07,78,2727.953311,2740.721386
Consumer Discretionary,2010-01-08,78,2733.650218,2750.357443


In [96]:
# Peek at the Energy rows; selecting the first index level leaves `date` as the index.
grp_data.loc['Energy'].head()

Unnamed: 0_level_0,Security,open,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-04,33,1525.897759,1551.516528
2010-01-05,33,1551.832163,1566.911977
2010-01-06,33,1565.173122,1584.721043
2010-01-07,33,1577.543497,1579.518125
2010-01-08,33,1572.397667,1594.940442


### Similar to the DJIA index methodology, take the average price of the stocks in each sector

In [97]:
# `open` and `close` currently hold per-sector sums (from the aggregation
# cell); dividing by the number of securities turns them into averages.
# Caution: this overwrites the columns, so running the cell a second time
# would divide by the count again.
grp_data['open'] = grp_data['open'].div(grp_data['Security'])
grp_data['close'] = grp_data['close'].div(grp_data['Security'])

In [98]:
# Open-to-close change of the sector average, as a percentage of the open.
grp_data['Daily Return'] = (
    grp_data['close']
    .sub(grp_data['open'])
    .div(grp_data['open'])
    .mul(100)
)

### 2.3.1 Plot the data to observe when the stock price had the greatest change

In [99]:
# For every date, measure the spread (std) and the worst value (min) of the
# daily return across sectors.
grouped = (
    grp_data
    .groupby('date')[['Daily Return']]
    .agg(['std', 'min'])
    .reset_index()
)
grouped.head()

Unnamed: 0_level_0,date,Daily Return,Daily Return
Unnamed: 0_level_1,Unnamed: 1_level_1,std,min
0,2010-01-04,0.735353,-1.046991
1,2010-01-05,0.662124,-1.109534
2,2010-01-06,0.880933,-2.055354
3,2010-01-07,0.734142,-0.806164
4,2010-01-08,0.744911,-1.028062


In [100]:
# Keep the ten dates with the largest cross-sector standard deviation.
# `.copy()` makes `g` an independent frame, so adding the `text` column below
# does not trigger pandas' SettingWithCopyWarning on a slice of `grouped`.
g = grouped.sort_values(('Daily Return', 'std'), ascending=False).head(10).copy()
# Hover text: the minimum return is negated so a fall is reported as a positive "drop".
# Tuple keys address the two-level columns directly instead of chained indexing.
g['text'] = 'Maximum price drop: ' + (-1 * g[('Daily Return', 'min')]).round(2).astype(str)
g[('Daily Return', 'std')].values

array([1.92437035, 1.8775909 , 1.52651751, 1.51052325, 1.50756238,
       1.49134958, 1.45367793, 1.40196412, 1.38990074, 1.35374364])

In [101]:
# Bubble chart of the ten selected dates: the y position, marker size and
# marker colour all encode the cross-sector standard deviation of the daily return.
plt_data = go.Scatter(
    x = g['date'].values,
    y = g['Daily Return']['std'].values,
    mode='markers',
    marker=dict(
        # Scale the std up so the markers are large enough to see.
        size = 20 * g['Daily Return']['std'].values,
        color = g['Daily Return']['std'].values,
        colorscale='Portland',
        showscale=True
    ),
    # Shown on hover (see hovermode below).
    text = g['text'].values
)
data = [plt_data]

# Each point is a single trading date, so the title says "days" (it previously
# said "months", which does not match the data).
layout = go.Layout(autosize=True,
                  title = 'Top 10 days by standard deviation of price change within a day',
                  hovermode='closest',
                  yaxis=dict(title='Daily Return', ticklen=5, gridwidth=2),
                  showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stddev')

### 2.3.2 Observe the data for 1 specific sector, such as Energy

In [102]:
# Energy rows again, now holding the averaged prices and the daily return.
grp_data.loc['Energy'].head()

Unnamed: 0_level_0,Security,open,close,Daily Return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,33,46.239326,47.015652,1.678931
2010-01-05,33,47.025217,47.482181,0.971743
2010-01-06,33,47.429489,48.02185,1.24893
2010-01-07,33,47.804348,47.864186,0.125171
2010-01-08,33,47.648414,48.331529,1.433656


In [123]:
# Within the Energy sector only: per date, the spread (std) and the largest
# value (max) of the individual stocks' intraday percentage change.
grouped = (
    stock_data.loc[stock_data['GICS Sector'] == 'Energy']
    .groupby('date')[['diff']]
    .agg(['std', 'max'])
    .reset_index()
)
grouped.head()

Unnamed: 0_level_0,date,diff,diff
Unnamed: 0_level_1,Unnamed: 1_level_1,std,max
0,2010-01-04,1.118377,4.664311
1,2010-01-05,1.530731,5.810902
2,2010-01-06,1.448154,5.695183
3,2010-01-07,0.853619,2.171178
4,2010-01-08,1.165558,3.550828


In [127]:
# Keep the ten dates on which Energy stocks diverged the most (largest std of `diff`).
# `.copy()` makes `g` an independent frame, so adding the `text` column below
# does not trigger pandas' SettingWithCopyWarning on a slice of `grouped`.
g = grouped.sort_values(('diff', 'std'), ascending=False).head(10).copy()
# `grouped` carries the per-date MAXIMUM of `diff`, i.e. the largest intraday
# gain. The hover text previously negated that value and called it a "price
# drop", which mislabelled the data; it now reports the gain as it is.
g['text'] = 'Maximum price gain: ' + g[('diff', 'max')].round(2).astype(str)
g[('diff', 'max')].values

array([ 7.76212495, 36.28912817, 26.70623145, 29.77099237, 21.78217822,
       11.09018391, 18.47975505, 25.        , 16.73565937, 16.60550459])

In [132]:
# Bubble chart of the ten selected dates for the Energy sector: the y position
# is the standard deviation of the stocks' intraday change, while marker size
# and colour encode the largest intraday change on that date.
plt_data = go.Scatter(
    x = g['date'].values,
    y = g['diff']['std'].values,
    mode='markers',
    marker=dict(
        # Scale the maximum change up so the markers are large enough to see.
        size = 2*g['diff']['max'].values,
        color = g['diff']['max'].values,
        colorscale='Portland',
        showscale=True
    ),
    # Shown on hover (see hovermode below).
    text = g['text'].values
)
data = [plt_data]

# Each point is a single trading date and the data is Energy-only, so the
# title says so (it previously said "months" and did not name the sector).
layout = go.Layout(autosize=True,
                  title = 'Top 10 days by standard deviation of Energy price change within a day',
                  hovermode='closest',
                  yaxis=dict(title='Deviation in price', ticklen=5, gridwidth=2),
                  showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stddev')

## 2.4 Calculate Daily Returns and Standard Deviations

### Calculate the means and standard deviations by sector

In [134]:
# Per-sector mean, standard deviation and minimum of the daily return over all dates.
sigma = (
    grp_data
    .groupby('GICS Sector')[['Daily Return']]
    .agg(['mean', 'std', 'min'])
)
sigma

Unnamed: 0_level_0,Daily Return,Daily Return,Daily Return
Unnamed: 0_level_1,mean,std,min
GICS Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Consumer Discretionary,0.036876,0.929677,-4.206013
Consumer Staples,0.051984,0.610403,-3.363354
Energy,0.013425,1.295804,-5.706094
Financials,0.03234,1.005465,-6.352469
Health Care,0.043555,0.877931,-3.80571
Industrials,0.046591,0.883032,-4.398844
Information Technology,0.02312,0.913375,-4.560408
Materials,0.029144,0.947435,-4.702083
Real Estate,0.043639,0.997881,-4.673736
Telecommunications Services,-0.005438,0.85352,-5.337575


### Generate the 1 and 2 sigma lower and upper limits

In [135]:
# Band limits per sector: mean -/+ one and two standard deviations. The mean
# and std are addressed by their column labels, which are the first and
# second columns of `sigma`.
sigma['1sigmalow'] = sigma[('Daily Return', 'mean')] - sigma[('Daily Return', 'std')]
sigma['1sigmahi'] = sigma[('Daily Return', 'mean')] + sigma[('Daily Return', 'std')]
sigma['2sigmalow'] = sigma[('Daily Return', 'mean')] - (2 * sigma[('Daily Return', 'std')])
sigma['2sigmahi'] = sigma[('Daily Return', 'mean')] + (2 * sigma[('Daily Return', 'std')])
sigma

Unnamed: 0_level_0,Daily Return,Daily Return,Daily Return,1sigmalow,1sigmahi,2sigmalow,2sigmahi
Unnamed: 0_level_1,mean,std,min,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GICS Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Consumer Discretionary,0.036876,0.929677,-4.206013,-0.892801,0.966552,-1.822478,1.896229
Consumer Staples,0.051984,0.610403,-3.363354,-0.558419,0.662387,-1.168822,1.272791
Energy,0.013425,1.295804,-5.706094,-1.282379,1.309229,-2.578183,2.605033
Financials,0.03234,1.005465,-6.352469,-0.973125,1.037805,-1.978589,2.043269
Health Care,0.043555,0.877931,-3.80571,-0.834376,0.921485,-1.712307,1.799416
Industrials,0.046591,0.883032,-4.398844,-0.836441,0.929623,-1.719473,1.812655
Information Technology,0.02312,0.913375,-4.560408,-0.890255,0.936495,-1.803631,1.849871
Materials,0.029144,0.947435,-4.702083,-0.918291,0.976578,-1.865725,1.924013
Real Estate,0.043639,0.997881,-4.673736,-0.954241,1.04152,-1.952122,2.0394
Telecommunications Services,-0.005438,0.85352,-5.337575,-0.858958,0.848081,-1.712478,1.701601


## 2.5 Generate Labels based on whether sigma threshold passed

### Setup labels for 1 and 2 sigma threshold of prices.

In [201]:
def _band_value(limits, column):
    """Return the scalar stored under `column` in one sector's row of `sigma`."""
    # `sigma` has two-level columns, so selecting a band by its top-level name
    # can yield a one-element Series rather than a scalar. Flattening to an
    # array and taking the first element handles both shapes, and avoids the
    # deprecated positional `series[0]` lookup on a label-indexed Series.
    return np.asarray(limits[column]).ravel()[0]


def _band_label(value, low, high):
    """Return -1 if `value` is below `low`, 1 if it is above `high`, otherwise 0."""
    if value < low:
        return -1
    if value > high:
        return 1
    return 0


def onesigma(row):
    """Label one (sector, date) row by where its daily return falls in the sector's bands.

    Intended for ``DataFrame.apply(onesigma, axis=1)`` on a frame indexed by
    (GICS Sector, date). Reads the module-level ``sigma`` table for the
    sector's 1-sigma and 2-sigma limits.

    Parameters
    ----------
    row : pandas.Series
        A row whose ``name`` is a tuple starting with the sector and which
        contains a ``'Daily Return'`` entry.

    Returns
    -------
    pandas.Series
        A copy of ``row`` with ``'Label1Sig'`` and ``'Label2Sig'`` set to
        -1 (below the lower limit), 1 (above the upper limit) or 0 (inside
        the band, or when the return is NaN so neither comparison holds).
    """
    # The sector is the first element of the row's index tuple; look its
    # limits up once instead of once per comparison.
    limits = sigma.xs(row.name[0])
    daily_return = row['Daily Return']

    # Work on a copy: mutating the object handed over by `apply` is not
    # supported by pandas and can produce unexpected results.
    labelled = row.copy()
    labelled['Label1Sig'] = _band_label(
        daily_return, _band_value(limits, '1sigmalow'), _band_value(limits, '1sigmahi'))
    labelled['Label2Sig'] = _band_label(
        daily_return, _band_value(limits, '2sigmalow'), _band_value(limits, '2sigmahi'))
    return labelled


In [163]:
# Create both label columns with a neutral default of 0 before labelling.
grp_data['Label1Sig'], grp_data['Label2Sig'] = 0, 0

In [202]:
# Label every (sector, date) row; `onesigma` receives one row at a time.
new_df = grp_data.apply(onesigma, axis='columns')

In [205]:
# Spot-check the labels for the Energy sector.
new_df.loc['Energy'].head()

Unnamed: 0_level_0,Security,open,close,Daily Return,Label1Sig,Label2Sig
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,33.0,46.239326,47.015652,1.678931,1.0,0.0
2010-01-05,33.0,47.025217,47.482181,0.971743,0.0,0.0
2010-01-06,33.0,47.429489,48.02185,1.24893,0.0,0.0
2010-01-07,33.0,47.804348,47.864186,0.125171,0.0,0.0
2010-01-08,33.0,47.648414,48.331529,1.433656,1.0,0.0


In [206]:
# Energy rows ordered by daily return, largest first, to check the positive labels.
new_df.loc['Energy'].sort_values('Daily Return', ascending=False).head()

Unnamed: 0_level_0,Security,open,close,Daily Return,Label1Sig,Label2Sig
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-02,36.0,49.1225,51.973611,5.804083,1.0,1.0
2011-10-04,33.0,42.63405,45.061645,5.69403,1.0,1.0
2014-10-16,36.0,62.619371,65.776134,5.041193,1.0,1.0
2010-05-21,33.0,40.388387,42.200942,4.487812,1.0,1.0
2014-12-17,36.0,57.999861,60.550138,4.397041,1.0,1.0


In [212]:
# Flatten the (sector, date) index into columns and show only the join key,
# the sector and the two labels.
new_df.reset_index().loc[:, ['date','GICS Sector', 'Label1Sig', 'Label2Sig']]

Unnamed: 0,date,GICS Sector,Label1Sig,Label2Sig
0,2010-01-04,Consumer Discretionary,0.0,0.0
1,2010-01-05,Consumer Discretionary,0.0,0.0
2,2010-01-06,Consumer Discretionary,0.0,0.0
3,2010-01-07,Consumer Discretionary,0.0,0.0
4,2010-01-08,Consumer Discretionary,0.0,0.0
5,2010-01-11,Consumer Discretionary,0.0,0.0
6,2010-01-12,Consumer Discretionary,0.0,0.0
7,2010-01-13,Consumer Discretionary,0.0,0.0
8,2010-01-14,Consumer Discretionary,0.0,0.0
9,2010-01-15,Consumer Discretionary,-1.0,0.0


In [216]:
# Pair each day's combined headlines with every sector's labels for that date
# (one output row per date and sector).
combined_df = news_data.merge(
    new_df.reset_index()[['date','GICS Sector', 'Label1Sig', 'Label2Sig']],
    on='date',
)
combined_df.head()

Unnamed: 0,date,News,GICS Sector,Label1Sig,Label2Sig
0,2010-01-05,These images depict the untouched stomach cont...,Consumer Discretionary,0.0,0.0
1,2010-01-05,These images depict the untouched stomach cont...,Consumer Staples,0.0,0.0
2,2010-01-05,These images depict the untouched stomach cont...,Energy,0.0,0.0
3,2010-01-05,These images depict the untouched stomach cont...,Financials,1.0,0.0
4,2010-01-05,These images depict the untouched stomach cont...,Health Care,0.0,0.0


In [217]:
# Persist the labelled news/sector table without the row index.
# Forward slashes are used, as in the notebook's read paths, because they
# resolve on every OS; the previous backslash path only worked on Windows
# (elsewhere it is taken as a single literal file name).
combined_df.to_csv('./cache/data.csv', index=False, header=True)

#### Visualize the classification counts for the 1-sigma and 2-sigma thresholds by sector

In [244]:
# Count rows per (sector, label value) for the 1-sigma labels, naming the
# label column `action` and the count `onesigma`.
sig1df = (
    combined_df
    .groupby(['GICS Sector','Label1Sig'], as_index=False)['date']
    .count()
    .rename(columns={"Label1Sig": "action", "date": "onesigma"})
)
# Same for the 2-sigma labels, with the count named `twosigma`.
sig2df = (
    combined_df
    .groupby(['GICS Sector','Label2Sig'], as_index=False)['date']
    .count()
    .rename(columns={"Label2Sig": "action", "date": "twosigma"})
)
# Line the two counts up side by side per sector and label value.
sigdf = sig1df.merge(sig2df, on=['GICS Sector', 'action'])
sigdf

Unnamed: 0,GICS Sector,action,onesigma,twosigma
0,Consumer Discretionary,-1.0,228,58
1,Consumer Discretionary,0.0,1204,1541
2,Consumer Discretionary,1.0,203,36
3,Consumer Staples,-1.0,206,51
4,Consumer Staples,0.0,1223,1547
5,Consumer Staples,1.0,206,37
6,Energy,-1.0,228,49
7,Energy,0.0,1202,1544
8,Energy,1.0,205,42
9,Financials,-1.0,210,58


In [259]:
# Reshape to one row per sector, with a column for every
# (threshold, label value) combination.
pivdf = pd.pivot(
    sigdf,
    index='GICS Sector',
    columns='action',
    values=['onesigma', 'twosigma'],
)
pivdf

Unnamed: 0_level_0,onesigma,onesigma,onesigma,twosigma,twosigma,twosigma
action,-1.0,0.0,1.0,-1.0,0.0,1.0
GICS Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Consumer Discretionary,228,1204,203,58,1541,36
Consumer Staples,206,1223,206,51,1547,37
Energy,228,1202,205,49,1544,42
Financials,210,1237,188,58,1544,33
Health Care,223,1195,217,59,1545,31
Industrials,209,1221,205,57,1540,38
Information Technology,232,1177,226,64,1539,32
Materials,218,1219,198,56,1544,35
Real Estate,194,1244,197,54,1547,34
Telecommunications Services,229,1172,234,45,1558,32
