In [1]:
import numpy as np
import pandas as pd
import base64
import datetime

import spacy
import nltk
from nltk.tokenize.toktok import ToktokTokenizer

import re
from bs4 import BeautifulSoup
import unicodedata

import matplotlib.pyplot as plt
import seaborn as sns

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

%matplotlib inline

# 2.1 Retrieve and Combine Stocks and Securities

In [2]:
pdata = pd.read_csv('./data/nyse/prices-split-adjusted.csv', encoding='utf-8')
pdata.head()

Unnamed: 0,date,symbol,open,close,low,high,volume
0,2016-01-05,WLTW,123.43,125.839996,122.309998,126.25,2163600.0
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0
2,2016-01-07,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0
3,2016-01-08,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0


In [3]:
secdata = pd.read_csv('./data/nyse/securities.csv', encoding='utf-8')
# rename column to allow join
secdata.rename(columns={'Ticker symbol': 'symbol'}, inplace=True)
secdata.head()

Unnamed: 0,symbol,Security,SEC filings,GICS Sector,GICS Sub Industry,Address of Headquarters,Date first added,CIK
0,MMM,3M Company,reports,Industrials,Industrial Conglomerates,"St. Paul, Minnesota",,66740
1,ABT,Abbott Laboratories,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800
2,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152
3,ACN,Accenture plc,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373
4,ATVI,Activision Blizzard,reports,Information Technology,Home Entertainment Software,"Santa Monica, California",2015-08-31,718877


In [4]:
stock_data = pd.merge(pdata, secdata[['symbol','Security','GICS Sector', 'GICS Sub Industry']], on='symbol')
stock_data['diff']=(stock_data['close']-stock_data['open'])/stock_data['open']*100
stock_data.head()

Unnamed: 0,date,symbol,open,close,low,high,volume,Security,GICS Sector,GICS Sub Industry,diff
0,2016-01-05,WLTW,123.43,125.839996,122.309998,126.25,2163600.0,Willis Towers Watson,Financials,Insurance Brokers,1.95252
1,2016-01-06,WLTW,125.239998,119.980003,119.940002,125.540001,2386400.0,Willis Towers Watson,Financials,Insurance Brokers,-4.199932
2,2016-01-07,WLTW,116.379997,114.949997,114.93,119.739998,2489500.0,Willis Towers Watson,Financials,Insurance Brokers,-1.228733
3,2016-01-08,WLTW,115.480003,116.620003,113.5,117.440002,2006300.0,Willis Towers Watson,Financials,Insurance Brokers,0.987184
4,2016-01-11,WLTW,117.010002,114.970001,114.089996,117.330002,1408600.0,Willis Towers Watson,Financials,Insurance Brokers,-1.743442


### Evaluate data and find out how many stocks in energy/oil sector

In [5]:
stock_data[(stock_data['date'] == '2016-01-05') & (stock_data['GICS Sector'] == 'Energy')].describe()

Unnamed: 0,open,close,low,high,volume,diff
count,36.0,36.0,36.0,36.0,36.0,36.0
mean,47.807778,47.86,46.929166,48.365556,7044572.0,0.061717
std,29.533748,29.666613,29.209861,29.905739,6586727.0,1.202284
min,4.89,5.01,4.84,5.09,1448800.0,-2.69125
25%,25.6025,25.595,24.917499,25.817501,2777550.0,-0.609361
50%,45.084999,44.724998,43.85,45.774999,5129350.0,0.084438
75%,69.519997,69.657499,68.209997,70.014997,8344725.0,0.792391
max,124.349998,125.169998,122.839996,126.089996,29113900.0,2.453988


#### Display 10 random stocks in the Energy sector to observe whether there is correlation

In [6]:
stock_data[stock_data['GICS Sector'] == 'Energy']['GICS Sub Industry'].unique()

array(['Oil & Gas Exploration & Production',
       'Oil & Gas Equipment & Services', 'Integrated Oil & Gas',
       'Oil & Gas Drilling',
       'Oil & Gas Refining & Marketing & Transportation'], dtype=object)

In [7]:
plt_data = []
for stock in np.random.choice(stock_data[stock_data['GICS Sector'] == 'Energy']['symbol'].unique(), 10):
    energy_df = stock_data[stock_data['symbol'] == stock]
    
    plt_data.append(go.Scatter(
        x = energy_df['date'].values,
        y = energy_df['close'].values,
        name = stock
    ))
    
layout = go.Layout(dict(title = 'Closing prices of 10 energy stocks',
                       xaxis = dict(title = 'Month'),
                       yaxis = dict(title = 'Price'),
                       ), legend = dict(orientation = 'h'))
py.iplot(dict(data=plt_data, layout=layout), filename='basic-line')

## 2.3 Group by data and securities

In [8]:
grp_data = stock_data.groupby(['GICS Sector', 'date']).agg({
    'Security': "count",
    "open": sum,
    "close": sum
})
grp_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Security,open,close
GICS Sector,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Consumer Discretionary,2010-01-04,77,2672.473763,2671.436019
Consumer Discretionary,2010-01-05,78,2705.571271,2729.757742
Consumer Discretionary,2010-01-06,78,2724.797536,2721.494277
Consumer Discretionary,2010-01-07,78,2727.953311,2740.721386
Consumer Discretionary,2010-01-08,78,2733.650218,2750.357443


In [9]:
grp_data.xs(key='Energy').head()

Unnamed: 0_level_0,Security,open,close
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2010-01-04,33,1525.897759,1551.516528
2010-01-05,33,1551.832163,1566.911977
2010-01-06,33,1565.173122,1584.721043
2010-01-07,33,1577.543497,1579.518125
2010-01-08,33,1572.397667,1594.940442


### Similar the DJIA index methodology to get the average of the stocks by sector

In [10]:
# data was previously summed in previous row
grp_data['open']=grp_data['open']/grp_data['Security']
grp_data['close']=grp_data['close']/grp_data['Security']

In [11]:
grp_data['Daily Return']=(grp_data['close']-grp_data['open'])/grp_data['open']*100

### 2.3.1 Plot the data to observe when the stock price had the greatest change

In [12]:
grouped = grp_data.groupby('date').agg({'Daily Return': ['std', 'min']}).reset_index()
grouped.head()

Unnamed: 0_level_0,date,Daily Return,Daily Return
Unnamed: 0_level_1,Unnamed: 1_level_1,std,min
0,2010-01-04,0.735353,-1.046991
1,2010-01-05,0.662124,-1.109534
2,2010-01-06,0.880933,-2.055354
3,2010-01-07,0.734142,-0.806164
4,2010-01-08,0.744911,-1.028062


In [13]:
g = grouped.sort_values(('Daily Return', 'std'), ascending=False)[:10]
g['text'] = 'Maximum price drop: ' + round(-1 * g['Daily Return']['min'], 2).astype(str)
g['Daily Return']['std'].values

array([1.92437035, 1.8775909 , 1.52651751, 1.51052325, 1.50756238,
       1.49134958, 1.45367793, 1.40196412, 1.38990074, 1.35374364])

In [14]:
plt_data = go.Scatter(
    x = g['date'].values,
    y = g['Daily Return']['std'].values,
    mode='markers',
    marker=dict(
        size = 20 * g['Daily Return']['std'].values,
        color = g['Daily Return']['std'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = g['text'].values
)
data = [plt_data]

layout = go.Layout(autosize=True,
                  title = 'Top 10 months by standard deviation of price change within a day',
                  hovermode='closest',
                  yaxis=dict(title='Daily Return', ticklen=5, gridwidth=2),
                  showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stddev')

### 2.3.2 Observe the data for 1 specific sector, such as Energy

In [15]:
grp_data.xs(key='Energy').head()

Unnamed: 0_level_0,Security,open,close,Daily Return
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-01-04,33,46.239326,47.015652,1.678931
2010-01-05,33,47.025217,47.482181,0.971743
2010-01-06,33,47.429489,48.02185,1.24893
2010-01-07,33,47.804348,47.864186,0.125171
2010-01-08,33,47.648414,48.331529,1.433656


In [16]:
grouped = stock_data[stock_data['GICS Sector'] == 'Energy'].groupby('date').agg({'diff': ['std', 'max']}).reset_index()
grouped.head()

Unnamed: 0_level_0,date,diff,diff
Unnamed: 0_level_1,Unnamed: 1_level_1,std,max
0,2010-01-04,1.118377,4.664311
1,2010-01-05,1.530731,5.810902
2,2010-01-06,1.448154,5.695183
3,2010-01-07,0.853619,2.171178
4,2010-01-08,1.165558,3.550828


In [17]:
g = grouped.sort_values(('diff', 'std'), ascending=False)[:10]
g['text'] = 'Maximum price drop: ' + round(-1 * g['diff']['max'], 2).astype(str)
g['diff']['max'].values

array([ 7.76212495, 36.28912817, 26.70623145, 29.77099237, 21.78217822,
       11.09018391, 18.47975505, 25.        , 16.73565937, 16.60550459])

In [18]:
plt_data = go.Scatter(
    x = g['date'].values,
    y = g['diff']['std'].values,
    mode='markers',
    marker=dict(
        size = 2*g['diff']['max'].values,
        color = g['diff']['max'].values,
        colorscale='Portland',
        showscale=True
    ),
    text = g['text'].values
)
data = [plt_data]

layout = go.Layout(autosize=True,
                  title = 'Top 10 months by standard deviation of price change within a day',
                  hovermode='closest',
                  yaxis=dict(title='Deviation in price', ticklen=5, gridwidth=2),
                  showlegend=False)
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename='stddev')

## 2.4 Calculate Daily Returns and Standard Deviations

### Calculate the means and standard deviations by sector

In [19]:
sigma = grp_data.groupby('GICS Sector').agg({'Daily Return':['mean', 'std', 'min']})
sigma

Unnamed: 0_level_0,Daily Return,Daily Return,Daily Return
Unnamed: 0_level_1,mean,std,min
GICS Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Consumer Discretionary,0.036876,0.929677,-4.206013
Consumer Staples,0.051984,0.610403,-3.363354
Energy,0.013425,1.295804,-5.706094
Financials,0.03234,1.005465,-6.352469
Health Care,0.043555,0.877931,-3.80571
Industrials,0.046591,0.883032,-4.398844
Information Technology,0.02312,0.913375,-4.560408
Materials,0.029144,0.947435,-4.702083
Real Estate,0.043639,0.997881,-4.673736
Telecommunications Services,-0.005438,0.85352,-5.337575


### Generate the 1 and 2 sigma lower and upper limits

In [20]:
sigma['1sigmalow']=sigma.iloc[:,0]-sigma.iloc[:,1]
sigma['1sigmahi']=sigma.iloc[:,0]+sigma.iloc[:,1]
sigma['2sigmalow']=sigma.iloc[:,0]-(2*sigma.iloc[:,1])
sigma['2sigmahi']=sigma.iloc[:,0]+(2*sigma.iloc[:,1])
sigma

Unnamed: 0_level_0,Daily Return,Daily Return,Daily Return,1sigmalow,1sigmahi,2sigmalow,2sigmahi
Unnamed: 0_level_1,mean,std,min,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
GICS Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Consumer Discretionary,0.036876,0.929677,-4.206013,-0.892801,0.966552,-1.822478,1.896229
Consumer Staples,0.051984,0.610403,-3.363354,-0.558419,0.662387,-1.168822,1.272791
Energy,0.013425,1.295804,-5.706094,-1.282379,1.309229,-2.578183,2.605033
Financials,0.03234,1.005465,-6.352469,-0.973125,1.037805,-1.978589,2.043269
Health Care,0.043555,0.877931,-3.80571,-0.834376,0.921485,-1.712307,1.799416
Industrials,0.046591,0.883032,-4.398844,-0.836441,0.929623,-1.719473,1.812655
Information Technology,0.02312,0.913375,-4.560408,-0.890255,0.936495,-1.803631,1.849871
Materials,0.029144,0.947435,-4.702083,-0.918291,0.976578,-1.865725,1.924013
Real Estate,0.043639,0.997881,-4.673736,-0.954241,1.04152,-1.952122,2.0394
Telecommunications Services,-0.005438,0.85352,-5.337575,-0.858958,0.848081,-1.712478,1.701601


## 2.5 Generate Labels based on whether sigma threshold passed

### Setup labels for 1 and 2 sigma threshold of prices.

In [21]:
def onesigma(row):
    if row['Daily Return'] < sigma.xs(row.name[0])['1sigmalow'][0]:
        row['Label1Sig'] = -1
    elif row['Daily Return'] > sigma.xs(row.name[0])['1sigmahi'][0]:
        row['Label1Sig'] = 1
    else:
        row['Label1Sig'] = 0
    if row['Daily Return'] < sigma.xs(row.name[0])['2sigmalow'][0]:
        row['Label2Sig'] = -1
    elif row['Daily Return'] > sigma.xs(row.name[0])['2sigmahi'][0]:
        row['Label2Sig'] = 1
    else:
        row['Label2Sig'] = 0
    return row


In [22]:
grp_data['Label1Sig'] = 0
grp_data['Label2Sig'] = 0

In [23]:
sigma_df = grp_data.apply(onesigma, axis=1)

In [24]:
sigma_df.xs('Energy').head()

Unnamed: 0_level_0,Security,open,close,Daily Return,Label1Sig,Label2Sig
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-01-04,33.0,46.239326,47.015652,1.678931,1.0,0.0
2010-01-05,33.0,47.025217,47.482181,0.971743,0.0,0.0
2010-01-06,33.0,47.429489,48.02185,1.24893,0.0,0.0
2010-01-07,33.0,47.804348,47.864186,0.125171,0.0,0.0
2010-01-08,33.0,47.648414,48.331529,1.433656,1.0,0.0


In [25]:
sigma_df.xs('Energy').sort_values(by=['Daily Return'], ascending=False).head()

Unnamed: 0_level_0,Security,open,close,Daily Return,Label1Sig,Label2Sig
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-10-02,36.0,49.1225,51.973611,5.804083,1.0,1.0
2011-10-04,33.0,42.63405,45.061645,5.69403,1.0,1.0
2014-10-16,36.0,62.619371,65.776134,5.041193,1.0,1.0
2010-05-21,33.0,40.388387,42.200942,4.487812,1.0,1.0
2014-12-17,36.0,57.999861,60.550138,4.397041,1.0,1.0


In [26]:
sigma_df.reset_index()
sigma_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Security,open,close,Daily Return,Label1Sig,Label2Sig
GICS Sector,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Consumer Discretionary,2010-01-04,77.0,34.707451,34.693974,-0.038831,0.0,0.0
Consumer Discretionary,2010-01-05,78.0,34.686811,34.996894,0.893951,0.0,0.0
Consumer Discretionary,2010-01-06,78.0,34.933302,34.890952,-0.12123,0.0,0.0
Consumer Discretionary,2010-01-07,78.0,34.97376,35.137454,0.468046,0.0,0.0
Consumer Discretionary,2010-01-08,78.0,35.046798,35.260993,0.611169,0.0,0.0


# 2.5 Retrieve Reddit World News

In [27]:
nlp = spacy.load('en', parse=True, tag=True, entity=True)
tokenizer = ToktokTokenizer()
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')

In [28]:
news_df = pd.read_csv('./data/RedditNews.csv', encoding='utf-8')
news_df.head()

Unnamed: 0,Date,News
0,2016-07-01,A 117-year-old woman in Mexico City finally re...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host
2,2016-07-01,"The president of France says if Brexit won, so..."
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...


## 2.6 Text Preprocessing

### 2.6.1 NLP methods

In [29]:
ps = nltk.porter.PorterStemmer()
def strip_html_tags(text):
    soup = BeautifulSoup(text, "html.parser")
    stripped_text = soup.get_text()
    return stripped_text

def remove_accented_chars(text):
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    return text

def remove_special_characters(text, remove_digits=False):
    pattern = r'[^a-zA-z0-9\s]'
    if remove_digits == True:
        pattern = r'[^a-zA-z\s]'
    
    text = re.sub(pattern, '', text)
    return text

def simple_stemmer(text):
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

def lemmatize_text(text):
    text = nlp(text)
    text = ' '.join([word.lemma_ if word.lemma_ != '-PRON-' else word.text for word in text])
    return text

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = ' '.join(filtered_tokens)    
    return filtered_text

text = news_df.iloc[0,1]
# remove_special_characters(text, True)
# simple_stemmer(text)
# lemmatize_text(text)
remove_stopwords(text)

'117-year-old woman Mexico City finally received birth certificate , died hours later. Trinidad Alvarez Lira waited years proof born 1898 .'

In [30]:
def normalize_corpus(corpus, html_stripping=True, contraction_expansion=True,
                     accented_char_removal=True, text_lower_case=True, 
                     text_lemmatization=True, special_char_removal=True, 
                     stopword_removal=True, remove_digits=True):
    
    normalized_corpus = []
    # normalize each document in the corpus
    for doc in corpus:
        # strip HTML
        if html_stripping:
            doc = strip_html_tags(doc)
        # remove accented characters
        if accented_char_removal:
            doc = remove_accented_chars(doc)
        # lowercase the text    
        if text_lower_case:
            doc = doc.lower()
        # remove extra newlines
        
        doc = re.sub(r'[\r|\n|\r\n]+', ' ',doc)
        # lemmatize text
        if text_lemmatization:
            doc = lemmatize_text(doc)
        # remove special characters and\or digits    
        if special_char_removal:
            # insert spaces between special characters to isolate them    
            special_char_pattern = re.compile(r'([{.(-)!}])')
            doc = special_char_pattern.sub(" \\1 ", doc)
            doc = remove_special_characters(doc, remove_digits=remove_digits)  
        # remove extra whitespace
        doc = re.sub(' +', ' ', doc)
        # remove stopwords
        if stopword_removal:
            doc = remove_stopwords(doc, is_lower_case=text_lower_case)
            
        normalized_corpus.append(doc)
        
    return normalized_corpus

#### NLP functions pulled from following article below

Sarkar, Dipanjan (2018).  A Practitioner's Guide to Natural Language Processing (Part I) - Processing & Understanding Text.  Retrieved on 8/22/2019 from https://towardsdatascience.com/a-practitioners-guide-to-natural-language-processing-part-i-processing-understanding-text-9f4abfd13e72

In [31]:
# Run text based on corpus
news_df = news_df.replace({"b'": "", 'b"': ''}, regex=True)
corpus = normalize_corpus(news_df.iloc[:,1], text_lower_case=False, text_lemmatization=False, special_char_removal=False)

In [32]:
corpus[-5:]

["Man goes berzerk Akihabara stabs everyone nearby : 6 dead , 12 injured '",
 "Threat world AIDS pandemic among heterosexuals , report admits '",
 "Angst Ankara : Turkey Steers Dangerous Identity Crisis '",
 'UK : Identity cards \' could used spy people \' new children \' database may used identify likely future criminals. covert surveillance gone far ? "',
 "Marriage , said , reduced status commercial transaction women could discarded husbands claiming discovered hidden defects . '"]

### 2.6.2 Run Sentiment Analysis
Instead of using the words in the articles to predict stocks we will also to use the sentiment score

#### 2.6.2.1 Extract Afinn and TextBlob sentiment

In [33]:
from afinn import Afinn
from textblob import TextBlob

af = Afinn()

In [34]:
def categorize_sentiment(score):
    if score > 0: 
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'

In [35]:
def sentiment_score(row):
    article = row['News']
    row['sentiment'] = af.score(article)
    tb = TextBlob(article).sentiment
    row['polarity'] = tb.polarity
    row['subjectivity'] = tb.subjectivity
    return row

In [36]:
news_df = news_df.apply(sentiment_score, axis=1)

In [37]:
news_df[news_df['subjectivity'] > 0].head()

Unnamed: 0,Date,News,sentiment,polarity,subjectivity
0,2016-07-01,A 117-year-old woman in Mexico City finally re...,-3.0,-0.066667,0.366667
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,-12.0,0.111111,0.388889
5,2016-07-01,Brazil: Huge spike in number of police killing...,1.0,0.4,0.9
6,2016-07-01,Austria's highest court annuls presidential el...,-3.0,-0.2,0.4
7,2016-07-01,"Facebook wins privacy case, can track any Belg...",5.0,0.25,0.15


In [38]:
sigma_df[sigma_df['Label2Sig'] > 0].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Security,open,close,Daily Return,Label1Sig,Label2Sig
GICS Sector,date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Consumer Discretionary,2010-03-22,78.0,37.350224,38.128713,2.084296,1.0,1.0
Consumer Discretionary,2010-04-22,78.0,40.976757,42.10206,2.746197,1.0,1.0
Consumer Discretionary,2010-05-21,78.0,36.059208,37.440705,3.831191,1.0,1.0
Consumer Discretionary,2010-05-25,78.0,36.425206,37.635892,3.323759,1.0,1.0
Consumer Discretionary,2010-06-02,78.0,37.803082,38.534569,1.934994,1.0,1.0


#### 2.6.2.2 Get Named Entities

In [39]:
text = corpus[0]
nlp = spacy.load('en', parse=True, tag=True, entity=True)
sentence_nlp = nlp(text)

In [40]:
def get_named_entities(news):
    entities = ''
    sentence_nlp = nlp(news)
    for word in sentence_nlp:
#         print('text',word.text,'pos', word.pos_, 'dep', word.dep_)
        ent_type = word.ent_type_
        if (ent_type != '') & (ent_type != 'DATE') & (ent_type != 'TIME') & (ent_type != 'QUANTITY') & (ent_type != 'ORDINAL') & (ent_type != 'CARDINAL'):
            entities += word.text + ' '
    return entities

In [41]:
news_df['Entities'] = news_df['News'].apply(get_named_entities)

In [42]:
def get_pos_tokens(news):
    pos = ''
    sentence_nlp = nlp(news)
    for word in sentence_nlp:
        if (word.pos_ == 'PROPN') | (word.pos_ == 'NOUN') | (word.pos_ == 'PRON') | (word.pos_ == 'VERB') | (word.pos_ == 'ADV'):
            pos += word.text + ' '          
    return pos

In [43]:
news_df['Tokens'] = news_df['News'].apply(get_pos_tokens)

In [44]:
news_df.head()

Unnamed: 0,Date,News,sentiment,polarity,subjectivity,Entities,Tokens
0,2016-07-01,A 117-year-old woman in Mexico City finally re...,-3.0,-0.066667,0.366667,Mexico City Trinidad Alvarez Lira,woman Mexico City finally received birth certi...
1,2016-07-01,IMF chief backs Athens as permanent Olympic host,1.0,0.0,0.0,Athens Olympic,IMF chief backs Athens host
2,2016-07-01,"The president of France says if Brexit won, so...",3.0,0.0,0.0,France Brexit Donald Trump,president France says Brexit won so can Donald...
3,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,-12.0,0.111111,0.388889,British,Man Who Must Give Police Hours Notice Sex Thre...
4,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...,-1.0,0.0,0.0,Greenpeace,Nobel laureates urge Greenpeace stop opposing ...


In [49]:
news_df.to_csv (r'.\cache\news_sentiment.csv', index = None, header=True) 

## 2.7 Observe the sentiment and polarity against the Stocks

In [45]:
# sigma_df = sigma_df.reset_index()[['date','GICS Sector', 'Label1Sig', 'Label2Sig']]
sigma_df = sigma_df.reset_index()
sigma_df[sigma_df['Label2Sig'] > 0].tail()

Unnamed: 0,GICS Sector,date,Security,open,close,Daily Return,Label1Sig,Label2Sig
19161,Utilities,2016-02-18,28.0,52.423571,53.315357,1.701116,1.0,1.0
19254,Utilities,2016-06-30,28.0,60.173929,61.368929,1.98591,1.0,1.0
19287,Utilities,2016-08-17,28.0,57.339286,58.2825,1.64497,1.0,1.0
19304,Utilities,2016-09-12,28.0,56.007143,56.98,1.737023,1.0,1.0
19311,Utilities,2016-09-21,28.0,57.951071,59.162499,2.090433,1.0,1.0


In [46]:
sdates = sigma_df[sigma_df['Label2Sig'] > 0]['date'].tail().values
sdates

array(['2016-02-18', '2016-06-30', '2016-08-17', '2016-09-12',
       '2016-09-21'], dtype=object)

In [47]:
news_df[news_df['Date'] == sdates[0]].head()

Unnamed: 0,Date,News,sentiment,polarity,subjectivity,Entities,Tokens
3325,2016-02-18,Obama Going to Cuba; First Visit by U.S. Presi...,0.0,0.25,0.333333,Obama Going Cuba First Visit U.S.,Obama Going Cuba First Visit U.S. President Ye...
3326,2016-02-18,David Cameron blocks compulsory sex education ...,-1.0,0.0,0.0,David Cameron UK,David Cameron blocks sex education classrooms UK
3327,2016-02-18,Uganda shuts down social media; candidates arr...,-3.0,-0.061111,0.177778,Uganda,Uganda shuts media candidates arrested electio...
3328,2016-02-18,Japanese TV anchors lose their jobs for lack o...,0.0,0.0,0.0,Japanese,TV anchors lose jobs lack fairness reporting
3329,2016-02-18,Cairo court sentences 4-year-old boy to life i...,-8.0,-0.05,0.0,Cairo - Middle East,Cairo court sentences boy life jail murder dis...


## 2.5.1 Combine news articles by date

In [82]:
grp_news_df = pd.DataFrame(columns=['date', 'positive','negative','neutral','polarity','subjectivity','news','entities','tokens'])
for name, group in news_df.groupby(['Date']):    
    articles=entities=tokens = ''
    positive=negative=neutral=polarity=subjectivity=0
    for row_index, row in group.iterrows():
        polarity += row['polarity']
        subjectivity += row['subjectivity']
        articles += row['News'] + ', '
        entities += row['Entities'] + ' '
        tokens += row['Tokens'] + ' '
        sentiment = row['sentiment']
        if sentiment > 0:
            positive += 1
        elif sentiment < 0:
            negative += 1
        else:
            neutral += 1
    
    size = (len(group))
    positive = positive/size
    negative = negative/size
    neutral = neutral/size
    polarity = polarity/size
    subjectivity = subjectivity/size
    grp_news_df = grp_news_df.append({'date': name, 'positive': positive,'negative': negative,'neutral': neutral,'polarity': polarity,'subjectivity': subjectivity,'news':articles,'entities':entities,'tokens':tokens}, ignore_index=True)

## 2.6 Combine the news article with the Stock data

In [83]:
print(sigma_df.shape)
sigma_df.tail()

(19382, 8)


Unnamed: 0,GICS Sector,date,Security,open,close,Daily Return,Label1Sig,Label2Sig
19377,Utilities,2016-12-23,28.0,57.009286,56.929643,-0.139702,0.0,0.0
19378,Utilities,2016-12-27,28.0,56.803928,56.929286,0.220685,0.0,0.0
19379,Utilities,2016-12-28,28.0,56.936429,56.3075,-1.104615,-1.0,0.0
19380,Utilities,2016-12-29,28.0,56.508215,57.048928,0.956876,1.0,0.0
19381,Utilities,2016-12-30,28.0,57.091429,56.791071,-0.526099,0.0,0.0


In [88]:
print(grp_news_df.shape)
grp_news_df.tail()

(2943, 9)


Unnamed: 0,date,positive,negative,neutral,polarity,subjectivity,news,entities,tokens
2938,2016-06-27,0.24,0.48,0.28,0.003085,0.326705,Barclays and RBS shares suspended from trading...,RBS more than 8 % Poland Poles UK Scotland...,Barclays RBS shares suspended trading tanking ...
2939,2016-06-28,0.28,0.36,0.36,0.035911,0.289912,"2,500 Scientists To Australia: If You Want To ...",Australia French Google Drive United Kingdom...,Scientists Australia You Want Save Great Barri...
2940,2016-06-29,0.28,0.48,0.24,0.038622,0.220205,"Explosion At Airport In Istanbul, Yemeni forme...",Istanbul Yemeni Al Saud UK Australian Brit...,Explosion Airport Istanbul president Terroris...
2941,2016-06-30,0.16,0.64,0.2,0.022841,0.190421,Jamaica proposes marijuana dispensers for tour...,Jamaica Stephen Hawking Boris Johnson Tory p...,Jamaica proposes marijuana dispensers tourists...
2942,2016-07-01,0.2,0.68,0.12,0.008778,0.195556,A 117-year-old woman in Mexico City finally re...,Mexico City Trinidad Alvarez Lira Athens Olym...,woman Mexico City finally received birth certi...


In [85]:
combined_df = pd.merge(sigma_df, grp_news_df, on='date')
combined_df.head()

Unnamed: 0,GICS Sector,date,Security,open,close,Daily Return,Label1Sig,Label2Sig,positive,negative,neutral,polarity,subjectivity,news,entities,tokens
0,Consumer Discretionary,2010-01-04,77.0,34.707451,34.693974,-0.038831,0.0,0.0,0.12,0.44,0.44,-0.017801,0.289023,"New airport scanners break child porn laws', I...",Indian McDonalds Iceland nearly $ 5.7bn Brit...,airport scanners break child porn laws eye cl...
1,Consumer Staples,2010-01-04,34.0,34.337315,34.465189,0.372404,0.0,0.0,0.12,0.44,0.44,-0.017801,0.289023,"New airport scanners break child porn laws', I...",Indian McDonalds Iceland nearly $ 5.7bn Brit...,airport scanners break child porn laws eye cl...
2,Energy,2010-01-04,33.0,46.239326,47.015652,1.678931,1.0,0.0,0.12,0.44,0.44,-0.017801,0.289023,"New airport scanners break child porn laws', I...",Indian McDonalds Iceland nearly $ 5.7bn Brit...,airport scanners break child porn laws eye cl...
3,Financials,2010-01-04,58.0,36.382588,36.701873,0.877577,0.0,0.0,0.12,0.44,0.44,-0.017801,0.289023,"New airport scanners break child porn laws', I...",Indian McDonalds Iceland nearly $ 5.7bn Brit...,airport scanners break child porn laws eye cl...
4,Health Care,2010-01-04,54.0,44.285254,44.513077,0.514443,0.0,0.0,0.12,0.44,0.44,-0.017801,0.289023,"New airport scanners break child porn laws', I...",Indian McDonalds Iceland nearly $ 5.7bn Brit...,airport scanners break child porn laws eye cl...


In [86]:
combined_df.to_csv (r'.\cache\data_parsed.csv', index = None, header=True) 

#### Visual the classification of the 1 and 2 sigma st dev by sector

In [78]:
sig1df = combined_df.groupby(['GICS Sector','Label1Sig'], as_index=False)['date'].count()
sig2df = combined_df.groupby(['GICS Sector','Label2Sig'], as_index=False)['date'].count()
sig1df = sig1df.rename(columns={"Label1Sig": "action", "date": "onesigma"})
sig2df = sig2df.rename(columns={"Label2Sig": "action", "date": "twosigma"})
sigdf = pd.merge(sig1df, sig2df, on=['GICS Sector', 'action'])
sigdf

Unnamed: 0,GICS Sector,action,onesigma,twosigma
0,Consumer Discretionary,0.0,4,6
1,Consumer Staples,0.0,4,5
2,Consumer Staples,1.0,2,1
3,Energy,0.0,5,6
4,Financials,-1.0,2,1
5,Financials,0.0,2,5
6,Health Care,0.0,3,6
7,Industrials,0.0,4,6
8,Information Technology,0.0,4,6
9,Materials,-1.0,1,1


In [79]:
pivdf = sigdf.pivot(index='GICS Sector',columns='action', values=['onesigma', 'twosigma'])
pivdf

Unnamed: 0_level_0,onesigma,onesigma,onesigma,twosigma,twosigma,twosigma
action,-1.0,0.0,1.0,-1.0,0.0,1.0
GICS Sector,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
Consumer Discretionary,,4.0,,,6.0,
Consumer Staples,,4.0,2.0,,5.0,1.0
Energy,,5.0,,,6.0,
Financials,2.0,2.0,,1.0,5.0,
Health Care,,3.0,,,6.0,
Industrials,,4.0,,,6.0,
Information Technology,,4.0,,,6.0,
Materials,1.0,4.0,,1.0,5.0,
Real Estate,,4.0,,,6.0,
Telecommunications Services,,3.0,,,6.0,
