### Load packages

In [36]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.datasets import load_iris
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import re

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\wrigh\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Load Data

In [3]:
# Article-level table: one row per news item, with title/content text and sentiment labels.
NEWS_CSV = 'News Headlines + Sentiment.csv'
textdf = pd.read_csv(NEWS_CSV)

In [23]:
# Market table: one row per (date, exchange) with price columns and sentiment scores.
STOCK_CSV = 'Stock Data + Sentiment Final.csv'
df = pd.read_csv(STOCK_CSV)

In [230]:
# Preview the first five articles.
textdf.head(n=5)

Unnamed: 0,title,publication,author,date,year,month,content,title sentiment,title sentiment score,content sentiment,content sentiment score
0,Hillary Clinton’s point of no return,CNN,Dan Merica,2015-01-01,2015.0,1.0,Washington (CNN) As Democrats close to Hillary...,neutral,0.0,positive,0.072
1,Taylor Swift’s year-end gift video brings all ...,CNN,Lisa Respers France,2015-01-01,2015.0,1.0,"(CNN) Way to make us weep, Taylor Swift. As i...",neutral,0.0,positive,0.118
2,When is your tweet a threat? (Opinion),CNN,Danny Cevallos,2015-01-02,2015.0,1.0,(CNN) The New York Police Department faced a ...,neutral,0.0,positive,0.104
3,Bono says he’s worried he may never play guita...,CNN,Lisa Respers France,2015-01-02,2015.0,1.0,(CNN) Bono has shared with fans that recovery...,neutral,0.0,neutral,0.033
4,"Donna Douglas, ’Beverly Hillbillies’ Elly May,...",CNN,Todd Leopold,2015-01-02,2015.0,1.0,"(CNN) Donna Douglas, who played voluptuous to...",negative,-0.2,positive,0.144


In [245]:
# Preview the first five market rows.
df.head(n=5)

Unnamed: 0,Date,content sentiment score,title sentiment score,Exchange_Name,Adj Close,Close,High,Low,Open,Volume
0,2015-01-02,0.07875,-0.01125,Dow Jones,17832.990234,17832.990234,17951.779297,17731.300781,17823.070313,76270000
1,2015-01-02,0.07875,-0.01125,NYSE,10830.919922,10830.919922,10889.25,10770.509766,10859.799805,2708700000
2,2015-01-02,0.07875,-0.01125,TSX/S&P,14753.700195,14753.700195,14756.299805,14631.400391,14637.299805,132965800
3,2015-01-02,0.07875,-0.01125,NASDAQ,4726.810059,4726.810059,4777.009766,4698.109863,4760.240234,1435150000
4,2015-01-02,0.07875,-0.01125,S&P,2058.199951,2058.199951,2072.360107,2046.040039,2058.899902,2708700000


In [246]:
# Index the market table by (Date, Exchange_Name).
# The guard keeps this cell safe to re-run: once the two columns have been moved
# into the index, a second in-place set_index would raise KeyError.
index_levels = ['Date', 'Exchange_Name']
if list(df.index.names) != index_levels:
    df.set_index(index_levels, inplace=True)
df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,content sentiment score,title sentiment score,Adj Close,Close,High,Low,Open,Volume
Date,Exchange_Name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2015-01-02,Dow Jones,0.07875,-0.01125,17832.990234,17832.990234,17951.779297,17731.300781,17823.070313,76270000
2015-01-02,NYSE,0.07875,-0.01125,10830.919922,10830.919922,10889.25,10770.509766,10859.799805,2708700000
2015-01-02,TSX/S&P,0.07875,-0.01125,14753.700195,14753.700195,14756.299805,14631.400391,14637.299805,132965800
2015-01-02,NASDAQ,0.07875,-0.01125,4726.810059,4726.810059,4777.009766,4698.109863,4760.240234,1435150000
2015-01-02,S&P,0.07875,-0.01125,2058.199951,2058.199951,2072.360107,2046.040039,2058.899902,2708700000


In [24]:
def _score_to_label(score, threshold=0.05):
    """Bucket a sentiment score: 1 above +threshold, -1 below -threshold, else 0."""
    if score > threshold:
        return 1
    if score < -threshold:
        return -1
    return 0


# Derive a discrete label column from each continuous score column
# (title first, then content, so the new columns are appended in that order).
for score_col, label_col in (('title sentiment score', 'title_sentiment'),
                             ('content sentiment score', 'content_sentiment')):
    df[label_col] = [_score_to_label(score) for score in df[score_col]]

#### Remove dates on which the stock market wasn't open (i.e. weekends and public holidays). Because the tables were combined with an inner join, these rows can be identified by their content sentiment score.

In [49]:
# Number of distinct dates before filtering.
# 'Date' is an index level once the set_index cell above has run (so `df.Date`
# would raise AttributeError); reset_index() exposes it as a column in either state.
df.reset_index()['Date'].nunique()

583

In [25]:
# Drop, in place, every row whose content sentiment score is exactly 0.
zero_score_rows = df.index[df['content sentiment score'].eq(0)]
df.drop(index=zero_score_rows, inplace=True)

In [51]:
# Number of distinct dates after filtering.
# 'Date' is an index level once the set_index cell above has run (so `df.Date`
# would raise AttributeError); reset_index() exposes it as a column in either state.
df.reset_index()['Date'].nunique()

581

##### Two dates removed (583 → 581 unique dates)

### Encode sentiment labels

In [6]:
# Encode the sentiment labels as integers: positive -> 1, neutral -> 0, negative -> -1.
# Only the two label columns are touched. A frame-wide replace would also rewrite
# any title/author/content cell that happens to equal one of these words, and
# would scan every text column three times.
SENTIMENT_CODES = {'positive': 1, 'neutral': 0, 'negative': -1}
for label_col in ('title sentiment', 'content sentiment'):
    # infer_objects() keeps the column integer-typed even on pandas versions
    # where replace() no longer downcasts object columns.
    textdf[label_col] = textdf[label_col].replace(SENTIMENT_CODES).infer_objects()

In [41]:
# Preview the first five articles after label encoding.
textdf.head(n=5)

Unnamed: 0,title,publication,author,date,year,month,content,title sentiment,title sentiment score,content sentiment,content sentiment score
0,Hillary Clinton’s point of no return,CNN,Dan Merica,2015-01-01,2015.0,1.0,Washington (CNN) As Democrats close to Hillary...,0,0.0,1,0.072
1,Taylor Swift’s year-end gift video brings all ...,CNN,Lisa Respers France,2015-01-01,2015.0,1.0,"(CNN) Way to make us weep, Taylor Swift. As i...",0,0.0,1,0.118
2,When is your tweet a threat? (Opinion),CNN,Danny Cevallos,2015-01-02,2015.0,1.0,(CNN) The New York Police Department faced a ...,0,0.0,1,0.104
3,Bono says he’s worried he may never play guita...,CNN,Lisa Respers France,2015-01-02,2015.0,1.0,(CNN) Bono has shared with fans that recovery...,0,0.0,-1,0.033
4,"Donna Douglas, ’Beverly Hillbillies’ Elly May,...",CNN,Todd Leopold,2015-01-02,2015.0,1.0,"(CNN) Donna Douglas, who played voluptuous to...",-1,-0.2,1,0.144


In [97]:
# Number of non-null article bodies.
textdf['content'].count()

124584

In [53]:
# Class balance of the article title labels.
article_title_labels = textdf['title sentiment']
article_title_labels.value_counts()

 0    69564
 1    34653
-1    20367
Name: title sentiment, dtype: int64

In [54]:
# Class balance of the article content labels.
article_content_labels = textdf['content sentiment']
article_content_labels.value_counts()

 1    82165
 0    36355
-1     6064
Name: content sentiment, dtype: int64

In [55]:
# Class balance of the bucketed content sentiment in the market table.
market_content_labels = df['content_sentiment']
market_content_labels.value_counts()

 1    2716
 0     134
-1       5
Name: content_sentiment, dtype: int64

In [56]:
# Class balance of the bucketed title sentiment in the market table.
market_title_labels = df['title_sentiment']
market_title_labels.value_counts()

 0    2230
 1     510
-1     115
Name: title_sentiment, dtype: int64

### Text Classification Model

#### 1. Create the training features for content text classification. `min_df` is set to 2, so only terms that appear in at least two articles enter the vocabulary; every article is kept.

In [98]:
# Bag-of-words features for the article bodies: at most 1500 terms, English stop
# words removed, terms kept only if they occur in at least 2 documents and in
# no more than 70% of them.
english_stop_words = stopwords.words('english')
cv = CountVectorizer(stop_words=english_stop_words,
                     max_df=0.7,
                     min_df=2,
                     max_features=1500)
# NOTE(review): the vocabulary is fitted on all articles, before the train/test
# split further down - confirm this is intended.
term_counts = cv.fit_transform(textdf['content'])
content_text = term_counts.toarray()  # dense copy of the sparse count matrix
sent_text = textdf['content sentiment']

In [99]:
# (n_articles, n_terms) of the count matrix.
np.shape(content_text)

(124584, 1500)

In [100]:
# Length of the label vector; it must match the row count of the feature matrix.
np.shape(sent_text)

(124584,)

#### Save the data to CSV in order to speed up the process.

In [67]:
# Cache the term-count matrix on disk (no header row is written).
# The counts are integers, so write them with '%d'. The default '%.18e' format
# spends about 25 characters per cell, which makes the file far larger and
# slower to write and read.
np.savetxt('content_train.csv', content_text, delimiter=',', fmt='%d')

#### Load data back in 

In [None]:
# Reload the cached term-count matrix (values come back as floats).
# The file is written by np.savetxt without a header line, so nothing must be
# skipped. skip_header=1 would silently drop the first article and leave the
# features one row short of the labels.
content_text = np.genfromtxt('content_train.csv', delimiter=',')

#### 2. Create TF-IDF Matrix

In [101]:
# TF-IDF re-weighting for the term-count matrix, using the transformer's default settings.
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()

In [102]:
# Fit the TF-IDF weights on the count matrix, then densify the result.
tfidf_weights = tfidf.fit_transform(content_text)
text_array = tfidf_weights.toarray()

In [103]:
# The TF-IDF matrix keeps the shape of the count matrix.
np.shape(text_array)

(124584, 1500)

#### 3. Create Training Sets for textdf 

In [337]:
# Hold out 30% of the articles for evaluation; the fixed seed keeps the split reproducible.
split_parts = train_test_split(text_array, sent_text, test_size=0.3, random_state=0)
content_train, content_test, sent_train, sent_test = split_parts

In [192]:
# DataFrame view of the training feature matrix.
df_text_train = pd.DataFrame(data=content_train)

#### 4. Run models

In [108]:
# Multinomial Naive Bayes on the TF-IDF features (fit() returns the estimator itself).
classifier = MultinomialNB().fit(content_train, sent_train)
sent_pred = classifier.predict(content_test)

In [109]:
# Print the confusion matrix, the per-class report and the overall accuracy, in that order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(sent_test, sent_pred))

[[   68   834   980]
 [   86  3823  6946]
 [   14  2354 22271]]
              precision    recall  f1-score   support

          -1       0.40      0.04      0.07      1882
           0       0.55      0.35      0.43     10855
           1       0.74      0.90      0.81     24639

    accuracy                           0.70     37376
   macro avg       0.56      0.43      0.44     37376
weighted avg       0.66      0.70      0.66     37376

0.6999678938356164


##### Multinomial Naive Bayes Model - 70% accuracy

In [110]:
# Gaussian Naive Bayes on the same split (fit() returns the estimator itself).
classifier = GaussianNB().fit(content_train, sent_train)
sent_pred = classifier.predict(content_test)

In [111]:
# Print the confusion matrix, the per-class report and the overall accuracy, in that order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(sent_test, sent_pred))

[[ 1181   409   292]
 [ 3171  4526  3158]
 [ 2656  5375 16608]]
              precision    recall  f1-score   support

          -1       0.17      0.63      0.27      1882
           0       0.44      0.42      0.43     10855
           1       0.83      0.67      0.74     24639

    accuracy                           0.60     37376
   macro avg       0.48      0.57      0.48     37376
weighted avg       0.68      0.60      0.63     37376

0.597040881849315


##### Gaussian Naive Bayes Model - 60% Accuracy

In [118]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with 100 trees on the same split.
# The forest is randomised (bootstrap samples and feature sub-sampling), so the
# seed is pinned to make the reported metrics reproducible across runs. It uses
# the same seed value as the train/test split.
classifier = RandomForestClassifier(n_estimators=100, random_state=0)
classifier.fit(content_train, sent_train)
sent_pred = classifier.predict(content_test)

In [119]:
# Print the confusion matrix, the per-class report and the overall accuracy, in that order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(sent_test, sent_pred))

[[   27   982   873]
 [   10  3159  7686]
 [    1  1169 23469]]
              precision    recall  f1-score   support

          -1       0.71      0.01      0.03      1882
           0       0.59      0.29      0.39     10855
           1       0.73      0.95      0.83     24639

    accuracy                           0.71     37376
   macro avg       0.68      0.42      0.42     37376
weighted avg       0.69      0.71      0.66     37376

0.713158176369863


##### Random Forest Classifier - 71% accuracy

In [1]:
from sklearn.svm import SVC 

In [2]:
# Support vector classifier with default settings on the same split
# (fit() returns the estimator itself).
classifier = SVC().fit(content_train, sent_train)
sent_pred = classifier.predict(content_test)

NameError: name 'content_train' is not defined

In [3]:
# Print the confusion matrix, the per-class report and the overall accuracy, in that order.
for metric in (confusion_matrix, classification_report, accuracy_score):
    print(metric(sent_test, sent_pred))

NameError: name 'confusion_matrix' is not defined

##### Support Vector Classification - not evaluated (the cells above were run on a fresh kernel and failed with NameError)