In [1]:
#sentiment analysis
import pandas as pd

reddit = pd.read_csv("RedditNews.csv", engine="python", sep=",", error_bad_lines=False)

# Show DataFrame
reddit.head()

Unnamed: 0,Date,Title
0,7/1/16,IMF chief backs Athens as permanent Olympic host
1,7/1/16,"The president of France says if Brexit won, so..."
2,7/1/16,British Man Who Must Give Police 24 Hours' Not...
3,7/1/16,100+ Nobel laureates urge Greenpeace to stop o...
4,7/1/16,Brazil: Huge spike in number of police killing...


In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
import re
#nltk.download('wordnet')

In [3]:
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer() 

def preprocess(sentence):
    sentence=str(sentence)
    sentence = sentence.lower()
    sentence=sentence.replace('{html}',"") 
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', sentence)
    rem_url=re.sub(r'http\S+', '',cleantext)
    rem_num = re.sub('[0-9]+', '', rem_url)
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(rem_num)  
    filtered_words = [w for w in tokens if len(w) > 2 if not w in stopwords.words('english')]
    stem_words=[stemmer.stem(w) for w in filtered_words]
    lemma_words=[lemmatizer.lemmatize(w) for w in stem_words]
    return " ".join(filtered_words)


reddit['cleanText']=reddit['Title'].map(lambda s:preprocess(s))

In [4]:
reddit.head()

Unnamed: 0,Date,Title,cleanText
0,7/1/16,IMF chief backs Athens as permanent Olympic host,imf chief backs athens permanent olympic host
1,7/1/16,"The president of France says if Brexit won, so...",president france says brexit donald trump
2,7/1/16,British Man Who Must Give Police 24 Hours' Not...,british man must give police hours notice sex ...
3,7/1/16,100+ Nobel laureates urge Greenpeace to stop o...,nobel laureates urge greenpeace stop opposing ...
4,7/1/16,Brazil: Huge spike in number of police killing...,brazil huge spike number police killings rio a...


In [5]:
reddit['Date'] =  pd.to_datetime(reddit['Date'])
reddit.head()

Unnamed: 0,Date,Title,cleanText
0,2016-07-01,IMF chief backs Athens as permanent Olympic host,imf chief backs athens permanent olympic host
1,2016-07-01,"The president of France says if Brexit won, so...",president france says brexit donald trump
2,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,british man must give police hours notice sex ...
3,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...,nobel laureates urge greenpeace stop opposing ...
4,2016-07-01,Brazil: Huge spike in number of police killing...,brazil huge spike number police killings rio a...


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#pip install vaderSentiment 

In [7]:
#pip install --upgrade vaderSentimen


Collecting vaderSentimen
[31m  ERROR: Could not find a version that satisfies the requirement vaderSentimen (from versions: none)[0m
[31mERROR: No matching distribution found for vaderSentimen[0m
Note: you may need to restart the kernel to use updated packages.


In [8]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [14]:
preprocess = []
analyzer = SentimentIntensityAnalyzer()
for clean in reddit['cleanText']:
    vs = analyzer.polarity_scores(clean)
    preprocess.append(vs)


In [None]:
#positive sentiment: compound score >= 0.05
#neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
#negative sentiment: compound score <= -0.05

In [16]:
df3 = pd.DataFrame(preprocess)

In [18]:
df4 = pd.concat([reddit,df3], axis=1)

In [19]:
djia = pd.read_csv("DJIA.csv", engine="python", sep=",", error_bad_lines=False)

In [20]:
djia.head()

Unnamed: 0,Date,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-06-30,17712.759766,17930.609375,17711.800781,17929.990234,133030000,17929.990234
2,2016-06-29,17456.019531,17704.509766,17456.019531,17694.679688,106380000,17694.679688
3,2016-06-28,17190.509766,17409.720703,17190.509766,17409.720703,112190000,17409.720703
4,2016-06-27,17355.210938,17355.210938,17063.080078,17140.240234,138740000,17140.240234


In [29]:
djia['Date'] =  pd.to_datetime(djia['Date'], format='%Y-%m-%d')

In [32]:
merge = pd.merge(df4,djia, how='inner', on = 'Date')

In [33]:
merge.head()

Unnamed: 0,Date,Title,cleanText,compound,neg,neu,pos,Open,High,Low,Close,Volume,Adj Close
0,2016-07-01,IMF chief backs Athens as permanent Olympic host,imf chief backs athens permanent olympic host,-0.0516,0.167,0.833,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
1,2016-07-01,"The president of France says if Brexit won, so...",president france says brexit donald trump,0.0,0.0,1.0,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
2,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,british man must give police hours notice sex ...,-0.821,0.413,0.587,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
3,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...,nobel laureates urge greenpeace stop opposing ...,-0.296,0.268,0.732,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141
4,2016-07-01,Brazil: Huge spike in number of police killing...,brazil huge spike number police killings rio a...,-0.4404,0.319,0.426,0.255,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141


In [38]:
returns = (merge.Close - merge.Close.shift(1))/merge.Close.shift(1)
returns.head(51)

0          NaN
1     0.000000
2     0.000000
3     0.000000
4     0.000000
5     0.000000
6     0.000000
7     0.000000
8     0.000000
9     0.000000
10    0.000000
11    0.000000
12    0.000000
13    0.000000
14    0.000000
15    0.000000
16    0.000000
17    0.000000
18    0.000000
19    0.000000
20    0.000000
21    0.000000
22    0.000000
23    0.000000
24   -0.001080
25    0.000000
26    0.000000
27    0.000000
28    0.000000
29    0.000000
30    0.000000
31    0.000000
32    0.000000
33    0.000000
34    0.000000
35    0.000000
36    0.000000
37    0.000000
38    0.000000
39    0.000000
40    0.000000
41    0.000000
42    0.000000
43    0.000000
44    0.000000
45    0.000000
46    0.000000
47    0.000000
48    0.000000
49   -0.013124
50    0.000000
Name: Close, dtype: float64

In [39]:
merge['returns'] = returns

In [40]:
merge.head()

Unnamed: 0,Date,Title,cleanText,compound,neg,neu,pos,Open,High,Low,Close,Volume,Adj Close,returns
0,2016-07-01,IMF chief backs Athens as permanent Olympic host,imf chief backs athens permanent olympic host,-0.0516,0.167,0.833,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141,
1,2016-07-01,"The president of France says if Brexit won, so...",president france says brexit donald trump,0.0,0.0,1.0,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141,0.0
2,2016-07-01,British Man Who Must Give Police 24 Hours' Not...,british man must give police hours notice sex ...,-0.821,0.413,0.587,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141,0.0
3,2016-07-01,100+ Nobel laureates urge Greenpeace to stop o...,nobel laureates urge greenpeace stop opposing ...,-0.296,0.268,0.732,0.0,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141,0.0
4,2016-07-01,Brazil: Huge spike in number of police killing...,brazil huge spike number police killings rio a...,-0.4404,0.319,0.426,0.255,17924.240234,18002.380859,17916.910156,17949.369141,82160000,17949.369141,0.0
