# Stock Prediction: Daily News for Stock Market Prediction: Using 8 years of daily news headlines to predict stock market movement

In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import scipy
from scipy.stats import chisquare
from scipy.stats import chi2_contingency


%matplotlib inline

## Load data + Adding column labels


### Stock data: Dow Jones Industrial Average (DJIA) - Range: 2008-08-08 to 2016-07-01

In [19]:
# Load the Dow Jones Industrial Average (DJIA) daily price table from CSV.
filename = 'DJIA_table.csv'
dija = pd.read_csv(filename)

# Preview the first 15 rows in file order. Leaving the frame as the cell's
# last expression (instead of print) lets Jupyter render it as a table rather
# than line-wrapped plain text.
dija.head(15)

          Date          Open          High           Low         Close  \
0   2016-07-01  17924.240234  18002.380859  17916.910156  17949.369141   
1   2016-06-30  17712.759766  17930.609375  17711.800781  17929.990234   
2   2016-06-29  17456.019531  17704.509766  17456.019531  17694.679688   
3   2016-06-28  17190.509766  17409.720703  17190.509766  17409.720703   
4   2016-06-27  17355.210938  17355.210938  17063.080078  17140.240234   
5   2016-06-24  17946.630859  17946.630859  17356.339844  17400.750000   
6   2016-06-23  17844.109375  18011.070312  17844.109375  18011.070312   
7   2016-06-22  17832.669922  17920.160156  17770.359375  17780.830078   
8   2016-06-21  17827.330078  17877.839844  17799.800781  17829.730469   
9   2016-06-20  17736.869141  17946.359375  17736.869141  17804.869141   
10  2016-06-17  17733.439453  17733.439453  17602.779297  17675.160156   
11  2016-06-16  17602.230469  17754.910156  17471.289062  17733.099609   
12  2016-06-15  17703.650391  17762.96

In [25]:
# List the column labels of the DJIA frame.
dija.columns

Index(['Date', 'Open', 'High', 'Low', 'Close', 'Volume', 'Adj Close'], dtype='object')

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric DJIA columns.
dija.describe()

Unnamed: 0,Open,High,Low,Close,Volume,Adj Close
count,1989.0,1989.0,1989.0,1989.0,1989.0,1989.0
mean,13459.116048,13541.303173,13372.931728,13463.032255,162811000.0,13463.032255
std,3143.281634,3136.271725,3150.420934,3144.006996,93923430.0,3144.006996
min,6547.009766,6709.609863,6469.950195,6547.049805,8410000.0,6547.049805
25%,10907.339844,11000.980469,10824.759766,10913.379883,100000000.0,10913.379883
50%,13022.049805,13088.110352,12953.129883,13025.580078,135170000.0,13025.580078
75%,16477.699219,16550.070312,16392.769531,16478.410156,192600000.0,16478.410156
max,18315.060547,18351.359375,18272.560547,18312.390625,674920000.0,18312.390625


In [28]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
dija.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1989 entries, 0 to 1988
Data columns (total 7 columns):
Date         1989 non-null object
Open         1989 non-null float64
High         1989 non-null float64
Low          1989 non-null float64
Close        1989 non-null float64
Volume       1989 non-null int64
Adj Close    1989 non-null float64
dtypes: float64(5), int64(1), object(1)
memory usage: 108.9+ KB


### News data: Historical news headlines crawled from the Reddit WorldNews channel (/r/worldnews). They are ranked by Reddit users' votes, and only the top 25 headlines are considered for a single date. (Range: 2008-06-08 to 2016-07-01)

In [37]:
## Show full cell contents instead of truncating long headline text.
## None means "no limit" (requires pandas >= 1.0); the former -1 sentinel was
## deprecated in pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

## Load the Reddit /r/worldnews headlines table from CSV.
filename_R = 'RedditNews.csv'
news = pd.read_csv(filename_R)

## Preview the first 5 rows. Leaving the frame as the cell's last expression
## (instead of print) lets Jupyter render it as a table.
news.head(5)

         Date  \
0  2016-07-01   
1  2016-07-01   
2  2016-07-01   
3  2016-07-01   
4  2016-07-01   

                                                                                                                                                                                       News  
0  A 117-year-old woman in Mexico City finally received her birth certificate, and died a few hours later. Trinidad Alvarez Lira had waited years for proof that she had been born in 1898.  
1  IMF chief backs Athens as permanent Olympic host                                                                                                                                          
2  The president of France says if Brexit won, so can Donald Trump                                                                                                                           
3  British Man Who Must Give Police 24 Hours' Notice of Sex Threatens Hunger Strike: The man is the subject of a sexual risk order despit

In [24]:
# List the column labels of the news frame.
news.columns

Index(['Date', 'News'], dtype='object')

In [29]:
# Summary of the news frame's columns: count, number of unique values,
# most frequent value (top) and how often it occurs (freq).
news.describe()

Unnamed: 0,Date,News
count,73608,73608
unique,2943,73537
top,2008-10-26,b'Why Russias response to Georgia was right'
freq,50,3


In [30]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73608 entries, 0 to 73607
Data columns (total 2 columns):
Date    73608 non-null object
News    73608 non-null object
dtypes: object(2)
memory usage: 1.1+ MB


### Dataset that combines World News & DJIA

In [38]:
## Show full cell contents instead of truncating long headline text.
## None means "no limit" (requires pandas >= 1.0); the former -1 sentinel was
## deprecated in pandas 1.0 and is rejected with a ValueError by pandas 2.x.
pd.set_option('display.max_colwidth', None)

## Load the combined table from CSV.
filename_C = 'Combined_News_DJIA.csv'
combined = pd.read_csv(filename_C)

## Preview the first row. Leaving the frame as the cell's last expression
## (instead of print) lets Jupyter render it as a table rather than
## line-wrapped plain text.
combined.head(1)

         Date  Label  \
0  2008-08-08  0       

                                                                         Top1  \
0  b"Georgia 'downs two Russian warplanes' as countries move to brink of war"   

                                      Top2  \
0  b'BREAKING: Musharraf to be impeached.'   

                                                                                          Top3  \
0  b'Russia Today: Columns of troops roll into South Ossetia; footage from fighting (YouTube)'   

                                                                                                                                          Top4  \
0  b'Russian tanks are moving towards the capital of South Ossetia, which has reportedly been completely destroyed by Georgian artillery fire'   

                                                                                                                          Top5  \
0  b"Afghan children raped with 'impunity,' U.N. official says - this is si

In [21]:
# List the column labels of the combined frame.
combined.columns

Index(['Date', 'Label', 'Top1', 'Top2', 'Top3', 'Top4', 'Top5', 'Top6', 'Top7',
       'Top8', 'Top9', 'Top10', 'Top11', 'Top12', 'Top13', 'Top14', 'Top15',
       'Top16', 'Top17', 'Top18', 'Top19', 'Top20', 'Top21', 'Top22', 'Top23',
       'Top24', 'Top25'],
      dtype='object')

In [31]:
# Summary statistics for the combined frame. describe() covers numeric columns
# by default, so only `Label` is summarized here.
combined.describe()

Unnamed: 0,Label
count,1989.0
mean,0.535445
std,0.498867
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [32]:
# Per-column non-null counts and dtypes, plus the frame's memory usage.
# Useful for spotting columns with missing headlines before further processing.
combined.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1989 entries, 0 to 1988
Data columns (total 27 columns):
Date     1989 non-null object
Label    1989 non-null int64
Top1     1989 non-null object
Top2     1989 non-null object
Top3     1989 non-null object
Top4     1989 non-null object
Top5     1989 non-null object
Top6     1989 non-null object
Top7     1989 non-null object
Top8     1989 non-null object
Top9     1989 non-null object
Top10    1989 non-null object
Top11    1989 non-null object
Top12    1989 non-null object
Top13    1989 non-null object
Top14    1989 non-null object
Top15    1989 non-null object
Top16    1989 non-null object
Top17    1989 non-null object
Top18    1989 non-null object
Top19    1989 non-null object
Top20    1989 non-null object
Top21    1989 non-null object
Top22    1989 non-null object
Top23    1988 non-null object
Top24    1986 non-null object
Top25    1986 non-null object
dtypes: int64(1), object(26)
memory usage: 419.6+ KB
