In [1]:
from __future__ import print_function

import glob
import io
import os

#import numpy as np
import matplotlib as mpl
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from textblob import TextBlob

%matplotlib inline

In [2]:
# Preprocess news data.
# Each line of newsdata.txt is split on "||"; field 0 is stored as the company
# symbol, field 1 as the article text and field 2 as the date.
#
# The file is opened in text mode (decoded as UTF-8) so the str operations
# below work on Python 3 as well as Python 2: a binary-mode read yields bytes
# on Python 3, where strip("\n") / split("||") with str arguments raise
# TypeError. Undecodable bytes are replaced rather than aborting the load.
symbols = []
text = []
date = []
with io.open("../shared/stock_data/data/news_article/newsdata.txt",
             encoding="utf-8", errors="replace") as myfile:
    # Iterate the file lazily instead of materialising every line first.
    for line in myfile:
        line = line.rstrip("\r\n")
        if not line:
            # A blank line carries no record; indexing its fields would fail.
            continue
        fields = line.split("||")
        symbols.append(fields[0])
        text.append(fields[1])
        date.append(fields[2])
news_data = pd.DataFrame({"company": symbols, "text": text, "date": date})
# Convert the date strings to datetime64 values.
news_data['date'] = pd.to_datetime(news_data['date'])

In [3]:
# Preview the first five parsed articles as a sanity check on the load.
news_data.head(n=5)

Unnamed: 0,company,date,text
0,MMM,2017-10-24,3M Jumps Most in Eight Years as Sales Strength...
1,MMM,2017-10-19,3 Things to Look Out for When 3M Co ReportsEar...
2,MMM,2017-11-16,3M Company (MMM) Closes 0.84% Down on the Day ...
3,MMM,2017-11-06,Watch Point Trust Co Has $542000 Holdings in 3...
4,MMM,2017-11-06,3M Company (NYSE:MMM) Stake Increased by Boys ...


In [12]:
# Load every price-movement CSV into one DataFrame.
# Each file's rows are tagged with a 'Comp_code' column holding the file name
# without its extension.
price_movement_data_list = []
# glob.glob() returns paths in arbitrary, filesystem-dependent order; sorting
# makes the concatenated row order reproducible across machines and runs.
for f in sorted(glob.glob('../shared/stock_data/data/price_movement/*.csv')):
    dataset_file = pd.read_csv(f, sep=',', comment='#',
                               index_col=False, encoding='utf-8')
    dataset_file['Comp_code'] = os.path.splitext(os.path.basename(f))[0]
    price_movement_data_list.append(dataset_file)

if not price_movement_data_list:
    # pd.concat([]) raises an opaque ValueError; fail with the same exception
    # type but say which location produced no files.
    raise ValueError(
        "No CSV files found under ../shared/stock_data/data/price_movement/")

price_movement_data_set = pd.concat(price_movement_data_list, ignore_index=True, axis=0)


In [13]:
# Preview the first five rows of the combined price-movement table.
price_movement_data_set.head(n=5)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Comp_code
0,2000-01-03,41.4375,41.6875,39.8125,40.1875,27.366121,2638200,BA
1,2000-01-04,40.1875,41.125,39.75,40.125,27.323591,3592100,BA
2,2000-01-05,41.375,43.3125,41.375,42.625,29.026001,7631700,BA
3,2000-01-06,42.625,43.4375,41.125,43.0625,29.323893,4922200,BA
4,2000-01-07,43.6875,44.875,43.6875,44.3125,30.175095,6008300,BA


In [14]:
# Load every revenue CSV into one DataFrame.
# Each file's rows are tagged with a 'Comp_code' column holding the part of the
# file name (extension removed) that precedes the first underscore.
revenue_data_list = []
# glob.glob() returns paths in arbitrary, filesystem-dependent order; sorting
# makes the concatenated row order reproducible across machines and runs.
for f in sorted(glob.glob('../shared/stock_data/data/revenue/*.csv')):
    dataset_file = pd.read_csv(f, sep=',', comment='#',
                               index_col=False, encoding='utf-8')
    dataset_file['Comp_code'] = os.path.splitext(os.path.basename(f))[0].split('_')[0]
    revenue_data_list.append(dataset_file)

if not revenue_data_list:
    # pd.concat([]) raises an opaque ValueError; fail with the same exception
    # type but say which location produced no files.
    raise ValueError(
        "No CSV files found under ../shared/stock_data/data/revenue/")

revenue_data_set = pd.concat(revenue_data_list, ignore_index=True, axis=0)

In [15]:
# Preview the first five rows of the combined revenue table.
revenue_data_set.head(n=5)

Unnamed: 0,ticker,year,quarter,basicdilutedeps,basiceps,cashdividendspershare,dilutedeps,incometaxexpense,netincome,netincomecontinuing,...,totalinterestexpense,totaloperatingexpenses,totaloperatingincome,totalotherincome,totalpretaxincome,totalrevenue,weightedavebasicdilutedsharesos,weightedavebasicsharesos,weightedavedilutedsharesos,Comp_code
0,AMZN,2009,Q1,0.41,0.41,0.0,0.41,69000000.0,177000000.0,177000000.0,...,12000000.0,482000000.0,244000000.0,4000000.0,248000000.0,4889000000.0,431700000.0,429000000.0,437000000.0,AMZN
1,AMZN,2009,Q2,0.33,0.33,0.0,0.32,39000000.0,142000000.0,142000000.0,...,7000000.0,565000000.0,159000000.0,20000000.0,179000000.0,4651000000.0,430300000.0,431000000.0,440000000.0,AMZN
2,AMZN,2009,Q3,0.46,0.46,0.0,0.45,60000000.0,199000000.0,199000000.0,...,7000000.0,556000000.0,251000000.0,11000000.0,262000000.0,5449000000.0,432600000.0,432000000.0,441000000.0,AMZN
3,AMZN,2009,Q4,0.88,0.88,0.0,0.86,85000000.0,384000000.0,384000000.0,...,8000000.0,747000000.0,475000000.0,-3000000.0,472000000.0,9520000000.0,433700000.0,433000000.0,442000000.0,AMZN
4,AMZN,2009,FY,2.08,2.08,0.0,2.04,253000000.0,902000000.0,902000000.0,...,34000000.0,2350000000.0,1129000000.0,32000000.0,1161000000.0,24509000000.0,433700000.0,433000000.0,442000000.0,AMZN


# Exploratory Data Analysis

2. Explore different ways of extracting feature information from news articles

In [6]:
def get_polarity(text):
    """Return the TextBlob sentiment polarity score for ``text``.

    Parameters
    ----------
    text : str
        The text to analyse; it is passed straight to ``TextBlob``.

    Returns
    -------
    The ``polarity`` field of ``TextBlob(text).sentiment``.
    """
    return TextBlob(text).sentiment.polarity
def get_subjectivity(text):
    """Return the TextBlob sentiment subjectivity score for ``text``.

    Parameters
    ----------
    text : str
        The text to analyse; it is passed straight to ``TextBlob``.

    Returns
    -------
    The ``subjectivity`` field of ``TextBlob(text).sentiment``.
    """
    return TextBlob(text).sentiment.subjectivity

In [7]:
# Per-article sentiment features computed from the article text.
# "positivity": polarity score (sentiment).
news_data["positivity"] = news_data["text"].apply(get_polarity)
# "subjectivity": subjectivity score (used here as a proxy for public opinion).
news_data["subjectivity"] = news_data["text"].apply(get_subjectivity)

In [8]:
# Direct keyword features: each column is 1 when the keyword occurs anywhere
# in the lower-cased article text (plain substring match), otherwise 0.
lowered_text = news_data['text'].map(lambda body: body.lower())
for keyword in ('good', 'scandal', 'buy', 'sell', 'decline'):
    # Bind the keyword as a default argument so each lambda tests its own word.
    news_data[keyword] = lowered_text.map(lambda body, kw=keyword: 1 if kw in body else 0)
#news_data['CFO'] = news_data['text'].map(lambda x : 1 if 'CFO' in x.lower() else 0) #none of these have CFO
# NOTE(review): the disabled check above looks for upper-case 'CFO' in
# lower-cased text, so it could never match regardless of the data.

# Show how many articles mention "decline" versus how many do not.
news_data['decline'].value_counts()

0    54478
1      228
Name: decline, dtype: int64

In [None]:
# Keep only the identifying columns and the engineered news features.
new_data_sentiments = news_data[['company', 'date', 'scandal', 'buy', 'sell', 'decline', 'positivity', 'subjectivity']]
# Collapse to one row per (company, date) by summing every feature column.
# The groupby must start from the frame selected above: `for_analysis` does
# not exist before this line, so grouping it directly raises NameError on a
# fresh kernel.
for_analysis = new_data_sentiments.groupby(['company', 'date'], as_index=False).sum()
for_analysis.head()