In [1]:
from __future__ import division 
# This tells matplotlib not to try opening a new window for each plot.
#%matplotlib inline

# General libraries.
import os
import codecs
import json
import csv

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import glob
import pickle
import time
# SK-learn libraries for learning.
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

# SK-learn library for importing the newsgroup data.
from sklearn.datasets import fetch_20newsgroups

# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import NMF

from nltk.stem import WordNetLemmatizer
import nltk

In [4]:
%%time
data=pickle.load(open("../../Objects/Fulldata_wY_correct", 'rb'))
data=data.drop(["Unnamed: 0"], axis=1)

In [9]:
%%time
wordnet_lemmatizer = WordNetLemmatizer()
lemmatized_text=[wordnet_lemmatizer.lemmatize(text) for text in data.text]

CPU times: user 20.1 s, sys: 0 ns, total: 20.1 s
Wall time: 32.4 s


In [10]:
lemmatized_text[0][0:1000]

"\n\nTEXT:\nCheck the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the following provisions (General Instruction A.2. below):\nsee\nItem 2.02 Results of Operations and Financial Condition\nOn December 17, 2009, Accenture issued a press release announcing financial results for its first quarter of fiscal year 2010, which fiscal quarter ended on November 30, 2009.\nA copy of the press release is attached hereto as Exhibit 99.1. All information in the press release is furnished but not filed.\nNon-GAAP Financial Information\nIn the attached press release Accenture discloses the following non-GAAP financial measures:\nReconciliations of these non-GAAP financial measures to the most directly comparable financial measures calculated and presented in accordance with GAAP are included in the press release. While Accenture's management believes that this non-GAAP financial information is useful in evaluat

In [11]:
%%time
vectorizer1=CountVectorizer(stop_words='english', min_df=10)
text_vector=vectorizer1.fit_transform(lemmatized_text)
text_vector.shape

(37502, 35786)

In [12]:
pickle.dump(text_vector, open("../../Objects/Text_vector_lemmatized", 'wb'))

In [18]:
%%time
##  select top features using feature selection packages
labels=data.label
ktop=SelectKBest(chi2, k=3000).fit_transform(text_vector, labels)
ktop.shape

CPU times: user 836 ms, sys: 180 ms, total: 1.02 s
Wall time: 2.17 s


In [20]:
%%time
## Non-negative factorization of the top unigram features, with 100 dimensions
model100 = NMF(n_components=100, init='random', random_state=1, alpha=.1, l1_ratio=.5)
topVec100 = model100.fit_transform(ktop)



CPU times: user 8min 58s, sys: 36.1 s, total: 9min 34s
Wall time: 13min 53s


### Random Forest model using only the text features

In [42]:
alldata=pd.DataFrame(np.hstack((data.as_matrix(), topVec100)))
alldata.columns=np.array(['Company', 'ticker', 'Surprise', 'Reported_EPS', 'Consensus_EPS',
       'Date', 'timestamp', 'bow', 'items', 'text', 'orig_file',
       'release_time_type', 'return', 'stock_performance',
       'market_performance', 'normalized_performance', 'label']+range(100))
allfeatures=alldata.drop(["Company", "ticker",'bow', 'orig_file', 'stock_performance', \
                              'market_performance', 'normalized_performance', 'text',\
                          'timestamp' , 'Reported_EPS', 'Consensus_EPS', "items", "return", 'Surprise', 'release_time_type'], axis=1).dropna(axis=0, how="any")
allfeatures.head(3)

Unnamed: 0,Date,label,0,1,2,3,4,5,6,7,...,90,91,92,93,94,95,96,97,98,99
0,2009-12-17 00:00:00,DOWN,0,0,0.0,0.684804,0.278163,0.751668,0,0.0451815,...,0.0881711,0.0294431,0.0,0.131948,0.0180724,0.0,0.972815,0.0,0,0.347473
1,2009-12-17 00:00:00,UP,0,0,0.124732,0.526862,0.0510855,0.116285,0,0.0,...,0.124763,0.0367622,0.0,0.172028,0.0,0.00507936,0.860997,0.00239063,0,0.160223
2,2009-12-17 00:00:00,STAY,0,0,0.0841313,0.525951,0.207595,0.104889,0,0.00700761,...,0.347984,0.0627882,0.0159936,0.0976504,0.0,0.0,0.33123,0.00204289,0,0.22133


In [43]:
train_data = allfeatures.loc[allfeatures.Date < pd.to_datetime('2009-01-01'), :].drop(['Date', 'label'], axis=1)
dev_data = allfeatures.loc[(allfeatures.Date >= pd.to_datetime('2009-01-01')) & \
                           (allfeatures.Date <= pd.to_datetime('2010-12-31')), :].drop(['Date', 'label'], axis=1)
test_data = allfeatures.loc[allfeatures.Date >= pd.to_datetime('2011-01-01'), :].drop(['Date'], axis=1)
test_label=test_data['label']
test_data=test_data.drop(["label"], axis=1)

train_label=allfeatures.loc[allfeatures.Date < pd.to_datetime('2009-01-01'), "label"]
dev_label = allfeatures.loc[(allfeatures.Date >= pd.to_datetime('2009-01-01')) & \
                           (allfeatures.Date <= pd.to_datetime('2010-12-31')), 'label']


In [44]:
print train_data.shape, dev_data.shape, test_data.shape

(17500, 100) (9774, 100) (10228, 100)


In [45]:
rf=RandomForestClassifier(n_estimators=2000)
model_text=rf.fit(train_data, train_label)

In [49]:
%%time
# Dev set accuracy
preds_dev = model_text.predict(dev_data)
F_Score_dev = metrics.f1_score(dev_label, preds_dev, average='weighted')
#model_output(pred_probas, F_Score, preds)
conf_dev=confusion_matrix(dev_label.values, preds_dev,labels=["UP", "STAY", "DOWN"] , )
print(conf_dev/len(preds_dev))
print("F-score : {:3.3f}".format(F_Score_dev))
print("Accuracy : {:3.3f}".format(np.sum(preds_dev==dev_label)/len(dev_label)))

[[ 0.21997135  0.03345611  0.12308165]
 [ 0.15418457  0.06281973  0.07530182]
 [ 0.18252507  0.03652548  0.11213423]]
F-score : 0.380
Accuracy : 0.395


In [50]:
# Test set accuracy
preds_test = model_text.predict(test_data)
F_Score_test = metrics.f1_score(test_label, preds_test, average='weighted')
pred_probas = model_text.predict_proba(test_data)
#model_output(pred_probas, F_Score, preds)
conf_test=confusion_matrix(test_label, preds_test)
print(conf_test/len(preds_test))
print("F-score : {:3.3f}".format(F_Score_test))
print("Accuracy : {:3.3f}".format(np.sum(preds_test==test_label)/len(test_label)))

[[ 0.04908095  0.07723895  0.19710598]
 [ 0.03363316  0.13306609  0.1622018 ]
 [ 0.04487681  0.07714118  0.22565506]]
F-score : 0.381
Accuracy : 0.408


### Random Forest model using text features combined with EPS surprises


In [36]:
alldata=pd.DataFrame(np.hstack((data.as_matrix(), topVec100)))
alldata.columns=np.array(['Company', 'ticker', 'Surprise', 'Reported_EPS', 'Consensus_EPS',
       'Date', 'timestamp', 'bow', 'items', 'text', 'orig_file',
       'release_time_type', 'return', 'stock_performance',
       'market_performance', 'normalized_performance', 'label']+range(100))
allfeatures=alldata.drop(["Company", "ticker",'bow', 'orig_file', 'stock_performance', \
                              'market_performance', 'normalized_performance', 'text',\
                          'timestamp' , 'Reported_EPS', 'Consensus_EPS', "items", "return"], axis=1).dropna(axis=0, how="any")
allfeatures.head()

Unnamed: 0,Surprise,Date,release_time_type,label,0,1,2,3,4,5,...,90,91,92,93,94,95,96,97,98,99
0,3.08,2009-12-17 00:00:00,3,DOWN,0,0,0.0,0.684804,0.278163,0.751668,...,0.0881711,0.0294431,0.0,0.131948,0.0180724,0.0,0.972815,0.0,0.0,0.347473
1,25.0,2009-12-17 00:00:00,1,UP,0,0,0.124732,0.526862,0.0510855,0.116285,...,0.124763,0.0367622,0.0,0.172028,0.0,0.00507936,0.860997,0.00239063,0.0,0.160223
2,2.38,2009-12-17 00:00:00,3,STAY,0,0,0.0841313,0.525951,0.207595,0.104889,...,0.347984,0.0627882,0.0159936,0.0976504,0.0,0.0,0.33123,0.00204289,0.0,0.22133
3,3.77,2009-12-17 00:00:00,1,DOWN,0,0,0.0,0.410618,0.0,0.00467109,...,0.317526,0.0354484,0.0,0.151472,0.0,0.0,0.17213,0.0,0.0,0.0
4,6.94,2009-12-17 00:00:00,1,UP,0,0,0.0,0.201398,0.0,0.251613,...,0.611339,0.0729158,0.0,0.230406,0.0,0.0,0.439413,0.0,0.0123564,0.731953


In [37]:
train_data = allfeatures.loc[allfeatures.Date < pd.to_datetime('2009-01-01'), :].drop(['Date', 'label'], axis=1)
dev_data = allfeatures.loc[(allfeatures.Date >= pd.to_datetime('2009-01-01')) & \
                           (allfeatures.Date <= pd.to_datetime('2010-12-31')), :].drop(['Date', 'label'], axis=1)
test_data = allfeatures.loc[allfeatures.Date >= pd.to_datetime('2011-01-01'), :].drop(['Date'], axis=1)
test_label=test_data['label']
test_data=test_data.drop(["label"], axis=1)

train_label=allfeatures.loc[allfeatures.Date < pd.to_datetime('2009-01-01'), "label"]
dev_label = allfeatures.loc[(allfeatures.Date >= pd.to_datetime('2009-01-01')) & \
                           (allfeatures.Date <= pd.to_datetime('2010-12-31')), 'label']


In [39]:
print train_data.shape, dev_data.shape, test_data.shape

(17449, 102) (9715, 102) (10191, 102)


In [None]:
rf=RandomForestClassifier(n_estimators=2000)
model_lem=rf.fit(train_data, train_label)

In [141]:
# Dev set accuracy
preds_dev = model_lem.predict(dev_data)
F_Score_dev = metrics.f1_score(dev_label, preds, average='weighted')
pred_probas_dev = model_lem.predict_proba(dev_data)
#model_output(pred_probas, F_Score, preds)
conf_dev=confusion_matrix(dev_label.values, preds_dev,labels=["UP", "STAY", "DOWN"] , )
print(conf_dev/len(preds_dev))
print("F-score : {:3.3f}".format(F_Score_dev))
print("Accuracy : {:3.3f}".format(np.sum(preds_dev==dev_label)/len(dev_label)))

[[ 0.299228    0.01791045  0.05898096]
 [ 0.16222337  0.04632012  0.08378796]
 [ 0.13772517  0.02367473  0.17014925]]
F-score : 0.481
Accuracy : 0.516


In [148]:
# Test set accuracy
preds_test = model.predict(test_data)
F_Score_test = metrics.f1_score(test_label, preds_test, average='weighted')
pred_probas = model.predict_proba(test_data)
#model_output(pred_probas, F_Score, preds)
conf_test=confusion_matrix(test_label, preds_test)
print(conf_test/len(preds_test))
print("F-score : {:3.3f}".format(F_Score_test))
print("Accuracy : {:3.3f}".format(np.sum(preds_test==test_label)/len(test_label)))

[[ 0.1627907   0.06083799  0.10028456]
 [ 0.08419193  0.10695712  0.13757237]
 [ 0.04631538  0.0456285   0.25542145]]
F-score : 0.512
Accuracy : 0.525
