In [1]:
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
# SK-learn libraries for feature extraction from text.
from sklearn.feature_extraction.text import *
import pandas as pd
import pickle

data = pickle.load(open("/Users/Maximus/downloads/Fulldata_combined_w_eps", "rb"))

market_data = pickle.load(open("/Users/Maximus/downloads/market_data", "rb"))

## Create Label from market and stock return (UP, STAY, DOWN)

In [2]:
##########Drop Columns not used#########
data = data.drop(['Before Market', 'During Market', 'After Market','High','Low','Close','Volume',\
                  'Open', 'Adj Close'], axis=1)

#########Combine stock return with market return
combined = pd.merge(data, market_data[['Date', 'Market Before', 'Market During', 'Market After' ]],\
                    how='left', on=['Date'])

#########Select market return based on report release time####
combined['market_performance'] = 0
combined['market_performance'] = \
  combined.release_time_type.apply(lambda x: 1 if x ==1 else 0)*combined['Market Before']\
+ combined.release_time_type.apply(lambda x: 1 if x ==2 else 0)*combined['Market During']\
+ combined.release_time_type.apply(lambda x: 1 if x ==3 else 0)*combined['Market After']

combined = combined.drop(['Market Before', 'Market During', 'Market After'], axis=1)


#Finally, create the label for our model.  A normalized return by taking the difference
#between stock return and market return.  If the difference >  1%: UP
#                                                           < -1%: DOWN
#                                                            else: STAY
############################################################################################################
combined['normalized_performance'] = combined.stock_performance - combined.market_performance
combined['label'] = combined.normalized_performance
combined['label'] = combined.label.apply(lambda x: 'UP' if x >1 else x)
combined['label'] = combined.label.apply(lambda x: 'DOWN' if (isinstance(x, str) == 0 and x <-1) else x)
combined['label'] = combined.label.apply(lambda x: x if isinstance(x, str) else 'STAY')

In [3]:
# 8-K reports for all S&P 500 companies between 2002 and 2012, with the last two years (2011-2012)
#reserved for the final evaluation, the previous two (2009-2010) for development, and the remainder for training.
#combined.loc[combined.Date == '2009-12-17',:]
combined.Date.min()

Timestamp('2002-11-01 00:00:00')

In [6]:
combined.text[0]

'\n\nTEXT:\nCheck the appropriate box below if the Form 8-K filing is intended to simultaneously satisfy the filing obligation of the registrant under any of the following provisions (General Instruction A.2. below):\nsee\nItem 2.02 Results of Operations and Financial Condition\nOn December 17, 2009, Accenture issued a press release announcing financial results for its first quarter of fiscal year 2010, which fiscal quarter ended on November 30, 2009.\nA copy of the press release is attached hereto as Exhibit 99.1. All information in the press release is furnished but not filed.\nNon-GAAP Financial Information\nIn the attached press release Accenture discloses the following non-GAAP financial measures:\nReconciliations of these non-GAAP financial measures to the most directly comparable financial measures calculated and presented in accordance with GAAP are included in the press release. While Accenture\'s management believes that this non-GAAP financial information is useful in evalua

In [17]:
combined.to_pickle("/Users/Maximus/downloads/Data_12_03")

In [3]:
train_data = combined.loc[combined.Date < '2009-01-01', :]
dev_data = combined.loc[(combined.Date >= '2009-01-01') & (combined.Date <= '2010-12-31'), :]
test_data = combined.loc[combined.Date >= '2011-01-01', :]

train_label = train_data.label
dev_label = dev_data.label
test_label = test_data.label

In [11]:
train_data.head(2)

Unnamed: 0.1,Unnamed: 0,Company,ticker,Surprise,Reported_EPS,Consensus_EPS,Date,timestamp,bow,items,text,orig_file,release_time_type,return,stock_performance,market_performance,normalized_performance,label
13,3,COMTECH TELECOM CO,CMTL,43.75,0.46,0.32,2004-12-07,2004-12-07 10:01:18,"Counter({'-': 22, 'million': 13, '31,': 10, 't...",['Results of Operations and Financial Conditio...,\n\nTEXT:\n8\nFORM\n-K\nCURRENT\nREPORT\n13 15...,CMTL/CMTL-8K-20041207100118.txt.gz,2,6.543967,6.521739,-1.10733,7.62907,UP
14,6,UTI WORLDWIDE INC,UTIW,10.71,0.62,0.56,2004-12-07,2004-12-07 09:12:35,"Counter({'worldwide': 6, 'uti': 6, 'inc.': 6, ...","['Other Events', 'Financial Statements and Exh...",\n\nTEXT:\nTable of Contents\nCheck the approp...,UTIW/UTIW-8K-20041207091235.txt.gz,1,-1.695672,1.026457,0.0,1.026457,UP


## Create Word Count Matrix

In [4]:
vectorizer = CountVectorizer(min_df=1, stop_words='english')
train_text_features = vectorizer.fit_transform(train_data.bow)

dev_text_features = vectorizer.transform(dev_data.bow)

In [5]:
#train_text_features.shape
train_data['Surprise'].fillna(0, inplace=True)
dev_data['Surprise'].fillna(0, inplace=True)
#train_data.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)


In [15]:
#Base model, text features only
clf = RandomForestClassifier(random_state = 99, n_estimators = 3000)
clf = clf.fit(train_text_features, train_label)

preds_dev = clf.predict(dev_text_features)

accuracy = np.where(preds_dev==dev_label, 1, 0).sum() / float(len(dev_label))
print "\nAccuracy of model prediction: %0.4f" %  accuracy


Accuracy of model prediction: 0.4019


In [16]:
from sklearn.metrics import confusion_matrix
confusion_matrix(dev_label, preds_dev, labels=['UP','STAY','DOWN'])

array([[2756,  350,  574],
       [1881,  660,  316],
       [2385,  340,  512]])

## Check length of time...

In [125]:
all_data_train[:,-1:]

array([[ 43.75],
       [ 10.71],
       [-17.39],
       ..., 
       [ 52.63],
       [  0.  ],
       [ -2.94]])

In [7]:
all_data_train = np.append(train_text_features.toarray(),\
                           np.array(train_data.Surprise).reshape(len(train_data.Surprise),1), 1)
all_data_dev = np.append(dev_text_features.toarray(),\
                         np.array(dev_data.Surprise).reshape(len(dev_data.Surprise),1), 1)

In [8]:
#Base model, text features only
clf = RandomForestClassifier(random_state = 99)
clf = clf.fit(all_data_train, train_label)

preds_dev = clf.predict(all_data_dev)

accuracy = np.where(preds_dev==dev_label, 1, 0).sum() / float(len(dev_label))
print "\nAccuracy of model prediction: %0.4f" %  accuracy


Accuracy of model prediction: 0.3856


In [114]:
a = np.array([[1, 2], [3, 4]])
b = np.array(train_data.Surprise).reshape(len(train_data.Surprise),1)
#>>> np.concatenate((a, b), axis=0)
print b.shape
print train_text_features.shape

(17500, 1)
(17500, 96306)


In [31]:
#combined.shape
combined.dtypes

Unnamed: 0                         int64
Company                           object
ticker                            object
Surprise                         float64
Reported_EPS                     float64
Consensus_EPS                    float64
Date                      datetime64[ns]
timestamp                 datetime64[ns]
bow                               object
items                             object
text                              object
orig_file                         object
release_time_type                 object
return                           float64
stock_performance                float64
market_performance               float64
normalized_performance           float64
label                             object
dtype: object

In [26]:
#data.loc[(data["ticker"] =='AA'),:]
del data