In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, precision_score

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
from pymongo import MongoClient
import ystockquote

In [4]:
from datetime import datetime
import math

# Load transcripts

In [5]:
# Connect to the MongoDB server on this machine and pick the collection that
# stores the earnings-call transcripts.
client = MongoClient(host='localhost', port=27017)
earnings_transcript_collection = client['python_import']['earnings_transcript']

In [6]:
# Read every document of the collection into a DataFrame (one row per
# transcript) and discard MongoDB's `_id` column.
earnings_transcript = (
    pd.DataFrame(list(earnings_transcript_collection.find()))
    .drop('_id', axis=1)
)

In [7]:
# Derive a plain 'YYYY-MM-DD' string from each publish timestamp so that
# transcripts can later be matched to daily price rows.
# Only the 'publishDate' column is needed, so iterate over that column directly
# instead of running a row-wise apply over the whole frame.
# The trailing 'Z' is matched literally: the date is taken exactly as written in
# the timestamp, with no timezone conversion.
# NOTE(review): timestamps such as 01:01:38Z presumably fall on the previous
# calendar day in US market time -- confirm this is the intended trading date.
earnings_transcript['publishDate_str'] = [
    str(datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%SZ').date())
    for stamp in earnings_transcript['publishDate']
]
earnings_transcript.head(1)

Unnamed: 0,publishDate,qAndAText,rawText,tradingSymbol,url,publishDate_str
0,2016-07-27T01:01:38Z,Operator Your first question will come from Sh...,"Apple, Inc. (NASDAQ: AAPL ) Q3 2016 Earnings C...",AAPL,https://seekingalpha.com/article/3991811-apple...,2016-07-27


In [8]:
# Key every transcript by (ticker, publish date); the same two-level key is used
# for the price data, which makes an index-on-index merge possible later.
earnings_transcript = earnings_transcript.set_index(
    ['tradingSymbol', 'publishDate_str']
)
earnings_transcript.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,publishDate,qAndAText,rawText,url
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AAPL,2016-07-27,2016-07-27T01:01:38Z,Operator Your first question will come from Sh...,"Apple, Inc. (NASDAQ: AAPL ) Q3 2016 Earnings C...",https://seekingalpha.com/article/3991811-apple...
AAPL,2012-10-26,2012-10-26T01:33:03Z,Operator (Operator Instructions) Your first qu...,Apple Inc. (NASDAQ: AAPL ) F4Q12 Earnings Call...,https://seekingalpha.com/article/952971-apples...


In [9]:
# Show the last two transcripts as a quick sanity check of the loaded data.
earnings_transcript.tail(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,publishDate,qAndAText,rawText,url
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
VRML,2013-05-15,2013-05-15T22:20:04Z,Operator [Operator Instructions] Our first que...,Vermillion (NASDAQ: VRML ) Q1 2013 Earnings Ca...,https://seekingalpha.com/article/1438321-vermi...
VRML,2013-08-14,2013-08-14T20:26:56Z,,The following audio is from an earnings confer...,https://seekingalpha.com/article/1634312-vermi...


### Load all ticker data for transcripts

In [10]:
# Ticker symbols stored in the first level ('tradingSymbol') of the index.
earnings_transcript.index.levels[0]

Index(['AAPL', 'ADBE', 'AMD', 'BLDP', 'GOOG', 'VRML'], dtype='object', name='tradingSymbol')

In [11]:
class Applyer:
    """Builds forward-return columns and direction labels from a 'Close' column.

    Every label is an integer: 1 (return is positive), -1 (return is negative)
    or 0 (return is zero, not larger in magnitude than the threshold, or an
    input value is NaN).
    """

    # Name of the return column that `label_calc` reads from each row.
    label = ''
    # Name of the column holding the magnitude threshold used by `label_calc`;
    # None means the return is compared against 0.
    label_to_check_against = None

    @staticmethod
    def label_calc(row):
        """Label a single row using the class-level `label` configuration.

        Kept for callers that use it with `DataFrame.apply(..., axis=1)`;
        `all_label_calc` no longer relies on it.

        Args:
            row: mapping/Series containing the column named by `Applyer.label`
                and, when set, the one named by `Applyer.label_to_check_against`.

        Returns:
            int: -1, 0 or 1 (see the class docstring).
        """
        threshold = 0
        if Applyer.label_to_check_against is not None:
            threshold = row[Applyer.label_to_check_against]

        value = row[Applyer.label]
        if math.isnan(value) or math.isnan(threshold):
            return 0

        if abs(value) > threshold:
            return -1 if value < 0 else 1
        return 0

    @staticmethod
    def _direction_labels(returns, threshold=0):
        """Vectorised equivalent of `label_calc` for a whole column of returns."""
        values = np.asarray(returns, dtype=float)
        limits = np.asarray(threshold, dtype=float)
        # Comparisons involving NaN evaluate to False, which yields label 0 --
        # the same result `label_calc` gives for NaN inputs.
        with np.errstate(invalid='ignore'):
            significant = np.abs(values) > limits
            direction = np.where(values < 0, -1, 1)
        return np.where(significant, direction, 0).astype('int64')

    @staticmethod
    def all_label_calc(df, window=20, horizons=(1, 5)):
        """Add volatility, forward-return and label columns to `df` in place.

        Rows must already be in chronological order, because returns are
        computed with positional shifts of the 'Close' column.

        Columns added, in this order (N iterates over `horizons`):
            'Std Dev'                 rolling standard deviation of 'Close'
            'Nday return'             Close N rows ahead minus current Close
            'Nday label'              sign of the N-day return
            'Nday significant label'  sign of the N-day return, but only when
                                      its magnitude exceeds 'Std Dev'

        Args:
            df: DataFrame with a numeric 'Close' column; modified in place.
            window: number of rows in the rolling standard-deviation window.
            horizons: iterable of look-ahead distances, in rows.

        Returns:
            None. Unlike the row-wise implementation, the class attributes
            `label` and `label_to_check_against` are left untouched.
        """
        close = df['Close']
        df['Std Dev'] = close.rolling(window=window, center=False).std()

        return_columns = []
        for days in horizons:
            column = '{}day return'.format(days)
            df[column] = close.shift(-days) - close
            return_columns.append((days, column))

        # Plain direction labels: any non-zero move counts.
        for days, column in return_columns:
            df['{}day label'.format(days)] = Applyer._direction_labels(df[column])

        # Significant labels: the move must exceed the rolling standard deviation.
        for days, column in return_columns:
            df['{}day significant label'.format(days)] = Applyer._direction_labels(
                df[column], df['Std Dev'])

In [12]:
# Download daily prices for every ticker that has transcripts, compute the
# return/label columns per ticker, and stack the results into `all_stocks`,
# indexed by (tradingSymbol, publishDate_str) like `earnings_transcript`.
tickers = earnings_transcript.index.levels[0]
ticker_frames = []
for ticker in tickers:
    stocks = ystockquote.get_historical_prices(ticker, '2000-01-01', '2017-12-31')
    print('{} ticker has {} long list'.format(ticker, len(stocks)))
    df = pd.DataFrame(stocks).transpose()

    # Prefer the adjusted close when the provider supplies it.
    if 'Adj Close' in df.columns:
        close_source = 'Adj Close'
    elif 'Close' in df.columns:
        close_source = 'Close'
    else:
        print('{} is not available'.format(ticker))
        continue

    # errors='coerce' turns unparseable prices into NaN (removed by dropna
    # below) instead of silently leaving the whole column as strings.
    df = pd.DataFrame({'Close': pd.to_numeric(df[close_source], errors='coerce')})

    # Forward returns are computed with positional shifts, so the rows must be
    # in chronological order. The index holds 'YYYY-MM-DD' strings (see the
    # sample output below), for which lexical order is chronological order.
    df = df.sort_index()

    indexes = pd.MultiIndex.from_product(
        [[ticker], df.index.values.tolist()],
        names=['tradingSymbol', 'publishDate_str'])
    df = df.set_index(indexes)

    Applyer.all_label_calc(df)
    # Drops rows without a full rolling window or without future prices.
    df.dropna(inplace=True)

    ticker_frames.append(df)

# Concatenate once at the end: DataFrame.append inside the loop copies all
# accumulated rows on every iteration and no longer exists in pandas >= 2.0.
# `all_stocks` stays None when no ticker returned usable data.
all_stocks = pd.concat(ticker_frames) if ticker_frames else None

AAPL ticker has 4339 long list
ADBE ticker has 4339 long list
AMD ticker has 4339 long list
BLDP ticker has 4339 long list
GOOG ticker has 3177 long list
VRML ticker has 4150 long list


In [13]:
# Number of labelled daily price rows across all tickers.
len(all_stocks)

24539

In [14]:
# Spot-check random rows; no random_state is passed, so the rows shown differ
# between runs.
all_stocks.sample(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Std Dev,1day return,5day return,1day label,5day label,1day significant label,5day significant label
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AMD,2003-07-03,6.71,0.181323,0.47,0.49,1,1,1,1
AAPL,2015-04-15,121.895703,1.375203,-0.5865,1.769109,-1,1,0,1
VRML,2009-11-25,19.1,2.893484,1.9,1.9,1,1,0,0
VRML,2012-11-20,1.18,0.052763,0.0,0.48,0,1,0,1
ADBE,2002-02-05,17.27413,0.506549,-0.383206,0.766413,-1,1,0,1
AMD,2006-02-15,40.240002,1.957389,1.5,0.149997,1,1,0,0
AMD,2012-07-23,4.15,0.528274,-0.09,-0.05,-1,-1,0,0
VRML,2017-03-16,2.2,0.240309,0.13,-0.06,1,-1,0,0
AMD,2016-03-28,2.78,0.243321,0.08,0.05,1,1,0,0
BLDP,2009-09-08,1.75,0.026926,0.02,0.16,1,1,0,1


# Merge transcripts with stock data

In [15]:
# Attach the price-derived columns to each transcript by joining on the shared
# (tradingSymbol, publishDate_str) index. The default inner join keeps only
# transcripts whose publish date has a matching price row.
earnings_transcript = pd.merge(
    earnings_transcript,
    all_stocks,
    left_index=True,
    right_index=True,
)
earnings_transcript.sample(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,publishDate,qAndAText,rawText,url,Close,Std Dev,1day return,5day return,1day label,5day label,1day significant label,5day significant label
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
GOOG,2012-04-13,2012-04-13T02:10:04Z,Operator [Operator Instructions] And we'll tak...,Google (NASDAQ: GOOG ) Q1 2012 Earnings Call A...,https://seekingalpha.com/article/495351-google...,311.988542,4.533853,-9.255766,-14.255762,-1,-1,-1,-1
AMD,2011-10-28,2011-10-28T01:10:15Z,Operator [Operator Instructions] Our first que...,Advanced Micro Devices (NASDAQ: AMD ) Q3 2011 ...,https://seekingalpha.com/article/303021-advanc...,5.94,0.326076,-0.11,-0.27,-1,-1,0,0
AAPL,2013-07-23,2013-07-23T23:38:02Z,Operator (Operator Instructions) Your first qu...,Apple Inc. (NASDAQ: AAPL ) F3Q 2013 Earnings C...,https://seekingalpha.com/article/1566012-apple...,55.450012,1.565132,2.848003,4.543307,1,1,1,1
AAPL,2005-10-13,2005-10-13T19:00:40Z,,Here's the entire text of the prepared remarks...,https://seekingalpha.com/article/3562-apple-co...,6.96253,0.172668,0.033686,0.310944,1,1,0,1
GOOG,2010-10-28,2010-10-28T17:05:16Z,Operator [Operator Instructions] It looks like...,Motorola (MOT) Q3 2010 Earnings Call October 2...,https://seekingalpha.com/article/233036-motoro...,308.981555,20.184649,-2.437587,2.842168,-1,1,0,0
AAPL,2011-07-20,2011-07-20T01:40:10Z,Operator [Operator Instructions] Your first qu...,Apple (NASDAQ: AAPL ) Q3 2011 Earnings Call Ju...,https://seekingalpha.com/article/280344-apple-...,50.126589,2.279584,0.050531,0.737195,1,1,0,0
AAPL,2007-07-26,2007-07-26T02:58:14Z,Operator (Operator Instructions) First up to...,Apple Inc. (NASDAQ: AAPL ) F3Q07 Earnings Ca...,https://seekingalpha.com/article/42374-apple-f...,18.915695,0.943476,-0.278554,-1.232112,-1,-1,0,-1
AAPL,2005-10-13,2005-10-13T19:06:13Z,[Rebecca Runkle of Morgan and Stanley] [Q – Re...,Here’s the entire text of the Q&A from Apple’s...,https://seekingalpha.com/article/3563-apple-co...,6.96253,0.172668,0.033686,0.310944,1,1,0,1
ADBE,2009-12-16,2009-12-16T02:51:07Z,Operator (Operator Instructions) We will go fi...,Adobe Systems Incorporated (NASDAQ: ADBE ) F4Q...,https://seekingalpha.com/article/178350-adobe-...,37.860001,0.582621,-0.850003,-0.930001,-1,-1,-1,-1
AMD,2012-07-20,2012-07-20T00:50:02Z,Operator [Operator Instructions] Our first que...,Advanced Micro Devices (NASDAQ: AMD ) Q2 2012 ...,https://seekingalpha.com/article/734631-advanc...,4.22,0.478813,-0.07,-0.13,-1,-1,0,0


# Create train data and test data

In [16]:
# Features: the full transcript text. Target: the -1/0/1 label for a next-day
# move larger than the rolling standard deviation.
X, y = (
    earnings_transcript['rawText'],
    earnings_transcript['1day significant label'],
)

In [None]:
# Hold out a quarter of the transcripts for evaluation; the fixed seed makes
# the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

# Tf-Idf and LogisticRegression model

### Grid search on the pipeline (takes 20+ minutes on my machine)
Reference: http://scikit-learn.org/stable/auto_examples/model_selection/grid_search_digits.html

In [None]:
# Text classifier: bag-of-words counts -> TF-IDF weighting -> logistic regression.
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('logreg_model', LogisticRegression()),
])

# Search space; keys use the `<step name>__<parameter>` form.
# 3 n-gram ranges x 5 max_df values x 2 min_df values x 4 C values = 120 candidates.
params = {
    'bow__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'bow__max_df': np.arange(0.1, 0.6, 0.1),
    'bow__min_df': [1, 2],
    'logreg_model__C': np.arange(0.5, 1.5, 0.3),
}

# 4-fold cross-validation scored by macro-averaged precision; refit=True
# retrains the best candidate on the full training set afterwards.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=params,
    scoring='precision_macro',
    cv=4,
    refit=True,
    n_jobs=4,
    verbose=3,
)
grid_search.fit(X_train, y_train)

Fitting 4 folds for each of 120 candidates, totalling 480 fits


[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:   36.7s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:  4.6min


In [None]:
# Parameter combination with the best mean cross-validated score.
grid_search.best_params_

In [None]:
# Mean cross-validated score of that combination (macro precision, per the
# `scoring='precision_macro'` setting of the grid search).
grid_search.best_score_

### Fitting the model

In [None]:
# Final model with explicitly fixed hyper-parameters, so this cell can be run
# without repeating the long grid search.
# The classifier step is named 'logreg_model' (it previously read 'bayes_model'
# although it holds a LogisticRegression), matching the grid-search pipeline.
pipeline = Pipeline([
    ('bow', CountVectorizer(stop_words='english', max_df=0.2, min_df=1, ngram_range=(1,1))),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('logreg_model', LogisticRegression(C=0.5))
])
pipeline.fit(X_train, y_train)

In [None]:
# Predict a label for every held-out test transcript.
y_predicted = pipeline.predict(X_test)

#### Confusion matrix
By definition, a confusion matrix $C$ is such that $C_{i, j}$
is equal to the number of observations known to be in group $i$ but
predicted to be in group $j$.

In [None]:
# Rows correspond to the true labels, columns to the predicted labels.
cm = confusion_matrix(y_test, y_predicted)

In [None]:
# confusion_matrix orders its rows/columns by the sorted labels that occur in
# y_test or y_predicted; np.union1d reproduces exactly that ordering so the
# ticks can show the actual class values.
class_labels = np.union1d(y_test, y_predicted)
# fmt='d' prints the counts as plain integers (the default format switches to
# scientific notation for counts of 100 or more).
ax = sns.heatmap(cm, cmap='magma', annot=True, fmt='d',
                 xticklabels=class_labels, yticklabels=class_labels)
ax.set(xlabel='Predicted label', ylabel='True label', title='Confusion matrix');

In [None]:
# Text summary of the classification metrics on the held-out test set,
# comparing the true labels with the pipeline's predictions.
cr = classification_report(y_test, y_predicted)
print(cr)