In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
from pymongo import MongoClient
import ystockquote

In [3]:
from datetime import datetime, timedelta, timezone
import pytz
import math

# Load transcripts

In [4]:
est_tz = pytz.timezone('US/Eastern')
def utc_to_est(utc_dt):
    local_dt = utc_dt.replace(tzinfo=pytz.utc).astimezone(est_tz)
    return est_tz.normalize(local_dt)

In [5]:
def shift_time_according_to_closing_time(dt):
    if dt.hour >= 16:
        return dt + timedelta(days=1)
    return dt

In [6]:
def load_and_clean_transcripts():
    client = MongoClient('localhost', 27017)
    earnings_transcript_collection = client.python_import.earnings_transcript
    
    earnings_transcript = pd.DataFrame(list(earnings_transcript_collection.find()))
    earnings_transcript.drop('_id', axis=1, inplace=True)
    earnings_transcript['shifted_publishDate'] = earnings_transcript.apply(
        lambda row: datetime.strptime(row['publishDate'], '%Y-%m-%dT%H:%M:%SZ'), axis=1
    )
    earnings_transcript['shifted_publishDate'] = earnings_transcript.apply(
        lambda row: utc_to_est(row['shifted_publishDate']), axis=1
    )
    earnings_transcript['shifted_publishDate'] = earnings_transcript.apply(
        lambda row: shift_time_according_to_closing_time(row['shifted_publishDate']), axis=1
    )
    earnings_transcript['publishDate_str'] = earnings_transcript.apply(
        lambda row: str(row['shifted_publishDate'].date()), axis=1
    )
    earnings_transcript.set_index(['tradingSymbol', 'publishDate_str'], inplace=True)
    
    return earnings_transcript

In [7]:
earnings_transcript = load_and_clean_transcripts()

In [8]:
earnings_transcript.head(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,url,wordSize,shifted_publishDate
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
A,2014-11-18,1141117.0,"{'negativeCount': 0, 'positiveCount': 0}",2014-11-17T22:25:00Z,,"{'negativeCount': 0, 'positiveCount': 0}",0.0,The following audio is from a conference call ...,222500.0,https://seekingalpha.com/article/2675895-agile...,40.0,2014-11-18 17:25:00-05:00
AA,2014-10-09,1141009.0,"{'negativeCount': 65, 'positiveCount': 341}",2014-10-09T01:18:03Z,Operator [Operator instructions.] Our first qu...,"{'negativeCount': 23, 'positiveCount': 91}",4868.0,Alcoa (NYSE: AA ) Q3 2014 Results Earnings Con...,11803.0,https://seekingalpha.com/article/2549515-alcoa...,13228.0,2014-10-09 21:18:03-04:00


In [9]:
earnings_transcript.tail(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,url,wordSize,shifted_publishDate
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
ABT,2017-04-19,,,2017-04-19T17:46:06Z,Operator Thank you. [Operator Instructions] An...,,,Abbott Laboratories (NYSE: ABT ) Q1 2017 Earni...,,https://seekingalpha.com/article/4063446-abbot...,,2017-04-19 13:46:06-04:00
ADTN,2017-04-19,,,2017-04-19T19:52:04Z,Operator [Operator Instructions] We can take o...,,,"ADTRAN, Inc. (NASDAQ: ADTN ) Q1 2017 Earnings ...",,https://seekingalpha.com/article/4063495-adtra...,,2017-04-19 15:52:04-04:00


### Load all ticker data for transcripts

In [10]:
def construct_dataframe_from_stock_data(arr, ticker):
    df = pd.DataFrame(arr).transpose()
    if 'Adj Close' in df.columns:
        df['Close'] = pd.to_numeric(df['Adj Close'], errors='ignore')
    elif 'Close' in df.columns:
        df['Close'] = pd.to_numeric(df['Close'], errors='ignore')
    else:
        print('{} does not have close prices'.format(ticker))
        
    df.drop(axis=1, labels=[col for col in df.columns if col not in ['Close']], inplace=True)
    indexes = pd.MultiIndex.from_product([[ticker], df.index.values.tolist()], names=['tradingSymbol', 'publishDate_str'])
    df.set_index(indexes, inplace=True)
    
    return df

In [11]:
def calc_labels(df):
    df['Std Dev'] = pd.Series(data=df['Close']).rolling(window=20,center=False).std()
    df['1day return'] = df['Close'].shift(-1) - df['Close']
    df['1day pct change'] = df['1day return']/df['Close']*100
    df['5day return'] = df['Close'].shift(-5) - df['Close']
    df['5day pct change'] = df['5day return']/df['Close']*100

In [12]:
SPY = ystockquote.get_historical_prices('SPY', '2000-01-01', '2017-12-31')

In [13]:
SPY_df = construct_dataframe_from_stock_data(SPY, 'SPY')

In [14]:
SPY_df.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Close
tradingSymbol,publishDate_str,Unnamed: 2_level_1
SPY,2000-01-03,105.366938
SPY,2000-01-04,101.246443
SPY,2000-01-05,101.427563
SPY,2000-01-06,99.797478
SPY,2000-01-07,105.593338


In [15]:
calc_labels(SPY_df)
SPY_df.dropna(inplace=True)

In [16]:
SPY_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Std Dev,1day return,1day pct change,5day return,5day pct change
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
SPY,2000-01-31,101.110602,2.375208,0.996164,0.985222,2.037608,2.015227
SPY,2000-02-01,102.106766,2.346908,0.09056,0.088691,2.445129,2.394679
SPY,2000-02-02,102.197326,2.313233,1.539526,1.506425,0.158448,0.155041
SPY,2000-02-03,103.736852,2.2714,-0.430195,-0.414698,-1.177285,-1.134876
SPY,2000-02-04,103.306657,2.105601,-0.158447,-0.153375,-2.829977,-2.739395


In [17]:
tickers = earnings_transcript.index.levels[0]
all_stocks = None
try:
    all_stocks = pd.read_pickle('all_stocks.pkl')
except:
    pass

print('Downloading missing stock data')

for ticker in tickers:
    if (all_stocks is not None) and (ticker in all_stocks.index):
        continue
    try:
        stocks = ystockquote.get_historical_prices(ticker, '2000-01-01', '2017-12-31')
    except:
        print('{} is not available on Yahoo'.format(ticker))
        continue
    print('{} ticker has {} long list'.format(ticker, len(stocks)))
    
    df = construct_dataframe_from_stock_data(stocks, ticker)
    calc_labels(df)
    df.dropna(inplace=True)
    
    SPY_df.index.set_levels([[ticker], SPY_df.index.levels[1]], inplace=True)
    
    df['1day relative pct change'] = df['1day pct change'] - SPY_df['1day pct change']
    df['5day relative pct change'] = df['5day pct change'] - SPY_df['5day pct change']
    
    if all_stocks is None:
        all_stocks = df
    else:
        all_stocks = all_stocks.append(df)
        
all_stocks.to_pickle('all_stocks.pkl')

Downloading missing stock data
ACE is not available on Yahoo
ACT is not available on Yahoo
ALTR is not available on Yahoo


In [18]:
all_stocks.sample(20)

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Std Dev,1day return,1day pct change,5day return,5day pct change,1day relative pct change,5day relative pct change
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
AIZ,2012-11-16,32.503392,1.630181,-0.868342,-2.671543,-1.0327,-3.177207,-4.695456,-6.609053
AFAM,2012-02-07,18.04605,1.090749,0.154318,0.855135,-0.108928,-0.603611,0.558369,-0.900376
ACIW,2014-03-18,20.223333,0.5381,-0.09,-0.445031,-0.159999,-0.79116,0.087848,-0.509952
AEL,2013-08-06,18.465829,0.727268,0.086919,0.470702,0.376656,2.039746,0.794748,2.110443
AMWD,2002-04-01,28.030113,1.863715,-0.295055,-1.052636,0.540174,1.92712,-0.502756,3.358559
AON,2005-12-22,31.046781,0.314658,0.215962,0.695602,0.00864,0.027829,0.640348,1.748565
AIRM,2013-07-19,33.290001,1.325968,-0.34,-1.021328,1.25,3.754881,-1.216399,3.790347
CTAS,2011-03-01,25.163335,0.55262,0.208863,0.830029,0.435887,1.732231,0.616164,0.472008
AEIS,2012-07-12,11.86,0.53915,-0.2,-1.686341,0.12,1.011804,-3.364121,-2.149008
AIRM,2008-04-24,12.763551,0.926444,-0.103083,-0.807636,0.053107,0.416083,-1.733025,-1.608199


In [19]:
all_stocks.tail(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Std Dev,1day return,1day pct change,5day return,5day pct change,1day relative pct change,5day relative pct change
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
DLPH,2017-04-03,78.25,1.96317,-2.269997,-2.900955,-2.720001,-3.47604,-2.964692,-3.480286
DLPH,2017-04-04,75.980003,1.962236,-0.890007,-1.17137,-1.069999,-1.408264,-0.874106,-1.229906
DLPH,2017-04-05,75.089996,2.104154,0.030007,0.039961,-0.809997,-1.078702,-0.241154,-0.759254
DLPH,2017-04-06,75.120003,2.208314,0.019996,0.026619,-1.370003,-1.823753,0.128558,-0.579271
DLPH,2017-04-07,75.139999,2.350559,0.39,0.519031,-0.43,-0.572265,0.459508,-0.304412


# Merge transcripts with stock data

In [20]:
earnings_transcript = earnings_transcript[earnings_transcript['rawText'].apply(len) > 5000]

In [21]:
earnings_transcript.reset_index(inplace=True)

In [30]:
earn_split = earnings_transcript.apply(lambda x: [x['rawText'].split('.')], axis=1)

In [31]:
earn_split

Unnamed: 0,tradingSymbol,publishDate_str,date_number,h_tone,publishDate,qAndAText,q_and_a_h_tone,q_and_a_wordSize,rawText,time_number,url,wordSize,shifted_publishDate
0,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...,[Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...
1,"[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...","[Start Time: 08:31 End Time: 09:18 Alcoa, Inc,..."
2,"[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q4 2014 Earnings Conf..."
3,"[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni...","[Alcoa Inc, (NYSE: AA ) Q1 2015 Results Earni..."
4,"[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q2 2015 Earnings Con..."
5,"[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf...","[Alcoa Inc, (NYSE: AA ) Q3 2015 Earnings Conf..."
6,"[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q4 2015 Earnings Con..."
7,"[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con...","[Alcoa, Inc, (NYSE: AA ) Q1 2016 Earnings Con..."
8,"[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se...","[Alcoa, Inc, (NYSE: AA ) Update on Alcoa’s Se..."
9,"[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn...","[Alcoa, Inc, (NYSE: AA ) Q2 2016 Results Earn..."


In [23]:
df = (pd.DataFrame({'name': ['A.J. Price'] * 3, 
                    'opponent': ['76ers', 'blazers', 'bobcats'], 
                    'nearest_neighbors': [['Zach LaVine', 'Jeremy Lin', 'Nate Robinson', 'Isaia']] * 3})
      .set_index(['name', 'opponent']))

In [28]:
df

Unnamed: 0,name,opponent,nearest_neighbors
0,A.J. Price,76ers,"[Zach LaVine, Jeremy Lin, Nate Robinson, Isaia]"
1,A.J. Price,blazers,"[Zach LaVine, Jeremy Lin, Nate Robinson, Isaia]"
2,A.J. Price,bobcats,"[Zach LaVine, Jeremy Lin, Nate Robinson, Isaia]"


In [24]:
df.reset_index(inplace=True)
rows = []
_ = df.apply(lambda row: [rows.append([row['name'], row['opponent'], nn]) 
                         for nn in row.nearest_neighbors], axis=1)
df_new = pd.DataFrame(rows, columns=df.columns).set_index(['name', 'opponent'])

In [27]:
df_new

Unnamed: 0_level_0,Unnamed: 1_level_0,nearest_neighbors
name,opponent,Unnamed: 2_level_1
A.J. Price,76ers,Zach LaVine
A.J. Price,76ers,Jeremy Lin
A.J. Price,76ers,Nate Robinson
A.J. Price,76ers,Isaia
A.J. Price,blazers,Zach LaVine
A.J. Price,blazers,Jeremy Lin
A.J. Price,blazers,Nate Robinson
A.J. Price,blazers,Isaia
A.J. Price,bobcats,Zach LaVine
A.J. Price,bobcats,Jeremy Lin


In [51]:
earn_split

0       [Alcoa (NYSE: AA ) Q3 2014 Results Earnings Co...
1       [Start Time: 08:31 End Time: 09:18 Alcoa, Inc,...
2       [Alcoa Inc,  (NYSE: AA ) Q4 2014 Earnings Conf...
3       [Alcoa Inc,  (NYSE: AA ) Q1 2015 Results Earni...
4       [Alcoa, Inc,  (NYSE: AA ) Q2 2015 Earnings Con...
5       [Alcoa Inc,  (NYSE: AA ) Q3 2015 Earnings Conf...
6       [Alcoa, Inc,  (NYSE: AA ) Q4 2015 Earnings Con...
7       [Alcoa, Inc,  (NYSE: AA ) Q1 2016 Earnings Con...
8       [Alcoa, Inc,  (NYSE: AA ) Update on Alcoa’s Se...
9       [Alcoa, Inc,  (NYSE: AA ) Q2 2016 Results Earn...
10      [Alcoa, Inc,  (NYSE: AA ) Q3 2016 Earnings Con...
11      [Alcoa, Inc,  (NYSE: AA ) Q4 2016 Earnings Con...
12      [Agilent Technologies Inc,  (NYSE: A ) Q4 2014...
13      [Alcoa (NYSE: AA ) Q2 2011 Earnings Call July ...
14      [Alcoa (NYSE: AA ) Q3 2011 Earnings Call Octob...
15      [Alcoa, Inc,  (NYSE: AA ) November 09, 2011  9...
16      [Alcoa (NYSE: AA ) Q4 2011 Earnings Call Janua...
17      [Alcoa

In [20]:
merged_data = earnings_transcript.merge(all_stocks, left_index=True, right_index=True)

In [21]:
merged_data.sample(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,publishDate,qAndAText,rawText,url,shifted_publishDate,Close,Std Dev,1day return,1day pct change,5day return,5day pct change,1day relative pct change,5day relative pct change
tradingSymbol,publishDate_str,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
APA,2011-11-04,2011-11-04T01:30:12Z,Operator [Operator Instructions] Your first qu...,Apache (NYSE: APA ) Q3 2011 Earnings Call Nove...,https://seekingalpha.com/article/305203-apache...,2011-11-04 21:30:12-04:00,93.375693,5.321505,1.183508,1.267469,4.20284,4.501,0.645858,3.560611
DHI,2014-07-24,2014-07-24T13:55:00Z,,The following audio is from a conference call ...,https://seekingalpha.com/article/2336945-q3-20...,2014-07-24 09:55:00-04:00,21.286876,0.635615,-0.320176,-1.5041,-1.203086,-5.651773,-1.035944,-2.852882
ATI,2016-10-25,2016-10-25T17:43:12Z,Operator Sure. We will now begin the question-...,"Allegheny Technologies, Inc. (NYSE: ATI ) Q3 2...",https://seekingalpha.com/article/4014658-alleg...,2016-10-25 13:43:12-04:00,15.12,0.734599,-0.85,-5.621693,-1.75,-11.574074,-5.420922,-10.098609
AKAM,2017-03-31,2017-03-31T17:04:36Z,,The following slide deck was published by Akam...,https://seekingalpha.com/article/4059499-akama...,2017-03-31 13:04:36-04:00,59.700001,1.307269,-1.600003,-2.680072,-1.600003,-2.680072,-2.50615,-2.451003
ADI,2006-02-16,2006-02-16T18:12:15Z,,Analog Devices Inc. (NYSE: ADI ) Q1 2006 Ear...,https://seekingalpha.com/article/6884-analog-d...,2006-02-16 13:12:15-05:00,29.059107,0.628357,-0.580009,-1.995963,-0.806209,-2.774376,-1.724977,-2.967935


In [22]:
merged_data = merged_data[merged_data['rawText'].apply(len) > 5000]

In [23]:
merged_data.to_pickle('merged_data_pct_change.pkl')