In [3]:
###############################################################################################
#
# StockPriceCorrelator
#
# By James King
#
# Shows stock price reaction to discrete events
#
###############################################################################################
%matplotlib inline
##############################################################################################
# Import libraries
##############################################################################################
import pandas as pd
import numpy as np
import os
import os.path
import pylab as plt
from datetime import datetime as dt
from datetime import timedelta as dt_delta
from pandas.io.json import json_normalize
import requests
import editdistance
import pickle
from scipy import stats



##############################################################################################
#  Helper function definitions
##############################################################################################
def tickerLookupNasdaq():
    '''Returns a pandas dataframe with all tickers & companies on the NYSE,
    AMEX, and NASDAQ exchanges according to the NASDAQ website.

    Each exchange's company list is downloaded and saved in the current
    working directory as "<EXCHANGE>_company_list.csv"; the saved files are
    then read back with pandas and concatenated (row indices are not reset).

    Raises requests.HTTPError if any download returns an error status.'''
    ## Download current company lists
    # A sequence of pairs (rather than a dict) keeps the download order, and
    # therefore the row order of the result, the same on every run.
    exchanges = (
        ('NYSE', 'http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nyse&render=download'),
        ('AMEX', 'http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=amex&render=download'),
        ('NASDAQ', 'http://www.nasdaq.com/screening/companies-by-name.aspx?letter=0&exchange=nasdaq&render=download'),
    )

    companies = []
    for ex, url in exchanges:
        saveFileName = ex + '_company_list.csv'
        r = requests.get(url)
        # Stop here instead of saving an HTML error page under a .csv name
        r.raise_for_status()

        # r.content is raw bytes, so the file has to be opened in binary mode
        with open(saveFileName, 'wb') as output:
            output.write(r.content)
        companies.append(saveFileName)

    # Concatenate once; re-concatenating inside the loop copies the growing
    # frame on every pass.
    return pd.concat([pd.read_csv(co) for co in companies])


##############################################################################################
def tickerLookupOats():
    '''Returns all tickers & companies from the OATS webpage in a pandas dataframe.

    The page serves a pipe-delimited text file whose first line is used as
    the header; no column is used as the row index.'''

    url = 'http://oatsreportable.finra.org/OATSReportableSecurities-SOD.txt'
    # pd.DataFrame.from_csv was deprecated and later removed from pandas;
    # read_csv with the same sep/index_col arguments reads the file the same way.
    r = pd.read_csv(url, sep='|', index_col=False)

    return r

##############################################################################################
def tickerLookupYahoo(searchTermString, 
                      exchange_list = ['NYSE','AMEX','NASDAQ'], 
                      return_closest = True, 
                      verbose = False):
    
    '''Given a string of search words (separated by spaces), returns
        a pandas data frame with the responses from Yahoo.  If exchange is set,
        filter results to include only the exchanges listed.  Set to None for any
        exchange.'''

    try:

        searchTermString = searchTermString.replace(' ','+')
        url = 'http://d.yimg.com/aq/autoc?query=' + searchTermString + '&region=US&lang=en-US'
        js = pd.read_json(url)
        r = json_normalize(js.ResultSet.Result)

        if exchange_list is not None:
            r = r[r['exchDisp'].isin(exchange_list)]

        if return_closest:
            distances = [editdistance.eval(searchTermString.replace('+',' '), word) for word in r.name]
            #print distances
            return r[distances == np.min(distances)]

        else:
            return r
    
    except Exception as e:
        if verbose:
            print 'Search term(s) not found: ', e
        return None
    
##############################################################################################
def fetchStockPrices(query_symbol='T', incident_date='2010-06-16',pm_days = 14):
    '''Fetch historical quotes for one symbol around an incident date.

    query_symbol  -- ticker symbol inserted into the Yahoo YQL query.
    incident_date -- 'YYYY-MM-DD' string; the centre of the query window.
    pm_days       -- number of calendar days to include before AND after
                     incident_date.

    Returns the data frame produced by json_normalize from the 'quote' entry
    of the response, or None (after printing the query URL) if the request or
    the parsing fails.

    Raises ValueError if incident_date is not in 'YYYY-MM-DD' format.'''
    incident_date = dt.strptime(incident_date,'%Y-%m-%d')
    # Honour the caller's window size (this used to be hard-coded to 14 days)
    pm_delta = dt_delta(days=pm_days)
    query_start_date = str((incident_date - pm_delta).date())
    query_end_date = str((incident_date + pm_delta).date())

    query_url = '''https://query.yahooapis.com/v1/public/''' + \
    '''yql?q=select%20*%20from%20yahoo.finance.historicaldata%20where%20'''+ \
    '''symbol%20%3D%20%22'''    + query_symbol + '''%22%20and%20'''+ \
    '''startDate%20%3D%20%22''' + query_start_date + '''%22%20and%20'''+ \
    '''endDate%20%3D%20%22'''   + query_end_date +'''%22&format=json&'''+ \
    '''diagnostics=false&env=store%3A%2F%2Fdatatables.org%2Falltableswithkeys'''

    try:
        stock_prices_json = pd.read_json(query_url)
        # The fourth row of the 'query' column holds the results dict.
        # NOTE(review): presumably that row is labelled 'results' -- confirm.
        results = stock_prices_json.iloc[3]['query']
        return json_normalize(results['quote'])
    except Exception:
        # Single-argument print works on Python 2 and 3 and emits the same
        # text as before (two spaces after the URL).
        print(query_url + '  returned no data')
        return None
    
##############################################################################################
def fetchIndexPrices(incident_date='2010-06-16',pm_days = 14):
    '''Fetch prices for three market indices around an incident date.

    Calls fetchStockPrices once per index with the given incident_date and
    pm_days, and returns the three results as a tuple in the order
    (NASDAQ composite, S&P composite, Dow Jones composite).  Each element is
    whatever fetchStockPrices returned for that symbol.'''
    # Yahoo symbols, listed in the order the results are returned
    index_symbols = ('^IXIC', '^GSPC', '^DJI')

    nasdaq_df, sp_df, dj_df = [
        fetchStockPrices(query_symbol = symbol,
                         incident_date = incident_date,
                         pm_days = pm_days)
        for symbol in index_symbols
    ]

    return nasdaq_df, sp_df, dj_df


In [6]:
##############################################################################################
# Load data about breaches
##############################################################################################

raw_data = pd.read_csv('breaches.csv', header=0)

df = pd.DataFrame()
df['Name'] = raw_data.Name
# The date text is taken from the second column of the file.  Select it by
# position explicitly: raw_data[[1]] depended on an old positional fallback
# (it raises KeyError on current pandas) and yielded a one-column DataFrame
# rather than a Series.
df['Date_String'] = raw_data.iloc[:, 1]
df['Place'] = raw_data.Location

## Get rid of rows without dates
df = df.dropna(axis=0, subset=['Date_String'])

#Convert dates to computer-friendly format (e.g. "June 16, 2010");
# strptime raises ValueError on any string that does not match.
df['Date'] = [dt.strptime(ds, '%B %d, %Y') for ds in df.Date_String]

IOError: File breaches.csv does not exist

In [5]:
##############################################################################################
# Check each breached organization name against the Yahoo Finance ticker symbol lookup API
#  and store results.   Many organizations have several possible matches which will need to be 
#  worked through by hand.
##############################################################################################

try:
    # Attempt to load the data from disk, first
    hits = pickle.load(open('yahoo_hits.pickle'))
    
except:    
    hits = []
    print 
    for company_index in range(len(df.Name)):
        try:
            bork =  tickerLookupYahoo(df.iloc[company_index].Name,return_closest=False)
            if bork is not None:
                print bork.symbol[1]
                hits.append(bork)
            else:
                hits.append(None)

        except Exception as e:
            hits.append(None)

        print '\r'+str(len(hits))+' of '+str(len(df.Name)),

        if len(hits) > 5000:
            break
    pickle.dump(hits,open('yahoo_hits.pickle','w'))
        
df['yahoo_hits']=hits




NameError: name 'df' is not defined

In [None]:
##############################################################################################
# Add tickers to companies which have them in df and discard the rest.  
##############################################################################################

# Load hand-edited ticker file
# Load hand-edited ticker file (pipe-delimited, no header row).
# pd.DataFrame.from_csv was deprecated and later removed from pandas;
# read_csv with the same arguments reads the file the same way.
tickers = pd.read_csv('corrFileEditedUnique.csv', sep='|', index_col=False, header=None)
tickers.columns = ['breach_company_name','yahoo_symbol','yahoo_company_name','exchange']

# Strip extra spaces off the names of companies so the merge keys match
df.Name = df.Name.str.strip()
tickers.breach_company_name = tickers.breach_company_name.str.strip()
tickers.yahoo_company_name = tickers.yahoo_company_name.str.strip()
tickers.yahoo_symbol = tickers.yahoo_symbol.str.strip()

# Merge yahoo ticker data with breach data; the inner join discards breaches
# that have no entry in the ticker file.
merged = pd.merge(df, tickers, how='inner', on=None, left_on='Name', right_on='breach_company_name',
      left_index=False, right_index=False, sort=True)

# Get rid of uninteresting columns
merged = merged.drop(['breach_company_name', 'yahoo_hits'], axis=1)

# Fetch the prices around each incident.  priceList has one entry per row of
# merged, in row order; an entry is None when the fetch returned no data.
# (The initialisation used to be fused into a comment line, which left
# priceList undefined.)
priceList = []
for thisDate, thisSym in zip(merged.Date, merged.yahoo_symbol):
    # str(Timestamp) looks like 'YYYY-MM-DD HH:MM:SS'; keep only the date part
    thisDate = str(thisDate).split()[0]
    thisSym = str(thisSym)
    priceList.append(fetchStockPrices(query_symbol = thisSym, incident_date = thisDate))

# Save for posterity.
# NOTE: the file holds a pickle despite its .csv name; the name is kept
# unchanged so anything that already reads this file keeps working.
with open('public_companies.csv','wb') as output:
    pickle.dump(merged, output)




In [1]:
##############################################################################################
# Add results to data frame & get rid of incidents w/o data (these were probably
#  not publicly traded at the time of the incident announcement)
##############################################################################################

# full_df is the same object as merged here, so the new 'prices' column is
# added to merged as well.
full_df = merged
full_df['prices'] = priceList

# Keep only the incidents for which a price table was actually fetched.
keeps = [prices is not None for prices in full_df['prices']]
full_df = full_df[keeps]

# Display one row as a spot check
full_df.iloc[40]


NameError: name 'merged' is not defined

In [None]:
from datetime import date
    
    
def plotBreachChart(full_df, current_incident=0, include_index = 'sp'):
    '''Plot one company's closing prices around a breach announcement.

    full_df          -- frame with 'prices', 'Date', 'yahoo_company_name' and
                        'yahoo_symbol' columns; each 'prices' entry is a price
                        table with 'Date' ('YYYY-MM-DD' strings) and 'Close'.
    current_incident -- index label (not position) of the incident in full_df.
    include_index    -- 'sp' overlays the S&P 500 close on a right-hand axis;
                        any other value plots the stock alone.

    Draws on a new figure with a dashed vertical line at the first trading day
    on or after the incident.  Returns nothing.'''
    
    f = plt.figure()
    ax = plt.gca()

    # Data are returned with newest first -- flip to chronological order and
    # renumber the rows so that row label == x position everywhere below.
    this_prices_df = full_df.prices[current_incident].iloc[::-1].reset_index(drop=True)
    x_positions = list(range(len(this_prices_df)))

    # Close is converted to float before plotting (the t-test cell does the
    # same); recent matplotlib treats string values as categories.
    ax.plot(x_positions, this_prices_df.Close.astype(float), '.-')
    ax.set_ylabel('Share Price at Close ($)')
    plt.xticks(x_positions, np.array(this_prices_df.Date))
    f.autofmt_xdate(rotation=90)

    # Find the appropriate place to draw a reference line: the first trading
    # day on or after the incident ('>=' because incidents don't always
    # happen on a day with market data).
    incident_day = full_df.Date[current_incident].strftime('%Y-%m-%d')
    on_or_after_incident = list(this_prices_df.Date >= incident_day)
    if True in on_or_after_incident:
        incident_index = on_or_after_incident.index(True)
        ax.axvline(incident_index, color='m', linestyle='--')

    title_text='Stock Price Before and \nAfter Breach Announcement'
    title_text += '\nCompany: '+ full_df.yahoo_company_name[current_incident] 
    title_text += ' (' + full_df.yahoo_symbol[current_incident] + ')'
    ax.set_title(title_text)
    ax.set_xlabel('Date')
    
    # Draw the index fund on the right vertical
    if include_index == 'sp':
        nasdaq, sp, dj = fetchIndexPrices(incident_date=incident_day)
        # fetchStockPrices returns None when the query fails; skip the overlay
        if sp is not None:
            ax2 = ax.twinx()
            # Place every index quote at the x position of the stock quote
            # with the same date, so both lines share one time axis.
            date_to_x = dict(zip(this_prices_df.Date, x_positions))
            sp = sp.iloc[::-1]
            sp = sp[sp.Date.isin(list(date_to_x))]
            sp_positions = [date_to_x[day] for day in sp.Date]
            ax2.plot(sp_positions, sp.Close.astype(float), '.-', color='0.75')
            ax2.set_ylabel('S&P 500 Composite at Close ($)')

#f = plt.gcf()
#plt.figure(num=None, figsize=(10, 15), dpi=80, facecolor='w', edgecolor='k')

# Chart only the first three incidents.  full_df.index holds labels that are
# not necessarily sequential, so the number of charts drawn is tracked in
# counter rather than read off current_incident.
counter = 0
for current_incident in full_df.index[:3]:
    plotBreachChart(full_df, current_incident)
    counter += 1


plt.show()

In [None]:
from scipy.stats import ttest_ind


ups = 0
downs = 0
nones = 0
#def breach_t_test():
for current_incident in full_df.index:
    #current_incident=33
    this_prices_df = full_df.prices[current_incident]
    this_prices_df = this_prices_df.iloc[::-1] # data are returned with newest first--fix
    this_prices_df.Close.apply(str)
    before = np.array(this_prices_df.Close.loc[:10],dtype=float)
    after = np.array(this_prices_df.Close.loc[10:],dtype=float)

    s,p = ttest_ind(before, after)
    
    if (p < 0.05) and (np.mean(before)>np.mean(after)):
        #print p, np.mean(before), np.mean(after), 'Significant Drop'
        downs += 1
    elif (p < 0.05) and (np.mean(before)<np.mean(after)):
          #print p, np.mean(before), np.mean(after), 'Significant Rise'
        ups += 1
    else:
        #print p, np.mean(before), np.mean(after), 'No effect'
        nones += 1

print 'Sig Rise: ', float(ups)/len(full_df.index)
print 'Sig Drop: ', float(downs)/len(full_df.index)
print 'No Change: ',float(nones)/len(full_df.index)