In [1]:
# styling
from IPython import utils  
from IPython.core.display import HTML  
import os  
def css_styling():
    """Load the default custom.css file from the IPython profile.

    Reads ``profile_default/static/custom/custom.css`` below the IPython
    directory and wraps its contents in a ``<style>`` element.

    Returns:
        An IPython ``HTML`` display object holding the stylesheet.

    Raises:
        IOError / OSError: if the stylesheet file cannot be opened.
    """
    try:
        # get_ipython_dir has moved to IPython.paths; reaching it through
        # IPython.utils.path emits a deprecation warning.
        from IPython.paths import get_ipython_dir
    except ImportError:
        # IPython without the IPython.paths module: use the old location.
        get_ipython_dir = utils.path.get_ipython_dir
    base = get_ipython_dir()
    css_path = os.path.join(base, 'profile_default/static/custom/custom.css')
    # Context manager so the file handle is closed once the CSS is read.
    with open(css_path, 'r') as css_file:
        styles = "<style>\n%s\n</style>" % css_file.read()
    return HTML(styles)
# Apply the stylesheet: the HTML object returned here is the cell's output.
css_styling()  

  warn("get_ipython_dir has moved to the IPython.paths module")


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline



In [3]:
# libraries + pandas options
import numpy as np
import scipy as sp
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels import discrete
import re
import regex
import collections
import pandas as pd
import math 
import csv
import time
import dateutil
from datetime import datetime
import seaborn as sns
import json

# Display options for pandas output in this notebook.
pd.set_option('display.width', 1500)
# Fully-qualified key: the bare 'max_colwidth' only resolves through pandas'
# pattern matching and breaks if another option name ever matches it as well.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
# Floats are rendered with thousands separators and three decimals.
pd.set_option('display.float_format', '{:,.3f}'.format)

# Seaborn look: "whitegrid" style with the "poster" context.
sns.set_style("whitegrid")
sns.set_context("poster")

import matplotlib as mpl
import matplotlib.cm as cm
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib import ticker
    

In [4]:
# EventRegistry login
# Credentials are read from the environment instead of being hardcoded in the
# notebook. Set EVENTREGISTRY_EMAIL and EVENTREGISTRY_PASSWORD before starting
# the kernel; a missing variable raises KeyError.
# NOTE(review): the password that used to be embedded in this cell was exposed
# in the notebook source and should be rotated.
# The wildcard import is kept because other cells may rely on names it exports.
from eventregistry import *
er = EventRegistry()
er.login(os.environ["EVENTREGISTRY_EMAIL"], os.environ["EVENTREGISTRY_PASSWORD"])

Event Registry host: http://eventregistry.org


{u'action': u'success', u'desc': u'Login successful'}

# Compute Correlations Between Stock Returns, Absolute Returns, and Volumes

In [65]:
# Load the stock table, then parse its `date` column into pandas datetimes.
stocksdf = pd.read_csv('../data/financial/only_relevant_stocks.csv')
stocksdf['date'] = pd.to_datetime(stocksdf['date'])

In [66]:
# Some rows carry the string 'C' instead of a numeric return. Keep every other
# row, then convert the column to float.
# A boolean mask replaces drop(..., inplace=1): pandas validates that `inplace`
# is a real bool, so the integer 1 is rejected on current versions. Re-binding
# the name also avoids mutating the frame in place.
stocksdf = stocksdf[stocksdf.ret != 'C'].copy()
stocksdf['ret'] = stocksdf['ret'].astype(float)

In [67]:
# Count, per ticker, the rows that repeat an earlier (date, ticker) pair and
# show the 15 tickers with the most repeats.
is_repeat = stocksdf.duplicated(subset=["date", "ticker"])
stocksdf.loc[is_repeat, "ticker"].value_counts().head(15)

TAP     1510
BIO     1510
GEF     1510
HVT     1510
LEN     1510
MKC     1510
CBS     1510
STZ     1510
WSO     1510
TDS      520
AAN      239
DO         9
COLB       8
CBSH       7
NAN        7
Name: ticker, dtype: int64

In [68]:
# How many rows repeat an earlier row on date, ticker, ret AND sprtrn?
# If the (date, ticker) repeats were exact copies, this row count would equal
# the number of (date, ticker) repeats.
exact_repeat = stocksdf.duplicated(subset=["date", "ticker", "ret", "sprtrn"])
stocksdf.loc[exact_repeat].shape

(147, 20)

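Only 147 rows repeat an earlier row on all of `date`, `ticker`, `ret` and `sprtrn`, far fewer than the `(date, ticker)` duplicates counted above, so most duplicated rows carry a different `ret`. Below we still keep only the first row per `(date, ticker)`. TODO: confirm that the first row is the one we want.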

In [90]:
def _wide_by_ticker(long_df, value_col):
    """Turn one value column of a long frame into a date-indexed wide frame.

    The result is indexed by date; its columns keep `value_col` as the outer
    level with one inner column per ticker.
    """
    keyed = long_df[["date", "ticker", value_col]].set_index(["date", "ticker"])
    return keyed.unstack('ticker')


# Long frame with one row per (date, ticker): when a pair occurs more than
# once, only its first row is kept.
temp = stocksdf[["date", "sprtrn", "ticker", "ret", "vol"]].copy()
temp = temp.drop_duplicates(["date", "ticker"])

# Derived columns: return in excess of sprtrn, and absolute return.
# The spelling "abolute_return" is kept as-is because the name becomes a
# column label of `absreturns`.
temp["return_over_market"] = temp["ret"] - temp["sprtrn"]
temp["abolute_return"] = temp["ret"].abs()

# One wide (date x ticker) frame per quantity.
returns = _wide_by_ticker(temp, "return_over_market")
absreturns = _wide_by_ticker(temp, "abolute_return")
volumes = _wide_by_ticker(temp, "vol")



In [91]:
# Total number of missing cells in each wide frame (few in the recorded run).
# print(...) with a single argument prints the same text under Python 2 and is
# also valid Python 3, unlike the bare print statement.
for wide_frame in (returns, volumes, absreturns):
    print(wide_frame.isnull().sum().sum())

19
23
19


In [92]:
%%time
# we run a test to see how long it takes to compute the correlations of 100 columns
return_correlations = returns.corr(method='pearson')
absreturn_correlations = absreturns.corr(method='pearson')
volume_correlations = volumes.corr(method='pearson')

Wall time: 2min 50s


In [94]:
# Write each correlation matrix to its own CSV file under ../data.
for matrix, csv_path in (
    (return_correlations, '../data/return_correlations.csv'),
    (absreturn_correlations, '../data/absreturn_correlations.csv'),
    (volume_correlations, '../data/volume_correlations.csv'),
):
    matrix.to_csv(csv_path)