# Data Explorer

This document provides a way to explore the inner contents of the Compustat and CRSP datasets. You should use this tool before working with a particular variable to make sure that the data "looks right".

In [1]:
import pandas as pd
from pandas.tseries.offsets import MonthEnd
import numpy as np
from beakerx import *
from beakerx.object import beakerx
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sci
from copy import copy
from IPython.display import HTML
from multiprocessing import Pool, cpu_count
import sys
sys.path.append('../Code/')
from utils import *

In [35]:
# Read the merged dataset from the HDF5 file and preview its first rows.
merged = pd.read_hdf('../Output/merged.h5')
merged.head()

## Some Utility Functions

In [4]:
def find_permcos(dataframe, ticker):
    """
    List the candidate Permcos for a ticker symbol.

    Rows whose 'Ticker' equals ``ticker`` are re-indexed by 'Permco' (through the
    ``safe_index`` method, which is defined outside this notebook), collapsed to one
    row per Permco with ``groupby(...).last()``, and then ordered by 'datadate',
    newest first.

    :param dataframe -- frame with a 'Ticker' column that also provides ``safe_index``
    :param ticker -- ticker symbol to match exactly
    :return -- a dataframe indexed by Permco, one row per matching company
    """
    ticker_rows = dataframe.loc[dataframe['Ticker'] == ticker].safe_index(['Permco'])
    # NOTE(review): last() keeps the final non-null value of each column in row order,
    # so this is the most recent observation only if the rows are already in date
    # order -- confirm against how the merged file is written.
    latest_per_permco = ticker_rows.groupby(by=['Permco']).last()
    return latest_per_permco.sort_values(['datadate'], ascending=False)

def smart_permco(dataframe, ticker):
    """
    Pick the top-ranked Permco for a ticker.

    Takes the first row returned by ``find_permcos`` and reports its index value
    together with its 'Company Name.crsp' entry.

    :param dataframe -- frame accepted by ``find_permcos``
    :param ticker -- ticker symbol to look up
    :return -- dict with keys 'Permco' and 'Name'
    """
    candidates = find_permcos(dataframe, ticker)
    best_permco = candidates.index[0]
    best_name = candidates['Company Name.crsp'].values[0]
    return {'Permco': best_permco, 'Name': best_name}

# List every candidate Permco for the 'FB' ticker.
find_permcos(merged, 'FB')

In [5]:
# Show the single Permco/name pair that smart_permco selects for 'FB'.
smart_permco(merged, 'FB')

{'Permco': 54084, 'Name': 'FACEBOOK INC'}

## Plotting Functions

In [22]:
# Build a visualizer

def simple_plot(dataframe, variable, plot_title, **kwargs):
    """
    Draw one column of ``dataframe`` as a single-line time plot.

    :param dataframe -- a dataframe with a multi-index, the second level of which is the date variable
    :param variable -- the column to plot; also used as the line's legend label
    :param plot_title -- the title to use for the plot
    :param kwargs -- extra keyword arguments forwarded unchanged to TimePlot
    :return -- the TimePlot with the line added
    """
    chart = TimePlot(
        title=plot_title,
        legendLayout=LegendLayout.HORIZONTAL,
        legendPosition=LegendPosition(position=LegendPosition.Position.TOP),
        initWidth=500,
        **kwargs
    )
    # The x values come from index level 1 (the date level, per the contract above).
    dates = dataframe.index.get_level_values(1)
    series = Line(displayName=variable, x=dates, y=dataframe[variable])
    chart.add(series)
    return chart

def price_and_market_cap(dataframe, permco, company_name):
    """
    Build a three-panel overview for one company: cumulative return, market cap and volume.

    :param dataframe -- frame whose index has a 'Permco' level and which contains the
        'Cumulative Return', 'Market Cap (Billions, CRSP)' and
        'Volume (% of Market Cap, 3mma)' columns
    :param permco -- Permco of the company to plot
    :param company_name -- company name appended to the first chart's title
    :return -- an OutputContainer holding the three plots
    """
    # drop_level=False keeps the 'Permco' level, so simple_plot still finds the
    # level it reads at index position 1.
    target_view = dataframe.xs(permco, level = 'Permco', drop_level = False)

    lg = GridOutputContainerLayoutManager(3)
    og = OutputContainer()
    og.setLayoutManager(lg)
    og.addItem(simple_plot(target_view, 'Cumulative Return', 'Log Cumulative Return of ' + company_name))
    # Natural-log y axis. The keyword is spelled yLogBase, matching
    # market_cap_reconciliation; the earlier spelling (logYBase) does not appear to be
    # a recognised plot option, so the requested base was presumably never applied.
    og.addItem(simple_plot(target_view, 'Market Cap (Billions, CRSP)', 'Market Cap', logY = True, yLogBase = np.exp(1)))
    og.addItem(simple_plot(target_view, 'Volume (% of Market Cap, 3mma)', 'Volume as % of Market Cap'))
    return og

def smart(function):
    """
    Adapt a ``(dataframe, permco, company_name)`` function so it can be called with a ticker.

    The returned wrapper resolves the ticker with ``smart_permco`` and forwards the
    resulting Permco and company name to ``function``.

    :param function -- callable taking (dataframe, permco, company_name)
    :return -- callable taking (dataframe, ticker)
    """
    def smart_function(dataframe, ticker):
        best_match = smart_permco(dataframe, ticker)
        permco, company_name = best_match['Permco'], best_match['Name']
        return function(dataframe, permco, company_name)

    return smart_function
    

In [33]:
# Candidate Permcos for the 'FB' ticker.
find_permcos(merged, 'FB')

In [32]:
# Return, market-cap and volume panels for the Permco that smart_permco picks for 'FB'.
smart(price_and_market_cap)(merged, 'FB')

GridView(children=(BeakerxHBox(children=(TimePlot(model={'chart_title': 'Log Cumulative Return of FACEBOOK INC…

## Reconciling Compustat and CRSP Market Cap

In [9]:
# Slice out every row for Permco 7 (presumably Apple, going by the variable name -- TODO confirm).
# NOTE(review): `aapl` is not referenced again in the cells visible here.
aapl = merged.xs(7, level = 'Permco', drop_level = False)

In [19]:
def market_cap_reconciliation(dataframe, permco, company_name):
    """
    Overlay the CRSP and Compustat market-cap series for one company on a log-scale time plot.

    :param dataframe -- frame whose index has a 'Permco' level (dates at index position 1)
        and which contains 'Market Cap (Billions, CRSP)' and 'Market Cap (Compustat)'
    :param permco -- Permco of the company to plot
    :param company_name -- company name used as the prefix of the chart title
    :return -- a TimePlot with one line per data source
    """
    company_rows = dataframe.xs(permco, level='Permco', drop_level=False)
    chart = TimePlot(
        title=company_name + ' Market Cap Reconciliation',
        legendLayout=LegendLayout.HORIZONTAL,
        legendPosition=LegendPosition(position=LegendPosition.Position.TOP),
        logY=True,
        yLogBase=np.exp(1),
        initWidth=700
    )
    # One line per source, added in the same order as the legend should read.
    sources = (
        ('CRSP', 'Market Cap (Billions, CRSP)'),
        ('Compustat', 'Market Cap (Compustat)'),
    )
    for label, column in sources:
        chart.add(Line(displayName=label,
                       x=company_rows.index.get_level_values(1),
                       y=company_rows[column]))
    return chart

def reconcile_list(dataframe, tickers):
    """
    Chart the CRSP vs. Compustat market-cap reconciliation for several tickers in one grid.

    Delegates to ``three_in_a_row`` rather than repeating its container-building
    loop, so the grid layout is defined in a single place.

    :param dataframe -- frame accepted by ``market_cap_reconciliation``
    :param tickers -- iterable of ticker symbols, each resolved with ``smart_permco``
    :return -- an OutputContainer with one reconciliation plot per ticker
    """
    return three_in_a_row(smart(market_cap_reconciliation))(dataframe, tickers)

def three_in_a_row(chart_function):
    """
    Lift a single-ticker chart function into one that charts a list of tickers.

    The returned function calls ``chart_function(dataframe, ticker)`` for each ticker
    and collects the results in an OutputContainer that uses
    ``GridOutputContainerLayoutManager(3)``.

    :param chart_function -- callable taking (dataframe, ticker) and returning a chart
    :return -- callable taking (dataframe, ticker_list) and returning the container
    """
    def list_charter(dataframe, ticker_list):
        grid_layout = GridOutputContainerLayoutManager(3)
        container = OutputContainer()
        container.setLayoutManager(grid_layout)

        for symbol in ticker_list:
            chart = chart_function(dataframe, symbol)
            container.addItem(chart)

        return container

    return list_charter

# CRSP vs. Compustat market cap for six tickers, laid out by three_in_a_row.
three_in_a_row(smart(market_cap_reconciliation))(merged, ['MSFT', 'AAPL', 'AMZN', 'FB', 'JNJ', 'V'])

GridView(children=(BeakerxHBox(children=(TimePlot(model={'chart_title': 'MICROSOFT CORP Market Cap Reconciliat…

It is clear that I'm not getting all the share classes right, but I think this is OK: the CRSP market cap is the right representation of how much is tradable.

## Making sure the returns look reasonable

In [34]:
def return_plot(dataframe, permco, company_name):
    """
    Plot the 'Cumulative Return' series for one company, titled with its name.

    :param dataframe -- frame whose index has a 'Permco' level and which contains 'Cumulative Return'
    :param permco -- Permco of the company to plot
    :param company_name -- chart title
    :return -- the plot produced by ``simple_plot``
    """
    company_rows = dataframe.xs(permco, level='Permco', drop_level=False)
    return simple_plot(company_rows, 'Cumulative Return', company_name)
    
# Cumulative-return charts for a basket of tickers, laid out by three_in_a_row.
three_in_a_row(smart(return_plot))(merged, ['DOW', 'AAPL', 'AMZN', 'FB', 'JNJ', 'V', 'NFLX', 'DAL', 'MMM'])

GridView(children=(BeakerxHBox(children=(TimePlot(model={'chart_title': 'DOW CHEMICAL CO', 'constant_bands': […