# Program Header

In [30]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from header import *

In [31]:
data_root = initialize_data_root("AY")

In [32]:
import math

# Table of contents
1. [Import](#Import)
2. [Data Understanding](#DataUnderstanding)
3. [Clean](#Clean)
4. [Merge](#Merge)
5. [Further Cleaning](#FurtherCleaning)
6. [Save](#Save)

## Import <a name="Import"></a>

In [33]:
raw_num_data = pd.read_csv(data_root+'02-build/raw/2019q3/num.csv')
raw_pre_data = pd.read_csv(data_root+'02-build/raw/2019q3/pre.csv')
raw_sub_data = pd.read_csv(data_root+'02-build/raw/2019q3/sub.csv')
raw_tag_data = pd.read_csv(data_root+'02-build/raw/2019q3/tag.csv')

## Data Understanding <a name="DataUnderstanding"></a>

Useful columns based on reading the readme.txt:
* **num**: Numeric data and actual values associated to `adsh-ddate-tag-etc`. All columns are relevant
* **pre**: Location of tags and a preferred label (`plabel`). `plabel` (and joining columns) seems relevant
* **sub**: Summary information about submission. Business-specific information including industry and company name are relevant
* **tag**: Tag-level dataset. `doc` seems to be a more descriptive `plabel` from **pre** dataset

## Clean <a name="Clean"></a>
Clean the individual datasets to prepare for the merge

In [34]:
raw_num_data.head()

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote
0,0001625376-19-000017,EntityPublicFloat,dei/2014,,20180430,0,USD,0.0,
1,0000034563-19-000064,DerivativeNonmonetaryNotionalAmount,invest/2013,,20190630,0,lb,48183000.0,
2,0000034563-19-000064,DerivativeNonmonetaryNotionalAmount,invest/2013,,20180630,0,lb,43459000.0,
3,0001370946-19-000033,DerivativeNonmonetaryNotionalAmount,invest/2013,,20190630,0,MMBTU,2.0,
4,0000225648-19-000108,DerivativeNonmonetaryNotionalAmount,invest/2013,,20190630,0,MW,4300000.0,


In [35]:
raw_pre_data.head()

Unnamed: 0,adsh,report,line,stmt,inpth,rfile,tag,version,plabel,negating
0,0001625376-19-000017,1,9,CP,0,H,EntityPublicFloat,dei/2014,Entity Public Float,0
1,0001625376-19-000017,1,14,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0
2,0001625376-19-000019,1,11,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0
3,0001766016-19-000005,1,10,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0
4,0001047469-19-004442,1,6,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0


In [36]:
raw_sub_data.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
0,0000002178-19-000086,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,...,20190630,2019.0,Q2,20190807,2019-08-07 17:21:00.0,0,1,ae-20190630_htm.xml,1,
1,0000002488-19-000104,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,...,20190630,2019.0,Q2,20190731,2019-07-31 16:27:00.0,0,1,amd0629201910q_htm.xml,1,
2,0000002488-19-000113,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,...,20190731,2019.0,Q3,20190808,2019-08-08 16:10:00.0,0,0,amdform8k08082019_htm.xml,1,
3,0000002488-19-000144,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,...,20190831,2019.0,Q3,20190906,2019-09-06 16:11:00.0,0,0,amdform8kequitysalesse_htm.xml,1,
4,0000002969-19-000042,2969,AIR PRODUCTS & CHEMICALS INC /DE/,2810.0,US,PA,ALLENTOWN,18195-1501,7201 HAMILTON BLVD,,...,20190630,2019.0,Q3,20190725,2019-07-25 13:02:00.0,0,1,apd-10qx30jun2019_htm.xml,1,


In [37]:
raw_tag_data.head()

Unnamed: 0,tag,version,custom,abstract,datatype,iord,crdr,tlabel,doc
0,EntityPublicFloat,dei/2014,0,0,monetary,I,C,Entity Public Float,State aggregate market value of voting and non...
1,DocumentFiscalYearFocus,dei/2014,0,0,gYear,D,,Document Fiscal Year Focus,This is focus fiscal year of the document repo...
2,DocumentPeriodEndDate,dei/2014,0,0,date,D,,Document Period End Date,The end date of the period reflected on the co...
3,CurrentFiscalYearEndDate,dei/2014,0,0,gMonthDay,D,,Current Fiscal Year End Date,End date of current fiscal year in the format ...
4,InvestmentAxis,invest/2013,0,1,axis,,,Investment [Axis],"A categorization of investments (securities, d..."


## Clean <a name="Clean"></a>

In [38]:
pre_data = raw_num_data\
    .filter(['adsh', 'tag', 'version', 'plabel'], axis = 'columns')\
    .drop_duplicates()

In [39]:
sub_data = raw_sub_data\
    .filter(['adsh', 'cik', 'name', 'sic', 'countryba', 'period', 'form', 'fye', 'accepted', 'instance'], axis = 'columns')\
    .rename(mapper = {'name':'company_name', 'countryba':'country'}, axis = 'columns')\
    .assign(period = pd.to_datetime(raw_sub_data['period'], format='%Y%m%d', errors = 'coerce'))\
    .assign(dyear = raw_num_data['ddate'].map(lambda x: math.floor(x/10000)))\
    .query("form in ['10-Q', '10-K']") # Keep only the quarterly and annual filings

In [40]:
tag_data = raw_tag_data\
    .filter(['tag', 'version', 'doc'])\
    .drop_duplicates()

In [41]:
tag_data['doc'][0]

"State aggregate market value of voting and non-voting common equity held by non-affiliates computed by reference to price at which the common equity was last sold, or average bid and asked price of such common equity, as of the last business day of registrant's most recently completed second fiscal quarter. The public float should be reported on the cover page of the registrants form 10K."

In [87]:
# Get rid of coregister/parent company
# Keep only the balance sheet and single quarter flows

num_data = raw_num_data\
    .drop('footnote', axis = 'columns')\
    .assign(no_coreg = pd.isna(raw_num_data['coreg']))\
    .assign(no_value = pd.isna(raw_num_data['value']))\
    .assign(ddate = pd.to_datetime(raw_num_data['ddate'], format = '%Y%m%d', errors = 'coerce'))\
    .query("(no_coreg == True) & (no_value == False)")\
    .query("(qtrs in (0,1))")\
    .query("uom in ('USD', 'shares', 'pure')")\
    .query("(ddate >= '2019-01-01')&(ddate <= '2019-12-31')")

In [43]:
num_data.shape

(697104, 10)

## Merge <a name="Merge"></a>

In [44]:
num_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value'],
      dtype='object')

In [63]:
sec_data = num_data.merge(pre_data, on = ['adsh', 'tag', 'version'])\
    .merge(sub_data, on = ['adsh'])\
    .merge(tag_data, on = ['tag', 'version'])\
    .reset_index(drop = T)

## Further Cleaning  <a name="FurtherCleaning"></a>

We need to decide whether to use the `period` as our variable date or `ddate` as our variable date. According to the readme, `ddate` is more accurate. However, it is also much more unstable and some `ddates` even have impossible values, i.e. 2021 shares outstanding values for 2019q3 data. We've decided to keep the maximum `ddate` observation within an `adsh, tag, period` observation.

See if we have **sec_data** observations at the `adsh, tag, period` level

In [64]:
multiple_matches = sec_data.groupby(['adsh', 'tag', 'period'], as_index = False)\
    .count()\
    .sort_values('value', ascending = False)\
    .query("value > 1")

In [66]:
multiple_matches

Unnamed: 0,adsh,tag,period,index,version,coreg,ddate,qtrs,uom,value,...,cik,company_name,sic,country,form,fye,accepted,instance,dyear,doc
260680,0001476045-19-000069,CommonStockDividendsPerShareDeclared,2019-06-30,8,8,0,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
260743,0001476045-19-000069,RegularDistributionsOnLongTermInvestmentPlan,2019-06-30,8,8,0,8,8,8,8,...,8,8,8,8,8,8,8,8,8,8
382950,0001564590-19-031281,StockRepurchasedAndRetiredDuringPeriodShares,2019-06-30,6,6,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
382946,0001564590-19-031281,SharePrice,2019-06-30,6,6,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
45433,0000726728-19-000079,CommonStockDividendsPerShareCashPaid,2019-06-30,6,6,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
381038,0001564590-19-031229,StockRepurchasedDuringPeriodShares,2019-06-30,6,6,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
381036,0001564590-19-031229,StockRedeemedOrCalledDuringPeriodShares,2019-06-30,6,6,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
381034,0001564590-19-031229,SharePrice,2019-06-30,6,6,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
60389,0000826675-19-000046,CommonStockDividendsPerShareDeclared,2019-06-30,6,6,0,6,6,6,6,...,6,6,6,6,6,6,6,6,6,6
458766,0001717547-19-000014,CommonStockDividendsPerShareDeclared,2019-06-30,5,5,0,5,5,5,5,...,5,5,5,5,5,5,5,5,5,5


We'll keep only the max `ddate` within each `adsh, tag, period`.

In [65]:
sec_data = sec_data.loc[sec_data.groupby(['adsh', 'tag', 'period'], as_index = False)['ddate'].idxmax()]

In [67]:
multiple_matches = sec_data.groupby(['adsh', 'tag', 'period'], as_index = False)\
    .count()\
    .sort_values('value', ascending = False)\
    .query("value > 1")

In [68]:
multiple_matches

Unnamed: 0,adsh,tag,period,index,version,coreg,ddate,qtrs,uom,value,...,cik,company_name,sic,country,form,fye,accepted,instance,dyear,doc


In [78]:
old_rows = raw_num_data.shape[0]
new_rows = sec_data.shape[0]

print("We've gone from ", old_rows, " to ", new_rows, " observations.")
print("That is a ",math.floor((1-new_rows/old_rows)*100),"% decrease.", sep = "")

We've gone from  2325267  to  465552  observations.
That is a 79% decrease.


The issue we will tackle in the next part will be that there are too many tags to reshape the data easily:

In [85]:
print("Raw version tags:",len(raw_num_data.tag.unique()))
print("Cleaned version tags:",len(sec_data.tag.unique()))

Raw version tags: 120892
Cleaned version tags: 19300


## Save <a name="Save"></a>
Create CSV versions in build/raw

In [80]:
sec_data.columns

Index(['index', 'adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom',
       'value', 'no_coreg', 'no_value', 'cik', 'company_name', 'sic',
       'country', 'period', 'form', 'fye', 'accepted', 'instance', 'dyear',
       'doc'],
      dtype='object')

In [79]:
sec_data.to_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv', index = False)