# Program Header

In [22]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from header import *

In [23]:
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Data Understanding](#DataUnderstanding)
2. [Clean](#Clean)
3. [Merge](#Merge)
5. [Save](#Save)

## Import <a name="Import"></a>

In [24]:
raw_num_data = pd.read_csv(data_root+'02-build/raw/2019q3/num.csv')
raw_pre_data = pd.read_csv(data_root+'02-build/raw/2019q3/pre.csv')
raw_sub_data = pd.read_csv(data_root+'02-build/raw/2019q3/sub.csv')
raw_tag_data = pd.read_csv(data_root+'02-build/raw/2019q3/tag.csv')

## Data Understanding <a name="DataUnderstanding"></a>

Useful columns based on reading the readme.txt:
* **num**: Numeric data and actual values associated to `adsh-ddate-tag-etc`. All columns are relevant
* **pre**: Location of tags and a preferred label (`plabel`). `plabel` (and joining columns) seems relevant
* **sub**: Summary information about submission. Business-specific information including industry and company name are relevant
* **tag**: Tag-level dataset. `doc` seems to be a more descriptive `plabel` from **pre** dataset

## Clean <a name="Clean"></a>
Clean the individual datasets to prepare for the merge

In [25]:
raw_num_data.head()

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,footnote
0,0001625376-19-000017,EntityPublicFloat,dei/2014,,20180430,0,USD,0.0,
1,0000034563-19-000064,DerivativeNonmonetaryNotionalAmount,invest/2013,,20190630,0,lb,48183000.0,
2,0000034563-19-000064,DerivativeNonmonetaryNotionalAmount,invest/2013,,20180630,0,lb,43459000.0,
3,0001370946-19-000033,DerivativeNonmonetaryNotionalAmount,invest/2013,,20190630,0,MMBTU,2.0,
4,0000225648-19-000108,DerivativeNonmonetaryNotionalAmount,invest/2013,,20190630,0,MW,4300000.0,


In [26]:
raw_pre_data.head()

Unnamed: 0,adsh,report,line,stmt,inpth,rfile,tag,version,plabel,negating
0,0001625376-19-000017,1,9,CP,0,H,EntityPublicFloat,dei/2014,Entity Public Float,0
1,0001625376-19-000017,1,14,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0
2,0001625376-19-000019,1,11,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0
3,0001766016-19-000005,1,10,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0
4,0001047469-19-004442,1,6,CP,0,H,DocumentFiscalYearFocus,dei/2014,Document Fiscal Year Focus,0


In [27]:
raw_sub_data.head()

Unnamed: 0,adsh,cik,name,sic,countryba,stprba,cityba,zipba,bas1,bas2,...,period,fy,fp,filed,accepted,prevrpt,detail,instance,nciks,aciks
0,0000002178-19-000086,2178,"ADAMS RESOURCES & ENERGY, INC.",5172.0,US,TX,HOUSTON,77027,17 S. BRIAR HOLLOW LN.,,...,20190630,2019.0,Q2,20190807,2019-08-07 17:21:00.0,0,1,ae-20190630_htm.xml,1,
1,0000002488-19-000104,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,...,20190630,2019.0,Q2,20190731,2019-07-31 16:27:00.0,0,1,amd0629201910q_htm.xml,1,
2,0000002488-19-000113,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,...,20190731,2019.0,Q3,20190808,2019-08-08 16:10:00.0,0,0,amdform8k08082019_htm.xml,1,
3,0000002488-19-000144,2488,ADVANCED MICRO DEVICES INC,3674.0,US,CA,SANTA CLARA,95054,2485 AUGUSTINE DRIVE,,...,20190831,2019.0,Q3,20190906,2019-09-06 16:11:00.0,0,0,amdform8kequitysalesse_htm.xml,1,
4,0000002969-19-000042,2969,AIR PRODUCTS & CHEMICALS INC /DE/,2810.0,US,PA,ALLENTOWN,18195-1501,7201 HAMILTON BLVD,,...,20190630,2019.0,Q3,20190725,2019-07-25 13:02:00.0,0,1,apd-10qx30jun2019_htm.xml,1,


In [28]:
raw_tag_data.head()

Unnamed: 0,tag,version,custom,abstract,datatype,iord,crdr,tlabel,doc
0,EntityPublicFloat,dei/2014,0,0,monetary,I,C,Entity Public Float,State aggregate market value of voting and non...
1,DocumentFiscalYearFocus,dei/2014,0,0,gYear,D,,Document Fiscal Year Focus,This is focus fiscal year of the document repo...
2,DocumentPeriodEndDate,dei/2014,0,0,date,D,,Document Period End Date,The end date of the period reflected on the co...
3,CurrentFiscalYearEndDate,dei/2014,0,0,gMonthDay,D,,Current Fiscal Year End Date,End date of current fiscal year in the format ...
4,InvestmentAxis,invest/2013,0,1,axis,,,Investment [Axis],"A categorization of investments (securities, d..."


## Clean <a name="Clean"></a>

In [29]:
pre_data = raw_num_data\
    .filter(['adsh', 'tag', 'version', 'plabel'], axis = 'columns')\
    .drop_duplicates()

In [30]:
sub_data = raw_sub_data\
    .filter(['adsh', 'cik', 'name', 'sic', 'countryba', 'period', 'form', 'fye', 'accepted', 'instance'], axis = 'columns')\
    .rename(mapper = {'name':'company_name', 'countryba':'country'}, axis = 'columns')\
    .query("form in ['10-Q', '10-K']") # Keep only the quarterly and annual filings

In [31]:
tag_data = raw_tag_data\
    .filter(['tag', 'version', 'doc'])\
    .drop_duplicates()

In [32]:
tag_data['doc'][0]

"State aggregate market value of voting and non-voting common equity held by non-affiliates computed by reference to price at which the common equity was last sold, or average bid and asked price of such common equity, as of the last business day of registrant's most recently completed second fiscal quarter. The public float should be reported on the cover page of the registrants form 10K."

In [33]:
# Get rid of coregister/parent company
# Keep only the balance sheet and single quarter flows

num_data = raw_num_data\
    .drop('footnote', axis = 'columns')\
    .assign(no_coreg = pd.isna(raw_num_data['coreg']))\
    .assign(no_value = pd.isna(raw_num_data['value']))\
    .assign(ddate = pd.to_datetime(raw_num_data['ddate'], format='%Y%m%d', errors = 'coerce'))\
    .query("(no_coreg == True) & (no_value == False) & (ddate > '2018-01-01')")\
    .query("(qtrs in (0,1))")\
    .query("uom in ('USD', 'shares', 'pure')")

In [34]:
num_data.shape

(1273597, 10)

See if we have **num_data** observations at the `adsh, tag, ddate` level

In [35]:
multiple_matches = num_data.groupby(['adsh', 'tag', 'ddate'], as_index = False)\
    .count()\
    .sort_values('value', ascending = False)\
    .query("value > 1")
duplicate_causing_tags = multiple_matches.tag.unique()

In [36]:
print("%.2f%% tags create duplicates"% round(len(duplicate_causing_tags)/len(num_data.tag.unique())*100,2))

0.20% tags create duplicates


Since only a small percent of tags are trouble, we'll just remove them for now.

In [37]:
num_data = num_data.query("tag not in @duplicate_causing_tags")

Note for checking: These tags may just be variations of other important tags. Investigate if we're dropping anything important. Needs further investigation during checking process.

Now, we have unique rows for each `adsh, tag, ddate`.

In [38]:
multiple_matches = num_data.groupby(['adsh', 'tag', 'ddate'], as_index = False)\
    .count()\
    .sort_values('value', ascending = False)\
    .query("value > 1")
multiple_matches

Unnamed: 0,adsh,tag,ddate,version,coreg,qtrs,uom,value,no_coreg,no_value


## Merge <a name="Merge"></a>

In [39]:
num_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value'],
      dtype='object')

In [40]:
sec_data = num_data.merge(pre_data, on = ['adsh', 'tag', 'version'])\
    .merge(sub_data, on = ['adsh'])\
    .merge(tag_data, on = ['tag', 'version'])

In [41]:
sec_data

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,no_coreg,no_value,cik,company_name,sic,country,period,form,fye,accepted,instance,doc
0,0001326771-19-000035,DerivativeNotionalAmount,invest/2013,,2019-06-30,0,USD,3.024480e+10,True,False,1326771,FEDERAL HOME LOAN BANK OF CINCINNATI,6111.0,US,20190630,10-Q,1231.0,2019-08-08 12:19:00.0,fhlbcinq2201910-q_htm.xml,Aggregate notional amount specified by the der...
1,0001326771-19-000035,DerivativeNotionalAmount,invest/2013,,2018-12-31,0,USD,1.380677e+10,True,False,1326771,FEDERAL HOME LOAN BANK OF CINCINNATI,6111.0,US,20190630,10-Q,1231.0,2019-08-08 12:19:00.0,fhlbcinq2201910-q_htm.xml,Aggregate notional amount specified by the der...
2,0001392091-19-000074,DerivativeNotionalAmount,invest/2013,,2019-06-30,0,USD,0.000000e+00,True,False,1392091,"BLUEKNIGHT ENERGY PARTNERS, L.P.",4610.0,US,20190630,10-Q,1231.0,2019-08-08 16:25:00.0,bkep-20190630.xml,Aggregate notional amount specified by the der...
3,0001558370-19-007743,DerivativeNotionalAmount,invest/2013,,2018-12-31,0,USD,7.059350e+08,True,False,1527590,READY CAPITAL CORP,6798.0,US,20190630,10-Q,1231.0,2019-08-08 17:02:00.0,rc-20190630.xml,Aggregate notional amount specified by the der...
4,0001558370-19-007743,DerivativeNotionalAmount,invest/2013,,2019-06-30,0,USD,9.058640e+08,True,False,1527590,READY CAPITAL CORP,6798.0,US,20190630,10-Q,1231.0,2019-08-08 17:02:00.0,rc-20190630.xml,Aggregate notional amount specified by the der...
5,0000877860-19-000070,DerivativeNotionalAmount,invest/2013,,2019-06-30,0,USD,6.100000e+08,True,False,877860,NATIONAL HEALTH INVESTORS INC,6798.0,US,20190630,10-Q,1231.0,2019-08-07 19:01:00.0,nhi-6302019x10q_htm.xml,Aggregate notional amount specified by the der...
6,0001254699-19-000011,DerivativeNotionalAmount,invest/2013,,2019-06-30,0,USD,1.250000e+08,True,False,1254699,QVC INC,5961.0,,20190630,10-Q,1231.0,2019-08-08 16:49:00.0,qvc-20190630.xml,Aggregate notional amount specified by the der...
7,0001331465-19-000121,DerivativeNotionalAmount,invest/2013,,2019-06-30,0,USD,6.435700e+10,True,False,1331465,FEDERAL HOME LOAN BANK OF ATLANTA,6111.0,US,20190630,10-Q,1231.0,2019-08-08 11:31:00.0,fhlb-atlq22019_htm.xml,Aggregate notional amount specified by the der...
8,0001331465-19-000121,DerivativeNotionalAmount,invest/2013,,2018-12-31,0,USD,5.951800e+10,True,False,1331465,FEDERAL HOME LOAN BANK OF ATLANTA,6111.0,US,20190630,10-Q,1231.0,2019-08-08 11:31:00.0,fhlb-atlq22019_htm.xml,Aggregate notional amount specified by the der...
9,0001518715-19-000167,DerivativeNotionalAmount,invest/2013,,2019-06-30,0,USD,4.103995e+09,True,False,1518715,"HOMESTREET, INC.",6022.0,US,20190630,10-Q,1231.0,2019-08-08 12:51:00.0,hmst-20190630x10q_htm.xml,Aggregate notional amount specified by the der...


## Save <a name="Save"></a>
Create CSV versions in build/raw

In [42]:
sec_data.to_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv', index = False)