# Program Header

In [60]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
import jellyfish
from IPython.core.interactiveshell import InteractiveShell
# Show the result of every expression statement in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Make the parent of the current working directory importable -- presumably so that
# the project-level `header` module imported right after this can be found.
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
from header import *

In [23]:
# Prefix that every data path below is built from (it is concatenated with relative
# paths such as '02-build/clean/...'). `initialize_data_root` is not defined in this
# notebook -- presumably it comes from `from header import *`; what "AY" selects is
# not visible here (TODO confirm).
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Data Understanding](#DataUnderstanding)
3. [Clean](#Clean)
4. [Merge](#Merge)
5. [Reshape](#Reshape)
6. [Create Features](#CreateFeatures)
7. [Save](#Save)

## Import <a name="Import"></a>

In [24]:
# Load the long-format SEC table and the tag map from the 2019q3 clean build folder.
# NOTE(review): 'sec_data_long.csv' is the same path the Save section writes to, so this
# cell reads a file produced by an earlier run of this notebook -- confirm that is intended.
# NOTE(review): later cells use raw_num_data, raw_sub_data and raw_tag_data, none of which
# is loaded in the visible cells; they must be loaded for a clean Restart & Run All.
raw_sec_data = pd.read_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv')
raw_tag_map = pd.read_csv(data_root+'02-build/clean/2019q3/tag_map.csv')

## Clean <a name="Clean"></a>

In [None]:
# Work on copies so the frames loaded above stay untouched.
# NOTE(review): `sec_data` is assigned again by the merge cell further down, which
# replaces this copy.
sec_data = raw_sec_data.copy()
tag_map = raw_tag_map.copy()

## Clean <a name="Clean"></a>

In [29]:
# Distinct combinations of the listed columns; used as a lookup table in the merge below.
# NOTE(review): this selects from `raw_num_data`, and the merged column list shown
# further down contains no `plabel` -- confirm whether a different source frame
# (e.g. a presentation table) was intended here.
pre_data = (
    raw_num_data
    .filter(items=['adsh', 'tag', 'version', 'plabel'], axis='columns')
    .drop_duplicates()
)

In [30]:
# Submission-level attributes: keep the columns of interest, give two of them
# friendlier names, and keep only the quarterly and annual filings.
sub_data = (
    raw_sub_data
    .filter(
        items=['adsh', 'cik', 'name', 'sic', 'countryba', 'period', 'form', 'fye', 'accepted', 'instance'],
        axis='columns',
    )
    .rename(columns={'name': 'company_name', 'countryba': 'country'})
    .query("form in ['10-Q', '10-K']")
)

In [31]:
# One row per distinct (tag, version, doc) combination.
tag_data = (
    raw_tag_data
    .filter(items=['tag', 'version', 'doc'])
    .drop_duplicates()
)

In [32]:
# Peek at one `doc` entry (the one stored under key 0) to see what the field contains.
tag_data['doc'][0]

"State aggregate market value of voting and non-voting common equity held by non-affiliates computed by reference to price at which the common equity was last sold, or average bid and asked price of such common equity, as of the last business day of registrant's most recently completed second fiscal quarter. The public float should be reported on the cover page of the registrants form 10K."

In [33]:
# Build the working numeric table:
#   * drop the free-text footnote column,
#   * flag rows without a co-registrant (`no_coreg`) and rows without a value (`no_value`),
#   * parse `ddate` from YYYYMMDD into datetimes (unparseable entries become NaT),
#   * keep rows with no co-registrant, a non-missing value, qtrs of 0 or 1
#     (balance sheet and single quarter flows) and a date after 2018-01-01.
num_data = (
    raw_num_data
    .drop(columns='footnote')
    .assign(
        no_coreg=lambda d: d['coreg'].isna(),
        no_value=lambda d: d['value'].isna(),
        ddate=lambda d: pd.to_datetime(d['ddate'], format='%Y%m%d', errors='coerce'),
    )
    .query("(no_coreg == True) & (no_value == False) & (qtrs in (0,1)) & (ddate > '2018-01-01')")
)

In [34]:
# Row and column counts of the filtered numeric table.
num_data.shape

(1300874, 10)

Check whether **num_data** has exactly one observation per `adsh, tag, ddate` combination.

In [35]:
# Count rows per (adsh, tag, ddate) key and keep the keys that occur more than once,
# largest counts first; then collect the tags involved in those repeated keys.
multiple_matches = (
    num_data
    .groupby(by=['adsh', 'tag', 'ddate'], as_index=False)
    .count()
    .sort_values(by='value', ascending=False)
    .query("value > 1")
)
duplicate_causing_tags = multiple_matches['tag'].unique()

In [36]:
# Share of distinct tags that take part in at least one repeated (adsh, tag, ddate) key.
print("%.2f%% tags create duplicates" % round(
    len(duplicate_causing_tags) / len(num_data['tag'].unique()) * 100,
    2,
))

1.07% tags create duplicates


Since only a small percentage of tags cause duplicates, we simply remove those tags for now.

In [37]:
# Drop every row whose tag is in `duplicate_causing_tags` (the `@` prefix makes the
# query refer to the Python variable of that name).
num_data = num_data.query("tag not in @duplicate_causing_tags")

**TODO (checking):** These tags may just be variations of other important tags. During the checking process, investigate whether removing them drops anything important.

Now each `adsh, tag, ddate` combination should appear in exactly one row, so the check below should return an empty table.

In [38]:
# Repeat the key-count check on the reduced table; the displayed frame lists any
# (adsh, tag, ddate) key that still occurs more than once.
multiple_matches = (
    num_data
    .groupby(by=['adsh', 'tag', 'ddate'], as_index=False)
    .count()
    .sort_values(by='value', ascending=False)
    .query("value > 1")
)
multiple_matches

Unnamed: 0,adsh,tag,ddate,version,coreg,qtrs,uom,value,no_coreg,no_value


## Merge <a name="Merge"></a>

In [39]:
# Columns available on the numeric table before merging.
num_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value'],
      dtype='object')

In [40]:
# Join the lookup tables onto every numeric fact: `pre_data` on (adsh, tag, version),
# `sub_data` on adsh, and `tag_data` on (tag, version).
# All three are inner joins (the pandas default), so facts without a match in any
# of the lookup tables are dropped.
sec_data = (
    num_data
    .merge(pre_data, how='inner', on=['adsh', 'tag', 'version'])
    .merge(sub_data, how='inner', on='adsh')
    .merge(tag_data, how='inner', on=['tag', 'version'])
)

In [41]:
# Columns of the merged table.
sec_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value', 'cik', 'company_name', 'sic', 'country',
       'period', 'form', 'fye', 'accepted', 'instance', 'doc'],
      dtype='object')

In [None]:
# Exploratory: relabel revenue-like tags as a single 'Revenue' tag and keep, for every
# (adsh, ddate, clean_tag) group, only the row(s) carrying the largest value.
test = sec_data.copy()

# A tag counts as revenue when its name starts with "revenue" (case-insensitive;
# `str.match` anchors at the beginning of the string), the fact covers one quarter
# and it is reported in USD. `na=False` keeps the mask strictly boolean if a tag is missing.
# NOTE(review): tags that merely contain "revenue" are not relabelled -- confirm intent.
test['clean_tag'] = np.where(
    test['tag'].str.match('revenue', case=False, na=False)
    & (test['qtrs'] == 1)
    & (test['uom'] == 'USD'),
    'Revenue',
    test['tag'],
)

# Compare each row with its group maximum via `transform` instead of
# `groupby(...).apply(...).reset_index()`: the latter tries to re-insert the group
# keys, which are still present as columns, and fails with
# "cannot insert ..., already exists". Rows are ordered by the group keys and
# receive a fresh running index.
test_group = (
    test
    .loc[lambda d: d['value'] == d.groupby(['adsh', 'ddate', 'clean_tag'])['value'].transform('max')]
    .sort_values(['adsh', 'ddate', 'clean_tag'])
    .reset_index(drop=True)
)

## Reshape <a name="Reshape"></a>

Check again that the merged data is unique at the `adsh, tag, ddate` level.

In [128]:
# Any (adsh, tag, ddate) key that occurs more than once in the merged table shows up
# here; an empty frame means the merges did not multiply rows.
(
    sec_data
    .groupby(by=['adsh', 'tag', 'ddate'], as_index=False)
    .count()
    .sort_values(by='value', ascending=False)
    .query("value > 1")
)

Unnamed: 0,adsh,tag,ddate,version,coreg,qtrs,uom,value,no_coreg,no_value,cik,company_name,sic,country,period,form,fye,accepted,instance,doc


In [85]:
# Reshape long -> wide: one column per tag, one row per filing / date / duration /
# unit combination. The default aggregation is the mean; because each
# (adsh, tag, ddate) key was checked to be unique, every cell is built from at most one value.
# NOTE(review): rows with a missing value in one of the index columns may be left
# out of the pivot -- confirm against the row counts.
sec_data_wide = pd.pivot_table(
    sec_data,
    index=['adsh', 'company_name', 'ddate', 'qtrs',
           'uom', 'cik', 'sic', 'country', 'period'],
    columns='tag',
    values='value',
)

In [None]:
# Preview the wide table with the index levels turned back into ordinary columns.
# The result is not assigned, so `sec_data_wide` itself keeps its MultiIndex.
sec_data_wide.reset_index()

In [129]:
# First rows of the wide table.
sec_data_wide.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,tag,ACAIndustryFeeExpense,AMPSImpactonStockholdersEquity,AOCIImpactofNEPDeconsolidation,AOCIreclassedtoRetainedEarningsincometax,APICEquityBasedPaymentArrangementIncreaseForCostRecognition,APICSharebasedPaymentArrangementESOPIncreaseforCostRecognition,APICSharebasedPaymentArrangementIncreaseforCostRecognitionServicesRendered,APICSharebasedPaymentArrangementReclassificationfromLiabilitytoEquity,APICSharebasedPaymentArrangementReversalOfCostRecognition,ATMAndCheckCardExpense,...,WriteoffOfCreditFacilityAmendmentFees,WriteoffOfDeferredOfferingCosts,WriteoffOfFullyDepreciatedAsset,WriteoffOfInsuranceReceivable,WriteoffofNetDiscountandDebtIssuanceCosts,WrittenOptionsAtFairValue,WrittendownValue,WrittendownValueFixedAssets,WrittendownValueOtherAssets,property
adsh,company_name,ddate,qtrs,uom,cik,sic,country,period,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20180630,1,USD,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20181231,0,USD,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20181231,0,shares,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20190331,0,shares,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20190331,1,USD,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,


In [93]:
# Percentage of rows in which each tag column is empty, listed from the
# best-populated column to the sparsest.
percent_missing = sec_data_wide.isna().sum().mul(100).div(len(sec_data_wide))
missing_value_df = (
    pd.DataFrame({'column_name': sec_data_wide.columns,
                  'percent_missing': percent_missing})
    .sort_values(by='percent_missing')
)

In [130]:
# The five tag columns with the lowest share of missing values.
missing_value_df.head()

Unnamed: 0_level_0,column_name,percent_missing
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
CommonStockSharesAuthorized,CommonStockSharesAuthorized,83.81184
CommonStockSharesIssued,CommonStockSharesIssued,84.298172
CommonStockSharesOutstanding,CommonStockSharesOutstanding,85.14003
PreferredStockSharesAuthorized,PreferredStockSharesAuthorized,89.458326
InterestExpense,InterestExpense,89.560624


## Save <a name="Save"></a>
Write CSV versions of the long and wide tables to `02-build/clean/2019q3`.

In [88]:
# Persist the long table without its row index: the index carries no information,
# and writing it would add an extra unnamed first column when the file is read back
# with a plain `pd.read_csv`, as the Import cell does for this path.
sec_data.to_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv', index=False)

In [89]:
# Persist the wide table. The index is written too (the `to_csv` default); here it
# holds the identifying levels (adsh, company_name, ddate, ...), so it must be kept.
sec_data_wide.to_csv(data_root+'02-build/clean/2019q3/sec_data_wide.csv')