# Program Header

In [5]:
import pandas as pd
import numpy as np
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from header import *
initialize_data_root("AY")

'Initialized data root'

In [16]:
data_root

'C:/Users/Andrew/1001-term-project/data/'

# Table of contents
1. [Import](#Import)
2. [Data Understanding](#DataUnderstanding)
2. [Clean](#Clean)
3. [Merge](#Merge)
4. [Create Features](#CreateFeatures)
5. [Save](#Save)

## Import <a name="Import"></a>

In [17]:
raw_num_data = pd.read_csv(data_root+'02-build/raw/2019q3/num.csv')
raw_tag_data = pd.read_csv(data_root+'02-build/raw/2019q3/tag.csv')

## Clean <a name="Clean"></a>

In [14]:
tag_data = raw_tag_data\
    .filter(['tag', 'version', 'doc'])\
    .drop_duplicates()

In [15]:
tag_data

0
1


In [11]:
tag_data['doc'][0]

KeyError: 'doc'

In [None]:
# Get rid of coregister/parent company
# Keep only the balance sheet and single quarter flows

num_data = raw_num_data\
    .drop('footnote', axis = 'columns')\
    .assign(no_coreg = pd.isna(raw_num_data['coreg']))\
    .assign(no_value = pd.isna(raw_num_data['value']))\
    .assign(ddate = pd.to_datetime(raw_num_data['ddate'], format='%Y%m%d', errors = 'coerce'))\
    .query("(no_coreg == True) & (no_value == False) & (qtrs in (0,1)) & (uom in ['pur'])")

In [None]:
num_data.shape

See if we have **num_data** observations at the `adsh, tag, ddate` level

In [None]:
multiple_matches = num_data.groupby(['adsh', 'tag', 'ddate'], as_index = False)\
    .count()\
    .sort_values('value', ascending = False)\
    .query("value > 1")
duplicate_causing_tags = multiple_matches.tag.unique()
duplicate_causing_tags

In [None]:
print("%.2f%% tags create duplicates"% round(len(duplicate_causing_tags)/len(num_data.tag.unique())*100,2))

Since only a small percent of tags are trouble, we'll just remove them for now.

In [None]:
num_data = num_data.query("tag not in @duplicate_causing_tags")

Note for checking: These tags may just be variations of other important tags. Investigate if we're dropping anything important. Needs further investigation during checking process.

Now, we have unique rows for each `adsh, tag, ddate`.

In [None]:
multiple_matches = num_data.groupby(['adsh', 'tag', 'ddate'], as_index = False)\
    .count()\
    .sort_values('value', ascending = False)\
    .query("value > 1")
multiple_matches

## Merge <a name="Merge"></a>

In [None]:
num_data.columns

In [81]:
sec_data = num_data.merge(pre_data, on = ['adsh', 'tag', 'version'])\
    .merge(sub_data, on = ['adsh'])\
    .merge(tag_data, on = ['tag', 'version'])

## Create Features <a name="CreateFeatures"></a>
Create `ticker` and `clean_tag` based on financial statement line items.

In [140]:
sec_data[['cik','company_name','instance']].drop_duplicates().shape

(5776, 3)

In [323]:
pd.DataFrame(sec_data.tag.unique()).to_csv(data_root+"tags.csv")

In [325]:
test = sec_data.copy().sample(1000)
test.iloc[0:1,]

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,no_coreg,no_value,cik,company_name,sic,country,period,form,fye,accepted,instance,doc
0,0001766016-19-000005,EntityCommonStockSharesOutstanding,dei/2014,,20190630,0,shares,2071002.0,True,False,1766016,"CRUCIAL INNOVATIONS, CORP.",8200.0,CN,20190630,10-Q,1231.0,2019-07-19 18:35:00.0,none-20190630.xml,Indicate number of shares or other units outst...


In [None]:
test = sec_data.copy()
test['clean_tag'] = np.where((test['tag'].str.match('revenue', case = False))&
                             (test['qtrs'] == 1)&
                             (test['uom'] == 'USD'), 
                             'Revenue', test['tag'])

In [None]:
test

In [327]:
def check_for_problems(tag):
    problems = test.query('clean_tag == @tag')\
        .groupby(['adsh', 'ddate'], as_index = False)\
        .size()
    
    problems = problems[problems != 1].reset_index(name = 'count')
    
    return(problems)

tag_tracker = check_for_problems("Revenue")

AttributeError: Cannot access callable attribute 'query' of 'DataFrameGroupBy' objects, try using the 'apply' method

In [320]:
tag_tracker['count'].value_counts()

2    232
3     19
4      2
Name: count, dtype: int64

In [321]:
issue = 1
test.query('(clean_tag == @tag)&(adsh == @tag_tracker.adsh[@issue])&(ddate == @tag_tracker.ddate[@issue])')

Unnamed: 0,adsh,tag,version,coreg,ddate,qtrs,uom,value,no_coreg,no_value,...,company_name,sic,country,period,form,fye,accepted,instance,doc,clean_tag
333091,0000004962-19-000051,RevenuesNetOfInterestExpense,us-gaap/2019,,20190630,1,USD,10838000000.0,True,False,...,AMERICAN EXPRESS CO,6199.0,US,20190630,10-Q,1231.0,2019-07-23 15:55:00.0,axpq21910q_htm.xml,"Amount of revenue recognized from goods sold, ...",Revenue
338462,0000004962-19-000051,RevenueFromContractWithCustomerExcludingAssess...,us-gaap/2019,,20190630,1,USD,7080000000.0,True,False,...,AMERICAN EXPRESS CO,6199.0,US,20190630,10-Q,1231.0,2019-07-23 15:55:00.0,axpq21910q_htm.xml,"Amount, excluding tax collected from customer,...",Revenue


In [262]:
test.query('(clean_tag == @tag)&(adsh in @tag_tracker.adsh)&(ddate in @tag_tracker.ddate)').uom.unique()

array(['USD', 'CAD', 'pure'], dtype=object)

In [206]:
tag_tracker.index[0][0]

'0000002488-19-000104'

In [194]:
tag = 'Revenue'
problems = test.query('clean_tag == @tag')\
        .groupby(['adsh', 'ddate'])\
        .size()\
        .value_counts()

1    4637
2     423
3      25
4       3
dtype: int64

In [160]:
test_series = pd.Series(['revenue', 'rev', 'rev12', '4324324', 'reven'])

test_series.str.match('revenue', case = False)

0     True
1    False
2    False
3    False
4    False
dtype: bool

## Reshape <a name="Reshape"></a>

Check again that we're unique on the `adsh-ddate-tag` level

In [128]:
sec_data.groupby(['adsh', 'tag', 'ddate'], as_index = False)\
    .count()\
    .sort_values('value', ascending = False)\
    .query("value > 1")

Unnamed: 0,adsh,tag,ddate,version,coreg,qtrs,uom,value,no_coreg,no_value,cik,company_name,sic,country,period,form,fye,accepted,instance,doc


In [85]:
sec_data_wide = sec_data.pivot_table(values = 'value',
                                     index = ['adsh', 'company_name', 'ddate', 'qtrs', 
                                                        'uom', 'cik', 'sic', 'country', 'period'],
                                    columns = 'tag')

In [None]:
sec_data_wide.reset_index()

In [129]:
sec_data_wide.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,tag,ACAIndustryFeeExpense,AMPSImpactonStockholdersEquity,AOCIImpactofNEPDeconsolidation,AOCIreclassedtoRetainedEarningsincometax,APICEquityBasedPaymentArrangementIncreaseForCostRecognition,APICSharebasedPaymentArrangementESOPIncreaseforCostRecognition,APICSharebasedPaymentArrangementIncreaseforCostRecognitionServicesRendered,APICSharebasedPaymentArrangementReclassificationfromLiabilitytoEquity,APICSharebasedPaymentArrangementReversalOfCostRecognition,ATMAndCheckCardExpense,...,WriteoffOfCreditFacilityAmendmentFees,WriteoffOfDeferredOfferingCosts,WriteoffOfFullyDepreciatedAsset,WriteoffOfInsuranceReceivable,WriteoffofNetDiscountandDebtIssuanceCosts,WrittenOptionsAtFairValue,WrittendownValue,WrittendownValueFixedAssets,WrittendownValueOtherAssets,property
adsh,company_name,ddate,qtrs,uom,cik,sic,country,period,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20180630,1,USD,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20181231,0,USD,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20181231,0,shares,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20190331,0,shares,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,
0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",20190331,1,USD,2178,5172.0,US,20190630,,,,,,,,,,,...,,,,,,,,,,


In [93]:
percent_missing = sec_data_wide.isnull().sum() * 100 / len(sec_data_wide)
missing_value_df = pd.DataFrame({'column_name': sec_data_wide.columns,
                                 'percent_missing': percent_missing}).sort_values('percent_missing', ascending = True)

In [130]:
missing_value_df.head()

Unnamed: 0_level_0,column_name,percent_missing
tag,Unnamed: 1_level_1,Unnamed: 2_level_1
CommonStockSharesAuthorized,CommonStockSharesAuthorized,83.81184
CommonStockSharesIssued,CommonStockSharesIssued,84.298172
CommonStockSharesOutstanding,CommonStockSharesOutstanding,85.14003
PreferredStockSharesAuthorized,PreferredStockSharesAuthorized,89.458326
InterestExpense,InterestExpense,89.560624


## Save <a name="Save"></a>
Create CSV versions in build/raw

In [88]:
sec_data.to_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv')

In [89]:
sec_data_wide.to_csv(data_root+'02-build/clean/2019q3/sec_data_wide.csv')