# Program Header

In [38]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
import jellyfish
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Put the parent of the current working directory on the import path
# (presumably where header.py lives, given the star-import that follows -- confirm).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
from header import *

In [39]:
# Resolve the root directory for project data; used below as a string prefix for file paths.
# `initialize_data_root` is not defined in this notebook -- presumably it comes from
# the `from header import *` star-import.
# NOTE(review): the meaning of the "AY" argument is not visible here -- confirm in header.py.
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Data Understanding](#DataUnderstanding)
3. [Clean](#Clean)
4. [Merge](#Merge)
5. [Reshape](#Reshape)
6. [Create Features](#CreateFeatures)
7. [Save](#Save)

## Import <a name="Import"></a>

In [40]:
# Load the long-format SEC data and the raw-tag -> clean-tag mapping for 2019q3.
sec_data_long_path = data_root+'02-build/clean/2019q3/sec_data_long.csv'
tag_map_path = data_root+'02-build/clean/2019q3/tag_map.csv'

raw_sec_data = pd.read_csv(sec_data_long_path)
raw_tag_map = pd.read_csv(tag_map_path)

In [41]:
# Inspect the columns available in the long-format SEC data.
raw_sec_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value', 'cik', 'company_name', 'sic', 'country',
       'period', 'form', 'fye', 'accepted', 'instance', 'doc'],
      dtype='object')

In [42]:
# Inspect the tag map's columns; `tag`, `uom` and `qtrs` are the merge keys used below.
raw_tag_map.columns

Index(['clean_tag', 'tag', 'uom', 'qtrs'], dtype='object')

## Merge <a name="Merge"></a>

In [46]:
# Attach `clean_tag` to every fact (inner join on tag/uom/qtrs, so facts whose tag
# is absent from the map are dropped), then collapse to one row per key combination
# by keeping the maximum `value`.
# NOTE(review): groupby drops rows whose key columns contain NaN by default; `sic`
# prints as a float downstream, which suggests missing values may exist -- confirm
# that losing those filings is intended.
sec_data_keys = ['adsh', 'qtrs', 'uom', 'clean_tag', 'ddate', 'company_name',
                 'cik', 'sic', 'country', 'period', 'form']
sec_data = (
    raw_sec_data
    .merge(raw_tag_map, on=['qtrs', 'uom', 'tag'])
    .groupby(sec_data_keys, as_index=False)['value']
    .max()
)

## Reshape <a name="Reshape"></a>

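Check again that `value` is unique at the `adsh`-`clean_tag`-`ddate` level: the query below lists any group with more than one distinct value and should return no rows.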

In [47]:
# Count distinct entries per (adsh, clean_tag, ddate) group and show, largest first,
# only the groups holding more than one distinct `value`. An empty result means
# the long frame has a single value per group.
(
    sec_data
    .groupby(['adsh', 'clean_tag', 'ddate'])
    .nunique()
    .sort_values('value', ascending=False)
    .query("value > 1")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,ddate,company_name,cik,sic,country,period,form,value
adsh,clean_tag,ddate,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


In [48]:
# Reshape long -> wide: one row per combination of the index columns below and one
# column per `clean_tag`, filled from `value`. `form` is not part of the index, so
# it does not appear in the wide frame. pivot_table uses its default aggregation
# for any repeated index/column pair.
wide_index = ['adsh', 'company_name', 'ddate', 'qtrs',
              'uom', 'cik', 'sic', 'country', 'period']
sec_data_wide = (
    sec_data
    .pivot_table(values='value', index=wide_index, columns='clean_tag')
    .reset_index()
)

In [49]:
# Share of missing cells per column of the wide frame, as a percentage,
# tabulated from the least-missing to the most-missing column.
null_counts = sec_data_wide.isnull().sum()
percent_missing = null_counts * 100 / len(sec_data_wide)
missing_value_df = (
    pd.DataFrame({'column_name': sec_data_wide.columns,
                  'percent_missing': percent_missing})
    .sort_values('percent_missing', ascending=True)
)

In [52]:
# Show the per-column missing-value percentages, least-missing columns first.
# (The previous cell content ended in a dangling `.` and was a syntax error.)
missing_value_df

Unnamed: 0_level_0,column_name,percent_missing
clean_tag,Unnamed: 1_level_1,Unnamed: 2_level_1
adsh,adsh,0.000000
company_name,company_name,0.000000
ddate,ddate,0.000000
qtrs,qtrs,0.000000
uom,uom,0.000000
cik,cik,0.000000
sic,sic,0.000000
country,country,0.000000
period,period,0.000000
StockholdersEquity,StockholdersEquity,69.058577


In [51]:
# Display the wide frame; pandas elides the middle columns in the rendered output.
sec_data_wide

clean_tag,adsh,company_name,ddate,qtrs,uom,cik,sic,country,period,AOCIImpactofNEPDeconsolidation,...,WorkersCompensationClaimReceivablesNetOfValuationAllowance,WorkersCompensationRiskPoolDepositInReceivershipNet,WorkingCapitalPurchasePriceAdjustment,WorksiteEmployeePayrollCost,WriteDownOfAssetsHeldForSale,WriteOffOfDeferredDebtIssuanceCost,WriteOffOfOtherAssets,WritedownOfOtherRealEstateAndGainsLossesOnSale,WrittendownValue,property
0,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2018-03-31,0,USD,2178,5172.0,US,20190630,,...,,,,,,,,,,
1,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2018-06-30,0,USD,2178,5172.0,US,20190630,,...,,,,,,,,,,
2,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2018-06-30,1,USD,2178,5172.0,US,20190630,,...,,,,,,,,,,
3,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2018-12-31,0,USD,2178,5172.0,US,20190630,,...,,,,,,,,,,
4,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2018-12-31,0,shares,2178,5172.0,US,20190630,,...,,,,,,,,,,
5,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2019-03-31,0,USD,2178,5172.0,US,20190630,,...,,,,,,,,,,
6,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2019-03-31,0,shares,2178,5172.0,US,20190630,,...,,,,,,,,,,
7,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2019-03-31,1,USD,2178,5172.0,US,20190630,,...,,,,,,,,,,
8,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2019-03-31,1,shares,2178,5172.0,US,20190630,,...,,,,,,,,,,
9,0000002178-19-000086,"ADAMS RESOURCES & ENERGY, INC.",2019-06-30,0,USD,2178,5172.0,US,20190630,,...,,,,,,,,,,


## Save <a name="Save"></a>
Create CSV versions in `02-build/clean/2019q3`

In [88]:
# Write the merged, aggregated long frame to CSV (the DataFrame index is written
# as an unnamed first column because `index` is left at its default).
# NOTE(review): this is the same path that the Import cell reads into `raw_sec_data`,
# so the notebook overwrites its own input with a frame that no longer has a `tag`
# column; a second Restart & Run All would then fail at the merge. Confirm whether
# a different output filename was intended.
sec_data.to_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv')

In [89]:
# Write the wide (one column per clean_tag) frame to CSV; the DataFrame index is
# written as an unnamed first column because `index` is left at its default.
sec_data_wide.to_csv(data_root+'02-build/clean/2019q3/sec_data_wide.csv')