# Program Header

In [75]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
import jellyfish
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Make the directory one level above the working directory importable
# (presumably where the `header` module imported below lives — confirm).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from header import *

In [76]:
# Resolve the base data directory; every path below is built by appending to it.
# NOTE(review): `initialize_data_root` presumably comes from the `header` star
# import and "AY" presumably selects a user-specific location — confirm.
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Merge](#Merge)
3. [Clean](#Clean)
4. [Reshape](#Reshape)
5. [Save](#Save)

## Import <a name="Import"></a>

In [77]:
# Long-format SEC data for 2019q3 (sec_data_long.csv).
raw_sec_data = pd.read_csv(
    data_root + '02-build/clean/2019q3/sec_data_long.csv'
)

# Lookup from (tag, uom, qtrs) to clean_tag for the same quarter.
raw_tag_map = pd.read_csv(
    data_root + '02-build/clean/2019q3/tag_map.csv'
)

# Lookup from cik to ticker; not quarter-specific.
raw_cik_ticker_map = pd.read_csv(
    data_root + '02-build/clean/cik_ticker_map.csv'
)

In [78]:
# List the columns available in the long-format SEC data.
raw_sec_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value', 'cik', 'company_name', 'sic', 'country',
       'period', 'form', 'fye', 'filed', 'accepted', 'instance', 'doc',
       'dyrqt', 'pyrqt'],
      dtype='object')

In [79]:
# List the columns of the tag lookup; `tag`, `uom` and `qtrs` are the merge keys used below.
raw_tag_map.columns

Index(['clean_tag', 'tag', 'uom', 'qtrs'], dtype='object')

In [80]:
# List the columns of the cik-to-ticker lookup; `cik` is the merge key used below.
raw_cik_ticker_map.columns

Index(['cik', 'ticker'], dtype='object')

## Merge <a name="Merge"></a>

In [98]:
# Attach the clean tag name to every fact, then the ticker of every filer.
# Both joins are inner joins (the pandas default, spelled out here), so facts
# whose (qtrs, uom, tag) or cik has no match in the lookup tables are dropped.
sec_data = (
    raw_sec_data
    .merge(raw_tag_map, how='inner', on=['qtrs', 'uom', 'tag'])
    .merge(raw_cik_ticker_map, how='inner', on=['cik'])
)

In [100]:
# Preview only the first rows instead of rendering the whole lookup table.
# NOTE(review): every `ticker` value in the preview is 1, which looks like a
# count rather than a ticker symbol — verify how cik_ticker_map.csv is built.
raw_cik_ticker_map.head(10)

Unnamed: 0,cik,ticker
0,1750,1
1,1800,1
2,1961,1
3,2098,1
4,2135,1
5,2178,1
6,2186,1
7,2230,1
8,2488,1
9,2491,1


The goal is to get our data to the `adsh`-`dyrqt` level and use the `clean_tag` values as columns, so we can't have duplicate values for the same `adsh`-`clean_tag`-`dyrqt` combination. In case of duplicates, let's keep the maximum value. The idea behind keeping the max is that smaller values may be older (over 3 months instead of over 4 months) or may be sub-categories of the larger one.

In [82]:
# Collapse repeated facts to one row per combination of the keys below,
# keeping the largest `value` in each group.
# Note: `qtrs` and `uom` are part of the keys, so a filing can still have
# several rows for the same adsh / clean_tag / dyrqt when those differ.
sec_data = (
    sec_data
    .groupby(
        ['adsh', 'qtrs', 'uom', 'clean_tag', 'filed',
         'dyrqt', 'company_name', 'ticker',
         'cik', 'sic', 'country', 'period', 'form'],
        as_index=False,
    )['value']
    .max()
)

## Clean <a name="Clean"></a>

Check that we're unique at the `adsh`-`dyrqt`-`clean_tag` level.

In [83]:
# For every (adsh, dyrqt, clean_tag) group, count the distinct entries in each
# column and list the groups that carry more than one distinct `value`,
# largest counts first.
(
    sec_data
    .groupby(['adsh', 'dyrqt', 'clean_tag'])
    .nunique()
    .sort_values('value', ascending=False)
    .query("value > 1")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,filed,dyrqt,company_name,ticker,cik,sic,country,period,form,value
adsh,dyrqt,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0001428205-19-000214,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001628280-19-010519,201902,GainsLossesOnExtinguishmentOfDebt,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001564590-19-030190,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001748773-19-000004,201902,NetIncomeLoss,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001564590-19-030407,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001691303-19-000036,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001376139-19-000053,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0000896841-19-000165,201902,AdjustmentsToAdditionalPaidInCapitalStockIssuedIssuanceCosts,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001564590-19-031538,201902,AllocatedShareBasedCompensationExpense,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001697500-19-000050,201902,AllocatedShareBasedCompensationExpense,1,2,1,1,1,1,1,1,1,1,1,1,1,2


In [84]:
# Keep exactly one row per (adsh, clean_tag, dyrqt): the one with the largest
# `value`, as the notes above call for. A plain keep='first' would instead
# keep whichever row happens to come first in the frame.
#   1. stable sort by value, descending (NaN values sort last, ties keep
#      their original relative order);
#   2. keep the first row of each key, i.e. the maximum;
#   3. restore the original row order.
sec_data = (
    sec_data
    .sort_values('value', ascending=False, kind='mergesort')
    .drop_duplicates(['adsh', 'clean_tag', 'dyrqt'], keep='first')
    .sort_index()
)

In [85]:
# Enforce the invariant the reshape below relies on: exactly one row per
# (adsh, dyrqt, clean_tag). Failing here stops a Run All before the pivot
# (which averages by default) can silently combine duplicated facts.
assert not sec_data.duplicated(['adsh', 'dyrqt', 'clean_tag']).any(), \
    "sec_data is not unique on adsh-dyrqt-clean_tag"

# Visual confirmation: groups with more than one distinct `value`
# (expected to be empty).
(
    sec_data
    .groupby(['adsh', 'dyrqt', 'clean_tag'])
    .nunique()
    .sort_values('value', ascending=False)
    .query("value > 1")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,filed,dyrqt,company_name,ticker,cik,sic,country,period,form,value
adsh,dyrqt,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


## Reshape <a name="Reshape"></a>

In [86]:
# Preview the first rows of the de-duplicated long table rather than rendering the whole frame.
sec_data.head(10)

Unnamed: 0,adsh,qtrs,uom,clean_tag,filed,dyrqt,company_name,ticker,cik,sic,country,period,form,value
0,0000002178-19-000086,0,USD,AccountsPayableCurrent,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,1.169830e+08
1,0000002178-19-000086,0,USD,AccumulatedDepreciationDepletionAndAmortizatio...,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,9.764000e+07
2,0000002178-19-000086,0,USD,AdditionalPaidInCapital,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,1.249700e+07
3,0000002178-19-000086,0,USD,AllowanceForDoubtfulAccountsReceivableCurrent,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,1.170000e+05
4,0000002178-19-000086,0,USD,AssetRetirementObligationsNoncurrent,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,1.550000e+06
5,0000002178-19-000086,0,USD,Assets,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,3.017640e+08
6,0000002178-19-000086,0,USD,CashAndCashEquivalentsAtCarryingValue,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,1.276700e+08
7,0000002178-19-000086,0,USD,CashCashEquivalentsRestrictedCashAndRestricted...,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,1.325460e+08
8,0000002178-19-000086,0,USD,CommonStockValue,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,4.230000e+05
9,0000002178-19-000086,0,USD,DeferredIncomeTaxLiabilitiesNet,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",1,2178,5172.0,US,2019-06-30 00:00:00,10-Q,5.283000e+06


In [92]:
# Reshape long -> wide: one row per filing/period (the identifier columns
# listed in `index`), one column per clean_tag, cells filled from `value`.
# pivot_table aggregates with the mean by default when a cell has several rows.
sec_data_wide = (
    sec_data
    .pivot_table(
        index=['adsh', 'company_name', 'dyrqt', 'filed',
               'cik', 'sic', 'ticker', 'country', 'period'],
        columns='clean_tag',
        values='value',
    )
    .reset_index()
)

In [93]:
# Share of rows (in percent) in which each column of the wide table is null.
percent_missing = sec_data_wide.isnull().sum() * 100 / len(sec_data_wide)

# Same figures as a two-column table, least-missing columns first.
missing_value_df = pd.DataFrame(
    {
        'column_name': sec_data_wide.columns,
        'percent_missing': percent_missing,
    }
).sort_values('percent_missing')

In [94]:
# Percentage of missing values in each column, computed separately for every
# filing (`adsh`). Uses isnull() and percent units so the numbers match the
# variable name and are directly comparable with `percent_missing`; the
# previous notnull().mean() produced the fraction of values that were present.
percent_missing_by_company = sec_data_wide.groupby('adsh')\
    .apply(lambda filing: filing.isnull().mean() * 100)

In [95]:
# Columns that are missing in fewer than 10% of rows.
percent_missing.loc[lambda pct: pct < 10]

clean_tag
adsh                                             0.000000
company_name                                     0.000000
dyrqt                                            0.000000
filed                                            0.000000
cik                                              0.000000
sic                                              0.000000
ticker                                           0.000000
country                                          0.000000
period                                           0.000000
AdditionalPaidInCapital                          8.965696
Assets                                           0.285863
CashAndCashEquivalentsAtCarryingValue            6.652807
EarningsPerShareBasic                            8.757796
Liabilities                                      0.311850
LiabilitiesAndStockholdersEquity                 0.233888
NetIncomeLoss                                    4.391892
PropertyPlantAndEquipmentNet                     9.615385
Reta

## Save <a name="Save"></a>
Write the wide table to CSV in `02-build/clean/2019q3`.

In [96]:
# Persist the wide table next to the long-format inputs.
# index=False: after reset_index() the row index is only a positional counter;
# writing it would add an unnamed extra column that the CSVs read at the top of
# this notebook do not have.
sec_data_wide.to_csv(data_root+'02-build/clean/2019q3/sec_data_wide.csv', index = False)