# Program Header

In [129]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
import jellyfish
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"
# Put the parent of the current working directory on the import path so that
# the project-level `header` module can be imported from this notebook.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    # Guarded so that re-running the cell does not add the entry twice.
    sys.path.append(nb_dir)
from header import *

In [130]:
# Resolve the root of the project data directory.
# NOTE(review): `initialize_data_root` is not defined in this notebook; presumably it
# comes from the star-imported `header` module and "AY" selects a user/machine — confirm.
# Later cells build paths as `data_root + '02-build/...'`, so the returned value has to
# be a string that already ends with a path separator.
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Merge](#Merge)
3. [Clean](#Clean)
4. [Reshape](#Reshape)
5. [Save](#Save)

## Import <a name="Import"></a>

In [131]:
# Load the three inputs: long-format SEC facts, the raw-tag -> clean-tag map,
# and the cik -> ticker map. Paths are relative to `data_root`.
raw_sec_data, raw_tag_map, raw_cik_ticker_map = (
    pd.read_csv(data_root + relative_path)
    for relative_path in (
        '02-build/clean/2019q3/sec_data_long.csv',
        '02-build/clean/2019q3/tag_map.csv',
        '02-build/clean/cik_ticker_map.csv',
    )
)

In [132]:
# List the columns available in the long-format SEC data.
raw_sec_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value', 'cik', 'company_name', 'sic', 'country',
       'period', 'form', 'fye', 'filed', 'accepted', 'instance', 'doc',
       'dyrqt', 'pyrqt'],
      dtype='object')

In [133]:
# List the columns of the tag map (`tag` is the merge key used below).
raw_tag_map.columns

Index(['tag', 'clean_tag'], dtype='object')

In [134]:
# List the columns of the cik/ticker map (`cik` is the merge key used below).
raw_cik_ticker_map.columns

Index(['ticker', 'cik'], dtype='object')

## Merge <a name="Merge"></a>

In [136]:
# Attach `clean_tag` to every fact via `tag`, then `ticker` to every filer via `cik`.
# Both joins are inner joins (the pandas default, spelled out here), so facts whose
# `tag` is absent from the tag map or whose `cik` has no ticker are dropped.
# NOTE(review): if a cik maps to more than one ticker, its facts are repeated once
# per ticker — confirm the cik/ticker map is one-to-one.
sec_data = (
    raw_sec_data
    .merge(raw_tag_map, how='inner', on='tag')
    .merge(raw_cik_ticker_map, how='inner', on='cik')
)

The goal is to get our data to the `adsh`-`dyrqt` level with the `clean_tag` values as columns, so we can't have duplicate values for the same `adsh`-`clean_tag`-`dyrqt` combination. In case of duplicates, let's keep the maximum value. The idea behind keeping the max is that smaller values may be older versions (over 3 months instead of over 4 months), or they may be sub-categories of the larger one.

In [138]:
# Collapse repeated facts to one row per key combination, keeping the largest `value`.
# `qtrs` and `uom` are part of the key, so one filing can still carry several rows for
# the same adsh/dyrqt/clean_tag when those rows differ in `qtrs` or `uom`.
# NOTE(review): groupby leaves out rows whose key columns contain NaN by default —
# confirm that none of the key columns (e.g. `sic`, `country`) has missing values.
sec_data = (
    sec_data
    .groupby(
        ['adsh', 'qtrs', 'uom', 'clean_tag', 'filed', 'dyrqt', 'company_name',
         'ticker', 'cik', 'sic', 'country', 'period', 'form'],
        as_index=False,
    )['value']
    .max()
)

## Clean <a name="Clean"></a>

Check that we're unique at the `adsh`-`dyrqt`-`clean_tag` level.

In [139]:
# Count distinct entries per adsh/dyrqt/clean_tag key and show only the keys that
# still carry more than one distinct `value`. An empty result means the key is unique.
(
    sec_data
    .groupby(['adsh', 'dyrqt', 'clean_tag'])
    .nunique()
    .sort_values('value', ascending=False)
    .query("value > 1")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,filed,dyrqt,company_name,ticker,cik,sic,country,period,form,value
adsh,dyrqt,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
0001564590-19-031873,201902,IncomeTaxExpenseBenefit,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001747009-19-000026,201902,CommonStockSharesOutstanding,1,1,2,1,1,1,1,1,1,1,1,1,1,2
0001437578-19-000018,201902,IncomeTaxExpenseBenefit,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001558370-19-006684,201902,IncomeTaxExpenseBenefit,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001489393-19-000045,201902,IncomeTaxExpenseBenefit,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0000854775-19-000017,201902,IncomeTaxExpenseBenefit,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001104659-19-042952,201902,IncomeTaxExpenseBenefit,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001530721-19-000068,201902,CashAndCashEquivalents,1,2,1,1,1,1,1,1,1,1,1,1,1,2
0001401680-19-000013,201902,CommonStockSharesOutstanding,1,1,2,1,1,1,1,1,1,1,1,1,1,2
0001144204-19-036885,201902,IncomeTaxExpenseBenefit,1,2,1,1,1,1,1,1,1,1,1,1,1,2


In [140]:
# Keep exactly one row per adsh/clean_tag/dyrqt key: the one with the largest `value`.
# The preceding groupby also splits on `qtrs` and `uom`, so several rows can share a
# key. Dropping duplicates on the unsorted frame would keep whichever row happens to
# come first, not the maximum the notebook intends to keep.
#   * stable sort: rows with equal `value` keep their original relative order
#   * NaN values sort last, so a NaN is kept only when the key has no other value
#   * sort_index() restores the original row order afterwards
# NOTE(review): keys that differ in `uom` are compared across units here — confirm
# that taking the max across units is acceptable.
sec_data = (
    sec_data
    .sort_values('value', ascending=False, kind='mergesort')
    .drop_duplicates(['adsh', 'clean_tag', 'dyrqt'], keep='first')
    .sort_index()
)

In [141]:
# Repeat the uniqueness check after de-duplication; the result should now be empty.
(
    sec_data
    .groupby(['adsh', 'dyrqt', 'clean_tag'])
    .nunique()
    .sort_values('value', ascending=False)
    .query("value > 1")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,filed,dyrqt,company_name,ticker,cik,sic,country,period,form,value
adsh,dyrqt,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1


## Reshape <a name="Reshape"></a>

In [143]:
# Preview the long-format data (one row per filing and clean_tag) before reshaping.
sec_data.head()

Unnamed: 0,adsh,qtrs,uom,clean_tag,filed,dyrqt,company_name,ticker,cik,sic,country,period,form,value
0,0000002178-19-000086,0,USD,AccountsPayableCurrent,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",ae,2178,5172.0,US,2019-06-30 00:00:00,10-Q,116983000.0
1,0000002178-19-000086,0,USD,AccountsReceivableNetCurrent,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",ae,2178,5172.0,US,2019-06-30 00:00:00,10-Q,74499000.0
2,0000002178-19-000086,0,USD,AccumulatedDepreciationDepletionAndAmortizatio...,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",ae,2178,5172.0,US,2019-06-30 00:00:00,10-Q,97640000.0
3,0000002178-19-000086,0,USD,AdditionalPaidInCapital,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",ae,2178,5172.0,US,2019-06-30 00:00:00,10-Q,12497000.0
4,0000002178-19-000086,0,USD,AllowanceForDoubtfulAccountsReceivableCurrent,2019-08-07 00:00:00,201902,"ADAMS RESOURCES & ENERGY, INC.",ae,2178,5172.0,US,2019-06-30 00:00:00,10-Q,117000.0


In [144]:
# Reshape long -> wide: one row per filing, one column per `clean_tag`.
# pivot_table aggregates with the mean by default; the key was de-duplicated
# earlier, so each cell is expected to come from a single `value`.
# reset_index() turns the identifying columns back into ordinary columns.
sec_data_wide = (
    sec_data
    .pivot_table(
        index=['adsh', 'company_name', 'dyrqt', 'filed',
               'cik', 'sic', 'ticker', 'country', 'period'],
        columns='clean_tag',
        values='value',
    )
    .reset_index()
)

In [145]:
# Share of filings (in percent) with no value in each column of the wide table.
percent_missing = 100 * sec_data_wide.isnull().sum() / len(sec_data_wide)

# Same figures as a two-column frame, least-missing columns first.
missing_value_df = (
    pd.DataFrame({
        'column_name': sec_data_wide.columns,
        'percent_missing': percent_missing,
    })
    .sort_values('percent_missing', ascending=True)
)

In [146]:
# Percentage of missing entries per column for each filing (`adsh`).
# The previous version used notnull().mean(), which is the fraction of values that
# are PRESENT (0-1 scale) — the opposite of what the name says, and on a different
# scale from `percent_missing`. Grouping the boolean mask directly also avoids a
# Python-level apply over every group.
# The result is indexed by `adsh`, with one column per column of `sec_data_wide`.
percent_missing_by_company = (
    sec_data_wide
    .isnull()
    .groupby(sec_data_wide['adsh'])
    .mean()
    * 100
)

In [147]:
# Show the columns that are missing in fewer than 15% of filings.
percent_missing.loc[percent_missing < 15]

clean_tag
adsh                                                0.000000
company_name                                        0.000000
dyrqt                                               0.000000
filed                                               0.000000
cik                                                 0.000000
sic                                                 0.000000
ticker                                              0.000000
country                                             0.000000
period                                              0.000000
AdditionalPaidInCapital                            10.241747
Assets                                              0.363920
CashAndCashEquivalents                              1.507668
CommonStockSharesOutstanding                       12.737198
EarningsPerShareDiluted                             9.539901
LiabilitiesAndStockholdersEquity                    0.363920
NetIncomeLoss                                       3.431245
OperatingIncom

In [148]:
# Replace every missing entry in the wide table with 0.
# NOTE(review): after this step a tag that a company did not report cannot be told
# apart from a reported value of zero — confirm that downstream users expect this.
sec_data_wide = sec_data_wide.fillna(value=0)

## Save <a name="Save"></a>
Write the wide table as a CSV to `02-build/clean/2019q3/`.

In [149]:
# Persist the wide table next to the other cleaned 2019q3 files.
# NOTE(review): index=True (the default, made explicit here) writes the positional
# index as an unnamed first column. Confirm readers of this file rely on it before
# switching to index=False.
sec_data_wide.to_csv(
    data_root + '02-build/clean/2019q3/sec_data_wide.csv',
    index=True,
)