# Program Header

In [17]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
import jellyfish
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"
# Put the parent of the current working directory on the import path
# (presumably where the project's `header` module lives -- confirm).
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    # Guarded so that re-running the cell does not append the same entry twice.
    sys.path.append(nb_dir)
from header import *

In [18]:
# Resolve the root of the data tree. `initialize_data_root` is not defined in this
# notebook (presumably it comes from `from header import *`); the meaning of the
# "AY" argument is not visible here -- TODO confirm. The result is used below as a
# string prefix for file paths, so it is expected to end with a path separator.
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Merge](#Merge)
3. [Clean](#Clean)
4. [Reshape](#Reshape)
5. [Save](#Save)

## Import <a name="Import"></a>

In [29]:
# Load the long-format SEC data and the raw-tag -> clean-tag mapping for 2019q3.
# Paths are built by plain string concatenation onto `data_root`.
raw_sec_data = pd.read_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv')
raw_tag_map = pd.read_csv(data_root+'02-build/clean/2019q3/tag_map.csv')

In [30]:
# Inspect which columns the long-format SEC data provides.
raw_sec_data.columns

Index(['index', 'adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom',
       'value', 'no_coreg', 'no_value', 'cik', 'company_name', 'sic',
       'country', 'period', 'form', 'fye', 'accepted', 'instance', 'dyear',
       'doc'],
      dtype='object')

In [31]:
# Inspect the tag map's columns; `tag`, `uom` and `qtrs` are the join keys used in the merge below.
raw_tag_map.columns

Index(['clean_tag', 'tag', 'uom', 'qtrs'], dtype='object')

## Merge <a name="Merge"></a>

In [32]:
# Attach `clean_tag` to each record via a join on (qtrs, uom, tag). `merge` defaults
# to an inner join, so records without a matching tag-map row are dropped.
# Then collapse to one row per grouping-key combination, keeping the maximum `value`
# when several rows share the same keys.
# NOTE(review): groupby discards rows whose key columns contain NaN by default --
# confirm none of the key columns (e.g. sic, country) have missing values.
sec_data = (
    raw_sec_data
    .merge(raw_tag_map, on=['qtrs', 'uom', 'tag'])
    .groupby(
        ['adsh', 'qtrs', 'uom', 'clean_tag', 'ddate', 'company_name',
         'cik', 'sic', 'country', 'period', 'form'],
        as_index=False,
    )['value']
    .max()
)

## Clean <a name="Clean"></a>

Check whether the data are unique at the `adsh`-`period`-`clean_tag` level.

In [41]:
# Diagnostic: count distinct entries per column within each (adsh, period, clean_tag)
# group and list the groups that hold more than one distinct `value`, largest first.
(
    sec_data
    .groupby(['adsh', 'period', 'clean_tag'])
    .nunique()
    .sort_values(by='value', ascending=False)
    .query("value > 1")
)

Unnamed: 0,adsh,qtrs,uom,clean_tag,ddate,company_name,cik,sic,country,period,form,value
0,1,1,1,1,3,1,1,1,1,1,1,3
1,1,1,1,1,3,1,1,1,1,1,1,3
2,1,1,1,1,3,1,1,1,1,1,1,3
3,1,1,1,1,3,1,1,1,1,1,1,3
4,1,1,1,1,3,1,1,1,1,1,1,3
5,1,1,1,1,3,1,1,1,1,1,1,3
6,1,1,1,1,3,1,1,1,1,1,1,3
7,1,1,1,1,3,1,1,1,1,1,1,3
8,1,1,1,1,3,1,1,1,1,1,1,3
9,1,1,1,1,3,1,1,1,1,1,1,3


In [87]:
# Reduce to one row per (adsh, clean_tag, period). Rows are ordered by period and
# then ddate (both ascending), so `keep='first'` retains the earliest ddate per key.
# NOTE(review): confirm that the earliest ddate is the intended pick (rather than,
# say, the ddate that matches `period`).
sec_data = (
    sec_data
    .sort_values(by=['period', 'ddate'])
    .drop_duplicates(subset=['adsh', 'clean_tag', 'period'], keep='first')
)

In [88]:
# Post-cleaning check: every (adsh, period, clean_tag) key must now occur exactly once.
# Counting distinct `value`s alone would miss keys that repeat with an identical
# value, so test the keys directly and stop the notebook if uniqueness does not hold.
assert not sec_data.duplicated(subset=['adsh', 'period', 'clean_tag']).any(), \
    "sec_data is not unique on adsh-period-clean_tag after de-duplication"

# Same diagnostic table as before the cleaning step; it is expected to be empty now.
sec_data.groupby(['adsh', 'period', 'clean_tag'])\
    .nunique()\
    .sort_values('value', ascending = False)\
    .query("value > 1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,ddate,company_name,cik,sic,country,period,form,value
adsh,period,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


## Reshape <a name="Reshape"></a>

In [96]:
# Reshape long -> wide: one column per `clean_tag`, holding `value`.
# `aggfunc='mean'` is pivot_table's default, spelled out here so it is visible that
# any duplicate cell would be averaged.
# NOTE(review): `ddate` is part of the row key, so tags of the same `adsh` that kept
# different ddates end up on separate rows -- confirm this is intended.
sec_data_wide = (
    sec_data
    .pivot_table(
        index=['adsh', 'company_name', 'ddate',
               'cik', 'sic', 'country', 'period'],
        columns='clean_tag',
        values='value',
        aggfunc='mean',
    )
    .reset_index()
)

In [97]:
# Percentage of missing cells in each column of the wide table.
percent_missing = sec_data_wide.isna().sum() * 100 / len(sec_data_wide)

# The same figures as a two-column frame, ordered from least to most missing.
missing_value_df = (
    pd.DataFrame({
        'column_name': sec_data_wide.columns,
        'percent_missing': percent_missing,
    })
    .sort_values(by='percent_missing', ascending=True)
)

In [98]:
# Percentage of missing cells per column within each `adsh` group.
# Uses isnull() (not notnull()) and scales by 100 so the result matches the variable
# name and is in the same units as `percent_missing`.
# NOTE(review): the grouping key is `adsh`, although the name says "by_company" --
# confirm whether `cik`/`company_name` was meant.
percent_missing_by_company = sec_data_wide.groupby('adsh').apply(lambda x: x.isnull().mean() * 100)

In [102]:
# Show per-column missingness, most complete columns first.
percent_missing.sort_values(ascending=True)

clean_tag
adsh                                                                                             0.000000
company_name                                                                                     0.000000
ddate                                                                                            0.000000
cik                                                                                              0.000000
sic                                                                                              0.000000
country                                                                                          0.000000
period                                                                                           0.000000
Liabilities                                                                                     35.816619
LiabilitiesAndStockholdersEquity                                                                35.862464
Assets                              

## Save <a name="Save"></a>
Write the wide table as a CSV file to `02-build/clean/2019q3/` under the data root.

In [100]:
# Persist the wide table next to the long-format inputs.
# NOTE(review): `index` is left at its default, so the row index produced by
# reset_index() is written as an extra unnamed first column -- confirm downstream
# readers expect it, otherwise pass index=False.
sec_data_wide.to_csv(data_root+'02-build/clean/2019q3/sec_data_wide.csv')