# Program Header

In [1]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
import jellyfish
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Put the parent of the current working directory on the import path
# (presumably so the `header` module imported right after this can be found).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)
from header import *

In [2]:
# Resolve the base data directory used by every read/write below.
# `initialize_data_root` is not defined in this notebook (presumably it comes from
# `from header import *`; the meaning of the "AY" argument is not visible here — confirm).
# Later cells build paths as `data_root + '02-build/...'`, so the value is expected to be a
# string that already ends with a path separator.
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Merge](#Merge)
3. [Reshape](#Reshape)
4. [Save](#Save)

## Import <a name="Import"></a>

In [7]:
# Inputs for 2019Q3: the tag lookup (tag/uom/qtrs -> clean_tag) and the long-format SEC data.
raw_tag_map = pd.read_csv(data_root + "02-build/clean/2019q3/tag_map.csv")
raw_sec_data = pd.read_csv(data_root + "02-build/clean/2019q3/sec_data_long.csv")

In [8]:
# Show the columns of the long-format data; the merge below joins on `tag`, `uom` and `qtrs`.
raw_sec_data.columns

Index(['index', 'adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom',
       'value', 'no_coreg', 'no_value', 'cik', 'company_name', 'sic',
       'country', 'period', 'form', 'fye', 'accepted', 'instance', 'dyear',
       'doc'],
      dtype='object')

In [9]:
# Show the columns of the tag lookup; `clean_tag` is the column the merge adds to the data.
raw_tag_map.columns

Index(['clean_tag', 'tag', 'uom', 'qtrs'], dtype='object')

## Merge <a name="Merge"></a>

In [11]:
# Attach `clean_tag` to each row via an inner join on (qtrs, uom, tag); rows whose tag has no
# entry in the lookup are dropped. Then collapse to one row per key below, keeping the
# largest `value` when several raw tags map to the same clean tag.
sec_data = (
    raw_sec_data
    .merge(raw_tag_map, how='inner', on=['qtrs', 'uom', 'tag'])
    .groupby(
        [
            'adsh', 'qtrs', 'uom', 'clean_tag', 'ddate', 'company_name',
            'cik', 'sic', 'country', 'period', 'form',
        ],
        as_index=False,
    )['value']
    .max()
)

## Reshape <a name="Reshape"></a>

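Check whether the data is unique at the `adsh`-`period`-`clean_tag` level. Every key listed in the output has more than one distinct `value`; in the rows shown, those keys also span several `ddate` values, so the data is not unique at this level.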

In [16]:
# Count distinct entries per column inside each (adsh, period, clean_tag) key and keep only
# the keys holding more than one distinct `value`, largest count first.
# An empty result means no key carries conflicting values.
(
    sec_data
    .groupby(['adsh', 'period', 'clean_tag'])
    .nunique()
    .sort_values(by='value', ascending=False)
    .query("value > 1")
)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,ddate,company_name,cik,sic,country,period,form,value
adsh,period,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0001493152-19-013320,2019-06-30,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3
0001393905-19-000238,2019-03-31,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3
0001564590-19-030053,2019-06-30,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3
0001493152-19-012396,2019-06-30,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3
0001493152-19-013048,2019-06-30,BusinessCombinationRecognizedIdentifiableAssetsAcquiredAndLiabilitiesAssumedAssets,1,1,1,1,3,1,1,1,1,1,1,3
0001493152-19-010518,2019-05-31,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3
0001493152-19-012321,2019-06-30,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3
0001668428-19-000069,2019-06-30,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3
0001721868-19-000428,2019-07-31,Q1StockbasedCompensationInShares,1,1,1,1,3,1,1,1,1,1,1,3
0001437749-19-016786,2019-06-30,CommonStockSharesAuthorized,1,1,1,1,3,1,1,1,1,1,1,3


In [None]:
# Reshape long -> wide: one row per
# (adsh, company_name, ddate, qtrs, uom, cik, sic, country, period) and one column per clean_tag.
#
# `aggfunc` is spelled out on purpose. pivot_table defaults to the mean, so if several long
# rows ever share a pivot key (for example rows that differ only in `form`, which is part of
# the key `sec_data` was grouped by but is not in this index) they would be silently averaged
# into a number that was never reported. `max` matches the aggregation used to build `sec_data`
# and gives the same result as the mean whenever each key has a single row.
sec_data_wide = (
    sec_data
    .pivot_table(
        values='value',
        index=['adsh', 'company_name', 'ddate', 'qtrs', 'uom',
               'cik', 'sic', 'country', 'period'],
        columns='clean_tag',
        aggfunc='max',
    )
    .reset_index()
)

In [None]:
# Percentage (0-100) of null cells in each column of the wide table.
percent_missing = (
    sec_data_wide.isnull().sum() * 100 / len(sec_data_wide)
)

# Same numbers as a two-column table, least-missing columns first.
missing_value_df = (
    pd.DataFrame(
        {
            'column_name': sec_data_wide.columns,
            'percent_missing': percent_missing,
        }
    )
    .sort_values(by='percent_missing', ascending=True)
)

In [None]:
# For each `adsh`, count the rows with a non-null StockholdersEquity, then tally how many
# `adsh` values share each count.
(
    sec_data_wide
    .groupby('adsh')['StockholdersEquity']
    .count()
    .value_counts()
)

In [None]:
# Percentage (0-100) of missing values in every column, computed separately for each `adsh`.
# Uses `isnull()` scaled by 100 so the result is what the variable name says and is on the
# same scale as `percent_missing`; `notnull().mean()` would instead give the fraction (0-1)
# of values that are present.
percent_missing_by_company = sec_data_wide.groupby('adsh').apply(
    lambda adsh_rows: adsh_rows.isnull().mean() * 100
)

In [None]:
# Display the per-column missing-value percentages (sorted ascending when the table was built).
missing_value_df

## Save <a name="Save"></a>
Write CSV versions of the long and wide tables to `02-build/clean/2019q3`.

In [None]:
# Persist the merged and aggregated long table.
# It is written under its own file name: `sec_data_long.csv` is the file this notebook reads
# as input, and replacing it with the aggregated table (which has `clean_tag` but no `tag`
# column) would make the merge cell fail the next time the notebook is run top to bottom.
# NOTE(review): point any downstream reader at this file name, or choose another name that
# does not collide with the input.
# `index=False`: the index is just the default integer range left by `groupby(..., as_index=False)`.
sec_data.to_csv(data_root+'02-build/clean/2019q3/sec_data_long_tagged.csv', index=False)

In [None]:
# Persist the wide table. `index=False` because the index is only the default integer range
# left by `reset_index()`; writing it would add a meaningless unnamed first column to the CSV.
sec_data_wide.to_csv(data_root+'02-build/clean/2019q3/sec_data_wide.csv', index=False)