# Program Header

In [6]:
import pandas as pd
import numpy as np
import os
import sys
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from header import *

In [7]:
data_root = initialize_data_root("AY")

In [8]:
import difflib
import jellyfish
import fuzzywuzzy
from fuzzywuzzy import process

# Table of contents
1. [Import](#Import)
2. [Create Tag Map](#CreateTagMap)
5. [Save](#Save)

## Import <a name="Import"></a>

In [9]:
raw_sec_data = pd.read_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv')

## Create Tag Map <a name="CreateTagMap"></a>

In [10]:
sec_data = raw_sec_data.copy()

Let's split the tags into groups that definitely don't belong together using `qtrs`, `uom`.

In [11]:
tag_counts = sec_data.groupby(['qtrs', 'uom'])['tag'].count()

In [12]:
tag_counts

qtrs  uom   
0     USD       249963
      pure         431
      shares     25490
1     USD       171873
      pure        1194
      shares     13455
Name: tag, dtype: int64

In [13]:
raw_tags = sec_data['tag'].unique()

A good tag has strong reach (at least 90%) and isn't so general as to have too many duplicates. To generate a strong tag, we use fuzzywuzzy to find similar tags and then find the longest substring between them. Then, we evaluate how good of a tag it is. Once a group of tags have been formed, they are removed from the raw tag group, making future operations faster and faster.

In [35]:
def test_tag_reach(list_of_tags):
    search_pattern = '|'.join(list_of_tags)
    total = sec_data['tag'].str.contains(search_pattern)
    return(sum(total)/len(sec_data.adsh.unique()))

In [48]:
def odd_one_out(list_of_tags):
    mean_values = sec_data.loc[sec_data.tag.isin(list_of_tags)].groupby('tag')['value'].mean()
    return(mean_values)

In [2]:
def matching_tags(test_tag):
    matching_tags = sec_data['tag'].str.contains(test_tag)
    return(sec_data['tag'].loc[matching_tags].unique())

In [51]:
def view_all_tags(adsh):
    tags = sec_data.loc[sec_data.adsh == adsh].tag.unique()
    return(tags)

In [19]:
test_tag_reach(['Revenue'])

9504

In [89]:
test_tag_reach('Inventory')

8014

In [21]:
process.extract('Revenue', sec_data['tag'].unique())

[('Revenues', 93),
 ('RevenueFromContractWithCustomerIncludingAssessedTax', 90),
 ('ContractWithCustomerLiabilityRevenueRecognized', 90),
 ('RevenueFromContractWithCustomerExcludingAssessedTax', 90),
 ('RevenueFromRelatedParties', 90)]

In [22]:
list_of_necessary_tags = ['Revenue', 'Net Income', 'Assets', 'Liabilities', 'Shareholder Equity', 'Operating Cash Flow', 'Depreciation']

In [56]:
test = process.extract('WeightedAverageNumberOfSharesOutstandingBasic', sec_data['tag'].unique())

In [57]:
test

[('WeightedAverageNumberOfSharesOutstandingBasic', 100),
 ('SharesOutstanding', 90),
 ('WeightedAverageNumberOfSharesOutstandingBasicAndDilutedFounderShares', 90),
 ('WeightedAverageNumberOfSharesOutstandingBasicIncludingLimitedPartnershipUnits',
  90),
 ('WeightedAverageNumberOfShareOutstandingBasicAndDiluted', 89)]

In [58]:
test_tag_reach([i[0] for i in test])

2.8088311688311687

In [55]:
odd_one_out([i[0] for i in test])

tag
ComprehensiveIncomeNetOfTax                                                        1.204429e+08
ComprehensiveIncomeNetOfTaxAttributableToNoncontrollingInterest                    9.120837e+06
ComprehensiveIncomeNetOfTaxAvailableToCommonStockholdersBasic                     -2.553000e+09
ComprehensiveIncomeNetOfTaxIncludingPortionAttributableToNoncontrollingInterest    2.007223e+08
StockholdersEquityBeforeAccumulatedOtherComprehensiveIncomeNetOfTaxes              3.926950e+08
Name: value, dtype: float64

In [53]:
from statistics import stdev

We'll convert these dictionary tag maps to a data-frame to easily use SQL-style joins.

In [15]:
tag_map_df = pd.DataFrame(columns = ['clean_tag', 'tag', 'uom', 'qtrs'])
tag_map_df

Unnamed: 0,clean_tag,tag,uom,qtrs


In [16]:
i = 0

for qtr in unique_qtr:
    for uom in unique_uom:
        map_df = pd.DataFrame(dict([(k,pd.Series(v)) for k,v in list_of_maps[i].items()]))\
            .melt()\
            .dropna()\
            .rename({'variable':'clean_tag', 'value':'tag'}, axis = 'columns')
        map_df['uom'] = uom
        map_df['qtrs'] = qtr
        tag_map_df = tag_map_df.append(map_df, ignore_index = True)
        
        i += 1

In [17]:
print("Consolidated ",round((1-len(tag_map_df.clean_tag.unique())/len(tag_map_df.tag.unique()))*100,2),"% of tags", sep = "")

Consolidated 78.11% of tags


In addition to the algorithm, we use domain knowledge to add in exceptions.

## Save <a name="Save"></a>
Save tag map to build/clean

In [20]:
# Save
tag_map_df.to_csv(data_root+'02-build/clean/2019q3/tag_map.csv', index = False)