# Program Header

In [26]:
import pandas as pd
import numpy as np
import autoreload
import os
import sys
import jellyfish
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
nb_dir = os.path.split(os.getcwd())[0]
if nb_dir not in sys.path:
    sys.path.append(nb_dir)
from header import *

In [27]:
data_root = initialize_data_root("AY")

# Table of contents
1. [Import](#Import)
2. [Merge](#Merge)
3. [Clean](#Clean)
4. [Reshape](#Reshape)
5. [Save](#Save)

## Import <a name="Import"></a>

In [28]:
raw_sec_data = pd.read_csv(data_root+'02-build/clean/2019q3/sec_data_long.csv')
raw_tag_map = pd.read_csv(data_root+'02-build/clean/2019q3/tag_map.csv')
raw_cik_ticker_map = pd.read_csv(data_root+'02-build/clean/cik_ticker_map.csv')

In [29]:
raw_sec_data.columns

Index(['adsh', 'tag', 'version', 'coreg', 'ddate', 'qtrs', 'uom', 'value',
       'no_coreg', 'no_value', 'cik', 'company_name', 'sic', 'country',
       'period', 'form', 'fye', 'filed', 'accepted', 'instance', 'doc',
       'dyrqt'],
      dtype='object')

In [30]:
raw_tag_map.columns

Index(['clean_tag', 'tag', 'uom', 'qtrs'], dtype='object')

In [31]:
raw_cik_ticker_map.columns

Index(['cik', 'ticker'], dtype='object')

## Merge <a name="Merge"></a>

In [32]:
sec_data = raw_sec_data.merge(raw_tag_map, on = ['qtrs', 'uom', 'tag'])\
    .merge(raw_cik_ticker_map, on = ['cik'])

The goal is to get our data to the `adsh`, `dyrqt` level and use the `clean_tag`'s as columns, so we can't have duplicate values for the same `adsh-clean_tag-dyrqt`. In case of duplicates, let's keep the maximum value. The idea behind keeping the max is that smaller versions may be older (over 3 months instead of over 4 months) or the smaller groups may be categories of the larger one.

In [33]:
sec_data = sec_data.groupby(['adsh', 'qtrs', 'uom', 'clean_tag', 
                             'dyrqt', 'company_name', 'ticker',
                             'cik', 'sic', 'country', 'period', 'form'], as_index = False)['value']\
    .max()

## Clean <a name="Clean"></a>

Check that we're unique on the `adsh-dyrqt-cleantag` level

In [34]:
sec_data.groupby(['adsh', 'dyrqt', 'clean_tag'])\
    .nunique()\
    .sort_values('value', ascending = False)\
    .query("value > 1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,dyrqt,company_name,ticker,cik,sic,country,period,form,value
adsh,dyrqt,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
0000896841-19-000165,201902,AdjustmentsToAdditionalPaidInCapitalStockIssuedIssuanceCosts,1,2,1,1,1,1,1,1,1,1,1,1,2
0001428205-19-000214,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,2
0001628280-19-010519,201902,GainsLossesOnExtinguishmentOfDebt,1,2,1,1,1,1,1,1,1,1,1,1,2
0001691303-19-000036,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,2
0001376139-19-000053,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,2
0001564590-19-030407,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,2
0001564590-19-031538,201902,AllocatedShareBasedCompensationExpense,1,2,1,1,1,1,1,1,1,1,1,1,2
0001564590-19-030190,201902,CommonStockDividendsPerShareDeclared,1,2,1,1,1,1,1,1,1,1,1,1,2
0001748773-19-000004,201902,NetIncomeLoss,1,2,1,1,1,1,1,1,1,1,1,1,2
0001697500-19-000050,201902,AllocatedShareBasedCompensationExpense,1,2,1,1,1,1,1,1,1,1,1,1,2


In [9]:
sec_data = sec_data.sort_values(['period', 'ddate']).drop_duplicates(['adsh', 'clean_tag', 'period'], keep = 'first')

In [10]:
sec_data.groupby(['adsh', 'period', 'clean_tag'])\
    .nunique()\
    .sort_values('value', ascending = False)\
    .query("value > 1")

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,adsh,qtrs,uom,clean_tag,ddate,company_name,cik,sic,country,period,form,value
adsh,period,clean_tag,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


## Reshape <a name="Reshape"></a>

In [12]:
sec_data_wide = sec_data.pivot_table(values = 'value',
                                     index = ['adsh', 'company_name', 'ddate', 
                                              'cik', 'sic', 'country', 'period'],
                                    columns = 'clean_tag')\
    .reset_index()

In [13]:
percent_missing = sec_data_wide.isnull().sum() * 100 / len(sec_data_wide)
missing_value_df = pd.DataFrame({'column_name': sec_data_wide.columns,
                                 'percent_missing': percent_missing}).sort_values('percent_missing', ascending = True)

In [14]:
percent_missing_by_company = sec_data_wide.groupby('adsh').apply(lambda x: x.notnull().mean())

In [15]:
string = 'Ignore punctuation, please :)'
letter_occurences = pd.Series([i for i in string.lower()]).value_counts()

In [16]:
len(letter_occurences)

17

In [17]:
[j for j in range(26,26-len(letter_occurences),-1)]

[26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10]

In [18]:
from collections import Counter

In [19]:
clean_string = ''.join(x for x in string if x.isalpha()).lower()

In [20]:
counter = Counter(clean_string)

In [21]:
len(counter)

13

In [22]:
from itertools import accumulate

def get_max_contiguous_sum(sequence):
    int_sequence = [int(i) for i in sequence.split(",")]
    
    all_maxes = [-100]*len(int_sequence)
    for i in range(0, len(int_sequence)):
        max_sum = max(list(accumulate(int_sequence[i:])))
        all_maxes[i] = max_sum
    
    return(max(all_maxes))

In [23]:
get_max_contiguous_sum("2,3,-2,-1,10")

12

In [24]:
percent_missing.sort_values()

clean_tag
adsh                                                                                                                            0.000000
company_name                                                                                                                    0.000000
ddate                                                                                                                           0.000000
cik                                                                                                                             0.000000
sic                                                                                                                             0.000000
country                                                                                                                         0.000000
period                                                                                                                          0.000000
LiabilitiesAndStockholdersEquit

## Save <a name="Save"></a>
Create CSV versions in build/raw

In [25]:
sec_data_wide.to_csv(data_root+'02-build/clean/2019q3/sec_data_wide.csv')