## Parsing the 10-k file

Basic data extraction from the raw datasets to a semi-structured table

In [1]:
import re
import pandas as pd

In [2]:
with open('data/raw/8k-data/GOOG.txt', 'r') as f:
    data = f.read()

In [3]:
# Split into documents
documents = data.replace('<DOCUMENT>', '').split('</DOCUMENT>')

In [4]:
len(documents)

115

In [5]:
reg_FILE = re.compile('FILE:(.*)')
reg_TIME = re.compile('TIME:(.*)')
reg_EVENTS = re.compile('EVENTS:(.*)')
reg_ITEMS = re.compile('ITEM:(.*)')

In [6]:
def factory_parser(compiled_reg):
    return lambda d: map(lambda x: x.replace(',', '').strip(), re.findall(compiled_reg, d))

get_file = factory_parser(reg_FILE)
get_time = factory_parser(reg_TIME)
get_events = factory_parser(reg_EVENTS)
get_items = factory_parser(reg_ITEMS)

In [7]:
META_LINES = ['FILE', 'TIME', 'EVENTS', 'ITEM']

def get_text(doc):
    lines = [ ln for ln in doc.split('\n') if ln != ' ' ]
    g_lines = [ ln for ln in lines if not any(map(lambda x: ln.startswith(x), META_LINES)) ]
    return '\n'.join( g_lines )

In [8]:
# Parse documents
parsed_data = {
    'file': map(get_file, documents),
    'time': map(get_time, documents),
    'events': map(get_events, documents),
    'items': map(get_items, documents),
    'text': map(get_text, documents)
 }

In [9]:
struct_data = pd.DataFrame.from_dict(parsed_data)

In [10]:
struct_data.head()

Unnamed: 0,events,file,items,text,time
0,[Other events\tFinancial statements and exhibits],[GOOG/GOOG-8K-20040709060939.txt.gz],"[Other events, Financial statements and exhibits]","\nTEXT:\nOn July 6, 2004, Google Inc. (""Google...",[20040709060939]
1,[Other Events\tFinancial Statements and Exhibits],[GOOG/GOOG-8K-20041008171511.txt.gz],"[Other Events, Financial Statements and Exhibits]",\n\nTEXT:\nCheck the appropriate box below if ...,[20041008171511]
2,[Results of Operations and Financial Condition...,[GOOG/GOOG-8K-20041021162846.txt.gz],[Results of Operations and Financial Condition...,\n\nTEXT:\nCheck the appropriate box below if ...,[20041021162846]
3,[Other Events],[GOOG/GOOG-8K-20041119172648.txt.gz],[Other Events],\n\nTEXT:\nCheck the appropriate box below if ...,[20041119172648]
4,[Other Events\tFinancial Statements and Exhibits],[GOOG/GOOG-8K-20050113122048.txt.gz],"[Other Events, Financial Statements and Exhibits]",\n\nTEXT:\nCheck the appropriate box below if ...,[20050113122048]


In [11]:
struct_data.to_csv('data/parsed/8k-data/GOOG.csv')