In [None]:
# SageMaker JumpStart solution: "Dashboarding SEC Text for Financial NLP"
# SageMaker JumpStart Industry SDK: https://github.com/aws/sagemaker-jumpstart-industry-pack

In [None]:
# Quietly install, or upgrade to the newest release of, the sagemaker and
# smjsindustry packages; no versions are pinned.
%pip -q install sagemaker smjsindustry --upgrade

In [None]:
import boto3
import sagemaker
import smjsindustry
import pandas as pd
import re

In [None]:
# Show which version of the SageMaker SDK is loaded in this kernel.
print(sagemaker.__version__)

In [None]:
# SageMaker session shared by the data-loading job below.
session = sagemaker.Session()

# Default S3 bucket of that session; the dataset is written to and later
# downloaded from this bucket.
bucket = session.default_bucket()

# Execution role that is handed to the DataLoader.
role = sagemaker.get_execution_role()

In [None]:
from smjsindustry.finance.processor import DataLoader
from smjsindustry.finance.processor_config import EDGARDataSetConfig

# S3 prefix (inside the default bucket) and file name of the generated dataset.
# Both names are reused by the download cell further down.
sec_processed_folder = 'julsimon-sec-processed'
filename = 'dataset_10k_10q.csv'

# Selection of filings: ticker 'amzn', form types 10-K and 10-Q, filed between
# 2019-01-01 and 2020-12-31.
# NOTE(review): the user-agent e-mail looks like a placeholder; presumably a
# real contact address is expected here. Confirm before running.
dataset_config = EDGARDataSetConfig(
    tickers_or_ciks=['amzn'],
    form_types=['10-K', '10-Q'],
    filing_date_start='2019-01-01',
    filing_date_end='2020-12-31',
    email_as_user_agent='test-user@test.com',
)

# Loader configured with a single ml.c5.2xlarge instance and a 30 GB volume.
data_loader = DataLoader(
    role=role,
    sagemaker_session=session,
    instance_count=1,
    instance_type='ml.c5.2xlarge',
    volume_size_in_gb=30,
)

# Output location: s3://<bucket>/<sec_processed_folder>/output
s3_output_uri = 's3://{}/{}/{}'.format(bucket, sec_processed_folder, 'output')

data_loader.load(
    dataset_config,
    s3_output_uri,
    filename,
    wait=True,
    logs=True,
)

In [None]:
# Object key of the dataset: <sec_processed_folder>/output/<filename>
s3_key = '{}/{}/{}'.format(sec_processed_folder, 'output', filename)

# Copy the CSV from the bucket to a local file of the same name, then load it.
client = boto3.client('s3')
client.download_file(bucket, s3_key, filename)

df = pd.read_csv(filename)
df

In [None]:
# Execute the companion notebook SEC_functions.ipynb.
# NOTE(review): presumably this is where process_filings(), called in the
# following cells, gets defined. Confirm against that notebook.
%run SEC_functions.ipynb

In [None]:
# Build the 10-K frame by passing the full dataset and the form type to
# process_filings.
df_10K = process_filings(df, '10-K')

In [None]:
# Preview the first rows of the 10-K frame.
df_10K.head()

In [None]:
# Build the 10-Q frame by passing the full dataset and the form type to
# process_filings.
df_10Q = process_filings(df, '10-Q')

In [None]:
# Preview the first rows of the 10-Q frame.
df_10Q.head()

In [None]:
# Write each processed frame to its own local CSV file, with a header row and
# without the index column.
for frame, csv_path in ((df_10K, 'dataset_10k.csv'), (df_10Q, 'dataset_10q.csv')):
    frame.to_csv(csv_path, index=False, header=True)