# Scraping

## Import Libraries

In [1]:
# Importing built-in libraries 

import os
import re
import unicodedata
import json
from time import gmtime, strftime
from datetime import datetime, timedelta


# Importing libraries you need to install
import requests
from lxml import html
import bs4 as bs
from tqdm import tqdm
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from gensim import corpora
from gensim.models import TfidfModel, LdaMulticore
from wordcloud import WordCloud
import pyLDAvis.gensim_models
from datetime import datetime
from collections import Counter
import seaborn as sns
from textblob import TextBlob
import pandas_datareader.data as web

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ducnguyen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
  PANDAS_VERSION = LooseVersion(pd.__version__)
  PANDAS_0210 = PANDAS_VERSION >= LooseVersion("0.21.0")
  PANDAS_0220 = PANDAS_VERSION >= LooseVersion("0.22.0")
  PANDAS_0230 = PANDAS_VERSION >= LooseVersion("0.23.0")


## Map Stock Tickers to CIKs

SEC EDGAR APIs require inputs to be CIKs. This step essentially maps stock tickers to their corresponding CIKs so that it is more convenient to perform API calls.

In [2]:
dict = {}

In [3]:
f = open("company_tickers.json") # open the file
data = json.load(f) # load data
print(data)

{'0': {'cik_str': 320193, 'ticker': 'AAPL', 'title': 'Apple Inc.'}, '1': {'cik_str': 789019, 'ticker': 'MSFT', 'title': 'MICROSOFT CORP'}, '2': {'cik_str': 1018724, 'ticker': 'AMZN', 'title': 'AMAZON COM INC'}, '3': {'cik_str': 1067983, 'ticker': 'BRK-B', 'title': 'BERKSHIRE HATHAWAY INC'}, '4': {'cik_str': 731766, 'ticker': 'UNH', 'title': 'UNITEDHEALTH GROUP INC'}, '5': {'cik_str': 200406, 'ticker': 'JNJ', 'title': 'JOHNSON & JOHNSON'}, '6': {'cik_str': 34088, 'ticker': 'XOM', 'title': 'EXXON MOBIL CORP'}, '7': {'cik_str': 104169, 'ticker': 'WMT', 'title': 'Walmart Inc.'}, '8': {'cik_str': 1046179, 'ticker': 'TSM', 'title': 'TAIWAN SEMICONDUCTOR MANUFACTURING CO LTD'}, '9': {'cik_str': 884394, 'ticker': 'SPY', 'title': 'SPDR S&P 500 ETF TRUST'}, '10': {'cik_str': 59478, 'ticker': 'LLY', 'title': 'ELI LILLY & Co'}, '11': {'cik_str': 19617, 'ticker': 'JPM', 'title': 'JPMORGAN CHASE & CO'}, '12': {'cik_str': 93410, 'ticker': 'CVX', 'title': 'CHEVRON CORP'}, '13': {'cik_str': 80424, 'tic

In [4]:
tickerCIK = {}
for i, l in data.items():
    currTicker = l['ticker']
    currCIK = l['cik_str']
    tickerCIK[currTicker] = currCIK
print(tickerCIK)

{'AAPL': 320193, 'MSFT': 789019, 'AMZN': 1018724, 'BRK-B': 1067983, 'UNH': 731766, 'JNJ': 200406, 'XOM': 34088, 'WMT': 104169, 'TSM': 1046179, 'SPY': 884394, 'LLY': 59478, 'JPM': 19617, 'CVX': 93410, 'PG': 80424, 'LVMUY': 824046, 'HD': 354950, 'NVDA': 1045810, 'MA': 1141391, 'ABBV': 1551152, 'BAC': 70858, 'NVO': 353278, 'KO': 21344, 'PFE': 78003, 'MRK': 310158, 'PEP': 77476, 'TSLA': 1318605, 'COST': 909832, 'BABA': 1577552, 'TMO': 97745, 'TM': 1094517, 'DHR': 313616, 'ABT': 1800, 'AVGO': 1730168, 'MCD': 63908, 'AZN': 901832, 'ASML': 937966, 'TMUS': 1283699, 'NVS': 1114448, 'DIS': 1744489, 'CSCO': 858877, 'ORCL': 1341439, 'WFC': 72971, 'ACN': 1467373, 'BMY': 14272, 'COP': 1163165, 'VZ': 732712, 'NEE': 753308, 'TXN': 97476, 'CRM': 1108524, 'NKE': 320187, 'LIN': 1707925, 'UPS': 1090727, 'MS': 895421, 'SCHW': 316709, 'PM': 1413329, 'ADBE': 796343, 'LOW': 60667, 'AMGN': 318154, 'CMCSA': 1166691, 'TTE': 879764, 'BHP': 811809, 'RTX': 101829, 'UNP': 100885, 'QCOM': 804328, 'RY': 1000275, 'HON'

## Actual Scraping

### Define Paths to Store Data

In [5]:
os.mkdir("10Ks")
os.listdir()

FileExistsError: [Errno 17] File exists: '10Ks'

### Actual Scraping

1. Scrape the webpage that contains all 10-K filings over the years.
2. For each 10-K filing, access their "Document" table.
3. Access the corresponding document link which has "Description" as 10-K.

In [6]:
# scrape the webpage that contains 10-K filings over the years
all_10Ks = "https://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK={}&type=10-K&dateb=&owner=exclude&count=40&search_text="
filing_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/%s/%s-index.html'
doc_url_base_10k = 'http://www.sec.gov/Archives/edgar/data/%s/%s/%s'

In [170]:
headers = {"User-Agent": "Sentient ducnguyen10214@berkeley.edu", "Accept-Encoding": "gzip, deflate", "Host": "www.sec.gov"}

In [171]:
def Scrape10K(base_url, filing_url, doc_url, cik, headers):
    
    # Check if we've already scraped this CIK
    try:
        os.mkdir(str(cik))
    except OSError:
        print("The CIK has already been scraped", cik)
        return
    
    # Setting current directory for that CIK
    os.chdir(str(cik))
    
    # Request list of 10-K filings --> STEP 1 in the pictures
    base_res = requests.get(base_url.format(cik), headers=headers)  #  STEP 1 in the pictures
    
    # Parse the response HTML using BeautifulSoup
    base_soup = bs.BeautifulSoup(base_res.text, "lxml")

    # Extract all tables from the response
    base_html_tables = base_soup.find_all('table')
    
    # Check that the table we're looking for exists and If it doesn't, exit
    if len(base_html_tables) < 3:
        os.chdir('..')
        return
    
    # Parse the Filings table
    fil_table = pd.read_html(str(base_html_tables[2]), header=0)[0]
    fil_table['Filings'] = [str(y) for y in fil_table['Filings']]

    # Get only 10-K and 10-K405 document filings
    fil_table = fil_table[(fil_table['Filings'] == '10-K')| (fil_table['Filings'] == '10-K405') ]
    
    # If filings table doesn't have any 10-Ks or 10-K405s, exit
    if len(fil_table) == 0:
        os.chdir('..')
        return
    
    # Get accession number for each 10-K and 10-K405 filing
    fil_table['Acc_No'] = [x.replace('\xa0',' ')
                               .split('Acc-no: ')[1]
                               .split(' ')[0] for x in fil_table['Description']]
    
    print("fil_table")
    print(fil_table)
    
    # Iterate through each filing and scrape the corresponding document...
    for index, row in fil_table.iterrows():
        # find the unique accession number for filing
        acc_no = str(row['Acc_No'])
        
        # find the page with the accession number and Parse the table of documents for the filing
        docs_page_html = bs.BeautifulSoup(requests.get(filing_url % (cik, acc_no), headers=headers).text, 'lxml') # STEP 2 in the pictures
        docs_tables = docs_page_html.find_all('table')
        
        if len(docs_tables)==0:
            continue
            
        #converting the HTML table to a Dataframe    
        docs_df = pd.read_html(str(docs_tables[0]), header=0)[0]
        docs_df['Type'] = [str(x) for x in docs_df['Type']]
        
        # Get the 10-K for the filing
        docs_df = docs_df[(docs_df['Type'] == '10-K')| (docs_df['Type'] == '10-K405')]
        
        # If there aren't any 10-K, skip to the next filing
        if len(docs_df)==0:
            continue
        elif len(docs_df)>0:
            docs_df = docs_df.iloc[0]
        
        docname = docs_df['Document']
    
        if str(docname) != 'nan':
        # STEP 3 in the pictures
            link = str(doc_url % (cik, acc_no.replace('-', ''), docname)).split()[0]
            file = requests.get(str(doc_url % (cik, acc_no.replace('-', ''), docname)).split()[0], headers=headers)

            # Save the file in appropriate format
            if '.txt' in str(docname):
                # Save text as TXT
                date = str(row['Filing Date'])
                filename = str(cik) + '_' + date + '.txt'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
            else:
                # Save text as HTML
                date = str(row['Filing Date'])
                filename = str(cik) + '_' + date + '.html'
                html_file = open(filename, 'a')
                html_file.write(file.text)
                html_file.close()
        
    return

In [156]:
path10K = "/Users/ducnguyen/Sentient/10Ks/"
os.chdir(path10K)
Scrape10K(base_url=all_10Ks, filing_url=filing_url_base_10k, doc_url=doc_url_base_10k, cik=tickerCIK['AAPL'], headers=headers)

# # Iterate over CIKs and scrape 10-Ks
# for i, l in tickerCIK.items():
#     Scrape10K(base_url=all_10Ks, filing_url=filing_url_base_10k, doc_url=doc_url_base_10k, cik=str(l))

fil_table
    Filings                      Format  \
0      10-K  Documents Interactive Data   
1      10-K  Documents Interactive Data   
2      10-K  Documents Interactive Data   
3      10-K  Documents Interactive Data   
4      10-K  Documents Interactive Data   
5      10-K  Documents Interactive Data   
6      10-K  Documents Interactive Data   
7      10-K  Documents Interactive Data   
8      10-K  Documents Interactive Data   
9      10-K  Documents Interactive Data   
10     10-K  Documents Interactive Data   
11     10-K  Documents Interactive Data   
13     10-K  Documents Interactive Data   
14     10-K                   Documents   
15     10-K                   Documents   
16     10-K                   Documents   
17     10-K                   Documents   
18     10-K                   Documents   
19     10-K                   Documents   
20     10-K                   Documents   
21  10-K405                   Documents   
22     10-K                   Documents   
2