# Set Up

In [1]:
import os
import re
import bs4 as BeautifulSoup 
import time
import requests
import random
import jsonlines

# Get the Company List

Workflow:
1. read the company list `txt` file;
2. get the symbols;
3. put them together <sup>[[1]](#ft1)</sup>

In [2]:
def get_industry_list() -> list:
    """
    List the industries for which a company-list file exists.

    Looks in the `data` folder that sits next to the current working
    directory's parent (i.e. `<parent of cwd>/data`) and returns the base name
    of every `*.txt` file found there, with the `.txt` suffix removed.

    Output:
        sorted list of industry names, e.g. ['agriculture', 'travel']
    """
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = os.path.join(root_path, 'data')
    suffix = '.txt'
    names = []
    # sorted() makes the result independent of the order os.listdir happens to use
    for file in sorted(os.listdir(data_path)):
        if not file.endswith(suffix):
            continue
        # Strip the suffix directly instead of using a regex: a `\w+` search
        # returns the wrong stem for names such as 'oil-gas.txt'.
        stem = file[:-len(suffix)]
        if stem:  # ignore a file that is literally named '.txt'
            names.append(stem)
    return names

def get_company_list(l: list) -> dict:
    """
    Read the ticker symbols of every industry named in `l`.

    For each name, the file `<parent of cwd>/data/<name>.txt` is read line by
    line. A symbol is a run of upper-case letters, digits, '-', '.' or '&'
    that starts the line and is immediately followed by a tab.

    Input:
        l = list of industry names (file names without the `.txt` suffix)
    Output:
        dictionary = {industry name: [symbol, symbol, ...]}
    """
    data_dir = os.path.join(os.path.abspath(os.path.dirname(os.getcwd())), 'data')
    # symbol at the very start of a line, terminated by a tab
    symbol_re = re.compile('(?<=^)[A-Z0-9-.&]+(?=\t)')

    symbols_by_industry = {}
    for industry_name in l:
        txt_path = os.path.join(data_dir, industry_name + '.txt')
        with open(txt_path, 'r') as handle:
            rows = handle.readlines()
        # collect the symbols of all rows, keeping file order
        symbols_by_industry[industry_name] = [
            symbol for row in rows for symbol in symbol_re.findall(row)
        ]
    return symbols_by_industry

In [3]:
# Discover the industries from the data folder, then load the ticker symbols of each one.
# `comps` maps industry name -> list of symbols and is the input of the crawling step below.
industry = get_industry_list()
comps = get_company_list(industry)
print('data is ready.')

data is ready.


# Get the Profile Page for Each Company

Crawl the profile pages from Yahoo Finance; each industry gets its own folder in which the pages of its companies are stored.

In [22]:
def get_profile(inds: str, d: dict, delay_range: tuple = (3, 15), timeout: float = 30):
    """
    Download the Yahoo Finance profile page of every company in one industry.

    Pages are stored as `<parent of cwd>/webpage/<inds>/<symbol>.txt`. A symbol
    whose file already exists is skipped, so an interrupted run can simply be
    started again. Progress and a time estimate are printed after each request.

    Input:
        inds = industry name, must be a key of `d`
        d = dictionary {industry name: [symbol, ...]}
        delay_range = (min, max) whole seconds; a random pause in this range
                      is taken before every request
        timeout = seconds to wait for the server before giving up on a request
    Output:
        None
    Raises:
        AssertionError if `inds` is not a key of `d`
    """
    assert inds in d.keys(), 'This industry is not in the list for now.'
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    inds_path = os.path.join(web_path, inds)
    # create the corresponding industry directory (and `webpage` itself if needed);
    # an already existing directory is fine, any other OS error is reported
    os.makedirs(inds_path, exist_ok=True)

    headers = {'User-agent': 'Mozilla/5.0'}
    # List the folder once, and only count symbols that really still need a
    # download (stray files such as .DS_Store must not distort the total).
    existing = set(os.listdir(inds_path))
    pending = [c for c in dict.fromkeys(d[inds]) if c + '.txt' not in existing]
    total = len(pending)
    t1 = time.time()
    for count, c in enumerate(pending, start=1):
        file_path = os.path.join(inds_path, c + '.txt')
        url = ("https://finance.yahoo.com/quote/{s}/profile?p={s}".format(s= c))
        time.sleep(random.randint(*delay_range)) # pause between requests
        try:
            webpage = requests.get(url, headers=headers, timeout=timeout)
            webpage.raise_for_status()
        except requests.RequestException as err:
            # Do not save an error page: the file would look like a finished
            # download and the symbol would never be retried.
            print('Skipped {c}: {e}'.format(c = c, e = err))
        else:
            with open(file_path, 'w') as file:
                file.write(webpage.text)
        t2 = time.time()
        print('Progress {c}/{t}.'.format(c = count, t = total))
        print('Have cost {t:.3f} seconds; average cost time {s:.3f} seconds'.format(t = t2 - t1, s = (t2-t1)/count))
        print('Estimated time to complete {t:.3f} mins.'.format(t = (total-count)*(t2-t1)/count/60))
        print('\n')
    return
    
def get_all_profile(d: dict):
    """
    Download the profile pages of every industry in `d`, one industry after another.

    Input:
        d = dictionary {industry name: [symbol, ...]}
    """
    for industry_name in d:
        get_profile(industry_name, d)
        print('\n\n\n {i} is done'.format(i = industry_name))

    print('\n\n\nall done.')

In [None]:
# Download the profile pages of the 'travel' industry.
# Long-running: get_profile pauses a few seconds before every request.
get_profile('travel', comps)

Progress 1/268.
Have cost 9.964 seconds; average cost time 9.964 seconds
Estimated time to complete 44.342 mins.


Progress 2/268.
Have cost 18.982 seconds; average cost time 9.491 seconds
Estimated time to complete 42.077 mins.


Progress 3/268.
Have cost 22.960 seconds; average cost time 7.653 seconds
Estimated time to complete 33.802 mins.


Progress 4/268.
Have cost 38.936 seconds; average cost time 9.734 seconds
Estimated time to complete 42.830 mins.


Progress 5/268.
Have cost 49.922 seconds; average cost time 9.984 seconds
Estimated time to complete 43.765 mins.


Progress 6/268.
Have cost 64.951 seconds; average cost time 10.825 seconds
Estimated time to complete 47.270 mins.


Progress 7/268.
Have cost 72.853 seconds; average cost time 10.408 seconds
Estimated time to complete 45.273 mins.


Progress 8/268.
Have cost 82.641 seconds; average cost time 10.330 seconds
Estimated time to complete 44.764 mins.


Progress 9/268.
Have cost 92.488 seconds; average cost time 10.276 sec

Progress 71/268.
Have cost 711.192 seconds; average cost time 10.017 seconds
Estimated time to complete 32.888 mins.


Progress 72/268.
Have cost 720.020 seconds; average cost time 10.000 seconds
Estimated time to complete 32.668 mins.


Progress 73/268.
Have cost 731.854 seconds; average cost time 10.025 seconds
Estimated time to complete 32.583 mins.


Progress 74/268.
Have cost 736.906 seconds; average cost time 9.958 seconds
Estimated time to complete 32.198 mins.


Progress 75/268.
Have cost 749.887 seconds; average cost time 9.998 seconds
Estimated time to complete 32.162 mins.


Progress 76/268.
Have cost 761.786 seconds; average cost time 10.023 seconds
Estimated time to complete 32.075 mins.


Progress 77/268.
Have cost 776.562 seconds; average cost time 10.085 seconds
Estimated time to complete 32.105 mins.


Progress 78/268.
Have cost 786.510 seconds; average cost time 10.083 seconds
Estimated time to complete 31.931 mins.


Progress 79/268.
Have cost 794.514 seconds; averag

Progress 140/268.
Have cost 1359.341 seconds; average cost time 9.710 seconds
Estimated time to complete 20.714 mins.


Progress 141/268.
Have cost 1371.132 seconds; average cost time 9.724 seconds
Estimated time to complete 20.583 mins.


Progress 142/268.
Have cost 1382.971 seconds; average cost time 9.739 seconds
Estimated time to complete 20.452 mins.


Progress 143/268.
Have cost 1388.725 seconds; average cost time 9.711 seconds
Estimated time to complete 20.232 mins.


Progress 144/268.
Have cost 1395.491 seconds; average cost time 9.691 seconds
Estimated time to complete 20.028 mins.


Progress 145/268.
Have cost 1402.357 seconds; average cost time 9.671 seconds
Estimated time to complete 19.826 mins.


Progress 146/268.
Have cost 1415.223 seconds; average cost time 9.693 seconds
Estimated time to complete 19.710 mins.


Progress 147/268.
Have cost 1425.044 seconds; average cost time 9.694 seconds
Estimated time to complete 19.550 mins.


Progress 148/268.
Have cost 1437.046 sec

In [24]:
# Download the profile pages of the 'agriculture' industry.
get_profile('agriculture', comps)

Progress 1/18.
Have cost 6.859 seconds; average cost time 6.859 seconds
Estimated time to complete 1.943 mins.


Progress 2/18.
Have cost 13.701 seconds; average cost time 6.851 seconds
Estimated time to complete 1.827 mins.


Progress 3/18.
Have cost 18.585 seconds; average cost time 6.195 seconds
Estimated time to complete 1.549 mins.


Progress 4/18.
Have cost 32.487 seconds; average cost time 8.122 seconds
Estimated time to complete 1.895 mins.


Progress 5/18.
Have cost 43.300 seconds; average cost time 8.660 seconds
Estimated time to complete 1.876 mins.


Progress 6/18.
Have cost 50.149 seconds; average cost time 8.358 seconds
Estimated time to complete 1.672 mins.


Progress 7/18.
Have cost 61.931 seconds; average cost time 8.847 seconds
Estimated time to complete 1.622 mins.


Progress 8/18.
Have cost 76.892 seconds; average cost time 9.612 seconds
Estimated time to complete 1.602 mins.


Progress 9/18.
Have cost 86.825 seconds; average cost time 9.647 seconds
Estimated time t

# Parse the Pages

The components that interest us:

+ the fullname of the company;
+ the company address;
+ the section & industry the company is located in;
+ the official website;
+ and the profile page in Yahoo Finance.

Here is an example:
```
{"CTVA": {"name": "Corteva, Inc.", "location": ["Chestnut Run Plaza 735", "PO Box 80735", "United States"], "website": "http://www.corteva.com", "section": "Basic Materials", "industry": "Agricultural Inputs", "profile": "https://finance.yahoo.com/quote/CTVA/profile?p=CTVA"}}
```

The program saves each industry as a `jsonl` file; each line of the file holds the components of one company.

In [42]:
def _first_match(patterns, text):
    """Return the first hit in `text` of the first pattern that matches, or None."""
    for pattern in patterns:
        found = pattern.findall(text)
        if found:
            return found[0]
    return None


def parse_page(inds: str) -> list:
    """
    Parse the saved profile pages of one industry and extract the full name,
    location, website, section and industry of each company.

    Every `<symbol>.txt` file in `<parent of cwd>/webpage/<inds>` is parsed and
    the result is also written to `<parent of cwd>/webpage/<inds>.jsonl`, one
    company per line.

    Input:
        inds = industry name (a sub-folder of `webpage`)
    Output:
        list of single-key dictionaries, in file-name order:
        [{company: {name:, location:, website:, section:, industry:, profile:}}, ...]
        A page whose company name cannot be found is reported on stdout and
        recorded as {company: {}}. `website` is an empty list and `section` /
        `industry` are None when the page does not contain them.
    Raises:
        AssertionError if `inds` is not an entry of the `webpage` folder
    """
    # access the dir
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    assert inds in os.listdir(web_path), 'This industry is not supported.'
    inds_path = os.path.join(web_path, inds)

    # patterns are the same for every page, so compile them once
    suffix = '.txt'
    name_p = re.compile(r'(?<=data-reactid="6">).+(?=</h3>)')
    location_p = re.compile(r'(?<=-->)[\d\w\s,]+(?=<!--)')
    website_p = re.compile(r'(?<=target="_blank" title="">).+(?=</a></p>)')
    # section / industry appear under one of two react ids depending on the page
    section_ps = [
        re.compile(r'(?<=data-reactid="21">).+(?=</span><br data-reactid="22"/>)'),
        re.compile(r'(?<=data-reactid="23">).+(?=</span><br data-reactid="24"/>)'),
    ]
    industry_ps = [
        re.compile(r'(?<=data-reactid="25">).+(?=</span><br data-reactid="26"/>)'),
        re.compile(r'(?<=data-reactid="27">).+(?=</span><br data-reactid="28"/>)'),
    ]

    # read file and parse each
    result = [] # return object
    for f in sorted(os.listdir(inds_path)):
        if not f.endswith(suffix): # avoid _DS.Store
            continue
        # The symbol is the file name without its suffix; a `\w+` regex would
        # cut symbols such as 'BRK-B' down to 'B'.
        c = f[:-len(suffix)]
        url = ("https://finance.yahoo.com/quote/{}/profile?p={}".format(c,c)) # profile page
        file_path = os.path.join(inds_path, f)
        with open(file_path, 'r') as webpage:
            html = webpage.read()

        # NOTE(review): no parser is named here, so bs4 picks whichever one is installed.
        soup = BeautifulSoup.BeautifulSoup(html)

        # get the company full name
        name_html = str(soup.find(class_="Fz(m) Mb(10px)"))
        name = _first_match([name_p], name_html)
        if name is None:
            # not a usable profile page: report it and keep an empty record
            print(c, name_html)
            print(url)
            print('\n')
            result.append({c: {}})
            continue

        # get the company address and website address
        address = str(soup.find(class_="D(ib) W(47.727%) Pend(40px)"))
        location = location_p.findall(address)
        website = website_p.findall(address)
        if website:
            website = website[0]

        # get the section and industry along with
        info = str(soup.find(class_="D(ib) Va(t)"))
        section = _first_match(section_ps, info)
        industry = _first_match(industry_ps, info)

        vals = {
            'name': name,
            'location': location,
            'website': website,
            'section': section,
            'industry': industry,
            'profile': url,
        }
        result.append({c: vals})

    # save as jsonl
    file_name = inds + '.jsonl'
    file_path = os.path.join(web_path, file_name)
    with jsonlines.open(file_path, 'w') as f:
        f.write_all(result)
    return result

In [43]:
# Parse the saved 'agriculture' pages; this also writes agriculture.jsonl into the webpage folder.
agr = parse_page('agriculture')

Corteva, Inc.
Marrone Bio Innovations, Inc.
American Vanguard Corporation
ICL Group Ltd
Intrepid Potash, Inc.
Nutrien Ltd.
MGP Ingredients, Inc.
The Scotts Miracle-Gro Company
CF Industries Holdings, Inc.
Yield10 Bioscience, Inc.
E. I. du Pont de Nemours and Company
CVR Partners, LP
China Green Agriculture, Inc.
E. I. du Pont de Nemours and Company
Arcadia Biosciences, Inc.
FMC Corporation
The Mosaic Company
Origin Agritech Limited


In [None]:
# Parse the saved 'travel' pages; this also writes travel.jsonl into the webpage folder.
trv = parse_page('travel')

In [None]:
# Parse the saved 'energy' pages; this also writes energy.jsonl into the webpage folder.
eng = parse_page('energy')

# Footnote

<a name="ft1">[1]</a>: Difference between `os.walk` and `os.listdir`: https://www.cnblogs.com/cloud-ken/p/10017093.html