# Set Up

In [None]:
import os
import re
import bs4 as BeautifulSoup 
import time
import requests
import random
import jsonlines

# Get the Company List

Workflow:
1. read each industry's company-list `txt` file;
2. extract the ticker symbols;
3. collect them in one dictionary keyed by industry <sup>[[1]](#ft1)</sup>

In [None]:
def get_industry_list() -> list:
    """
    list the industries that have a company-list file
    Looks in the `data` folder that sits in the parent of the current working
    directory and treats every `*.txt` file there as one industry.
    Output:
        the file names without the `.txt` suffix, sorted alphabetically
    """
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = os.path.join(root_path, 'data')
    suffix = '.txt'
    # Strip the suffix by slicing so the whole stem is kept. The previous
    # `\w+(?=.txt)` regex silently dropped everything before a '-' or '.'
    # (e.g. 'oil-gas.txt' became 'gas'), which no longer named a real file.
    return sorted(
        file[:-len(suffix)]
        for file in os.listdir(data_path)
        if file.endswith(suffix)
    )

def get_company_list(l: list) -> dict:
    """
    read the ticker symbols listed for each industry
    Input:
        l = industry names; each one needs a `<name>.txt` file in the `data`
            folder that sits in the parent of the current working directory
    Output:
        dictionary = {industry: [symbol, ...]}, symbols kept in file order
    """
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = os.path.join(root_path, 'data')
    # A symbol is the run of capitals, digits, '-', '.' or '&' at the very
    # start of a line that is immediately followed by a tab. The pattern does
    # not depend on the industry, so it is compiled once, outside the loops.
    pattern = re.compile(r'^[A-Z0-9.&-]+(?=\t)')
    comp = dict()
    for n in l:
        path = os.path.join(data_path, n + '.txt')
        company_list = []
        with open(path, 'r') as f:
            for line in f:
                # the pattern is anchored, so a line yields at most one symbol
                found = pattern.match(line)
                if found:
                    company_list.append(found[0])
        comp[n] = company_list
    return comp

In [None]:
# Collect the industry names, then read the symbols listed for each of them;
# `comps` is the dictionary handed to get_profile in the cells below.
industry = get_industry_list()
comps = get_company_list(industry)
print('data is ready.')

# Get the Profile Page for Each Company

Crawl the profile pages from Yahoo Finance; each industry gets its own folder, in which the pages of its companies are stored.

Time cost: 10.019 seconds for each company

In [None]:
def get_profile(inds: str, d: dict):
    """
    get profile for one industry
    Every page is stored as `webpage/<inds>/<symbol>.txt` under the parent of
    the current working directory. Symbols whose file already exists are
    skipped, so an interrupted run can simply be started again.
    Input:
        inds = industry name, must be a key of `d`
        d = {industry: [symbol, ...]}
    Raises:
        AssertionError if `inds` is not a key of `d`
    """
    if inds not in d:
        # Raised explicitly so the check is not stripped by `python -O`; the
        # exception type is the one the former `assert` produced.
        raise AssertionError('This industry is not in the list for now.')
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    inds_path = os.path.join(web_path, inds)
    # Creates the industry directory and, if needed, `webpage` itself; an
    # already existing directory is not an error.
    os.makedirs(inds_path, exist_ok=True)
    headers = {'User-agent': 'Mozilla/5.0'}

    # Work out what is really left to download, listing the folder only once.
    existing = set(os.listdir(inds_path))
    pending = []
    for c in d[inds]:
        file_name = c + '.txt'
        if file_name not in existing:
            existing.add(file_name)  # a symbol listed twice is fetched once
            pending.append(c)
    total = len(pending)

    t1 = time.time()
    for count, c in enumerate(pending, start=1):
        file_path = os.path.join(inds_path, c + '.txt')
        url = "https://finance.yahoo.com/quote/{s}/profile?p={s}".format(s=c)
        time.sleep(random.randint(3, 15))  # random pause before each request
        try:
            webpage = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException as err:
            # keep going; the missing file makes the next run retry this symbol
            print('Request for {c} failed: {e}'.format(c=c, e=err))
        else:
            if webpage.ok:
                with open(file_path, 'w') as file:
                    file.write(webpage.text)
            else:
                # do not store an error page as if it were the profile
                print('Skipped {c}: HTTP status {s}.'.format(c=c, s=webpage.status_code))
        t2 = time.time()
        print('Progress {c}/{t}.'.format(c=count, t=total))
        print('Have cost {t:.3f} seconds; average cost time {s:.3f} seconds'.format(t=t2 - t1, s=(t2 - t1) / count))
        print('Estimated time to complete {t:.3f} mins.'.format(t=(total - count) * (t2 - t1) / count / 60))
        print('\n')
    
def get_all_profile(d: dict):
    """
    get profiles for every industry in the dictionary, one industry at a time
    Input:
        d = {industry: [symbol, ...]}
    """
    done_msg = '\n\n\n {i} is done'
    for inds in d:
        get_profile(inds, d)
        print(done_msg.format(i=inds))

    print('\n\n\nall done.')

In [None]:
# Fetch the profile pages of the 'travel' symbols that are not on disk yet.
get_profile('travel', comps)

In [None]:
# Fetch the profile pages of the 'agriculture' symbols that are not on disk yet.
get_profile('agriculture', comps)

In [None]:
# Fetch the profile pages of the 'energy' symbols that are not on disk yet.
get_profile('energy', comps)

# Parse the Pages

The components that interest us:

+ the full name of the company;
+ the company address;
+ the section (sector) and industry the company belongs to;
+ the official website;
+ and the profile page on Yahoo Finance.

Here is an example:
```
{"CTVA": {
"name": "Corteva, Inc.", 
"location": ["Chestnut Run Plaza 735", "PO Box 80735", "United States"], 
"website": "http://www.corteva.com", 
"section": "Basic Materials", 
"industry": "Agricultural Inputs", 
"profile": "https://finance.yahoo.com/quote/CTVA/profile?p=CTVA"}
}
```

The program saves each industry as a `jsonl` file; each line of that file holds the components of one company.

In [None]:
def _first_match(patterns, text):
    """
    return the first hit of the first pattern that matches `text`, or None when no pattern matches
    """
    for p in patterns:
        hits = p.findall(text)
        if hits:
            return hits[0]
    return None

def parse_page(inds: str) -> list:
    """
    parse one industry pages and extract the full name, location, website, section, industry for each company
    The result is also written to `webpage/<inds>.jsonl`, one company per line.
    Input:
        inds = industry name, must be a folder inside `webpage`
    Output:
        list = [{company: {name:, location:, website:, section:, industry:, profile:}}, ...]
        - a company whose page holds no name gets an empty dictionary
        - website is an empty list when the page holds none
        - section / industry are None when the page holds none
    Raises:
        AssertionError if `webpage` has no entry named `inds`
    """
    # access the dir
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    if inds not in os.listdir(web_path):
        # Raised explicitly so the check is not stripped by `python -O`; the
        # exception type is the one the former `assert` produced.
        raise AssertionError('This industry is not supported.')
    inds_path = os.path.join(web_path, inds)

    # The patterns are the same for every page, so compile them once.
    name_p = re.compile(r'(?<=data-reactid="6">).+(?=</h3>)')
    location_p = re.compile(r'(?<=-->)[\d\w\s,]+(?=<!--)')
    website_p = re.compile(r'(?<=target="_blank" title="">).+(?=</a></p>)')
    # section / industry show up under one of two react ids; try them in order
    section_ps = (
        re.compile(r'(?<=data-reactid="21">).+(?=</span><br data-reactid="22"/>)'),
        re.compile(r'(?<=data-reactid="23">).+(?=</span><br data-reactid="24"/>)'),
    )
    industry_ps = (
        re.compile(r'(?<=data-reactid="25">).+(?=</span><br data-reactid="26"/>)'),
        re.compile(r'(?<=data-reactid="27">).+(?=</span><br data-reactid="28"/>)'),
    )

    # read file and parse each
    result = [] # return object
    count = 0 # pages from which no company name could be extracted
    total = 0 # pages examined
    for f in sorted(os.listdir(inds_path)): # sorted: same output order on every run
        if not f.endswith('.txt'): # avoid _DS.Store
            continue
        total += 1 # only real pages count towards the total
        vals = {}
        c = f[:-len('.txt')] # get company name
        url = ("https://finance.yahoo.com/quote/{}/profile?p={}".format(c,c)) # profile page
        file_path = os.path.join(inds_path, f)
        with open(file_path, 'r') as webpage:
            html = webpage.read()

        # NOTE(review): no parser is named here, so bs4 picks one from what is
        # installed; consider pinning it to make the output reproducible.
        soup = BeautifulSoup.BeautifulSoup(html)

        # get the company full name
        name_html = str(soup.find(class_="Fz(m) Mb(10px)"))
        name = _first_match((name_p,), name_html)
        if name is None:
            # the page informs nothing: report it and keep an empty record
            count += 1
            print(c, name_html)
            print(url)
            print('\n')
            result.append({c:vals})
            continue

        # get the company address and website address
        address = str(soup.find(class_="D(ib) W(47.727%) Pend(40px)"))
        location = location_p.findall(address)
        website = website_p.findall(address)
        if website:
            website = website[0]

        # get the section and industry along with
        info = str(soup.find(class_="D(ib) Va(t)"))
        section = _first_match(section_ps, info)
        industry = _first_match(industry_ps, info)

        vals['name'] = name
        vals['location'] = location
        vals['website'] = website
        vals['section'] = section
        vals['industry'] = industry
        vals['profile'] = url
        result.append({c:vals})

    # save as jsonl
    file_name = inds + '.jsonl'
    file_path = os.path.join(web_path, file_name)
    with jsonlines.open(file_path, 'w') as f:
        f.write_all(result)
    # an empty folder must not end in a division by zero
    ratio = count/total if total else 0
    print("{c}/{t}({p:.2%}) companies' profile pages inform nothing in {i}.".format(c=count,
                                                                                t=total,
                                                                                p=ratio,
                                                                                i=inds))
    return result

In [75]:
# Parse the saved 'agriculture' pages; parse_page also writes agriculture.jsonl.
agr = parse_page('agriculture')

0/18(0.00%) companies' profile pages inform nothing in agriculture.


In [76]:
# Parse the saved 'travel' pages; parse_page also writes travel.jsonl.
trv = parse_page('travel')

E3X1.MU None
https://finance.yahoo.com/quote/E3X1.MU/profile?p=E3X1.MU


RC8.MU None
https://finance.yahoo.com/quote/RC8.MU/profile?p=RC8.MU


T6A.SG None
https://finance.yahoo.com/quote/T6A.SG/profile?p=T6A.SG


0TUA.MU None
https://finance.yahoo.com/quote/0TUA.MU/profile?p=0TUA.MU


T6A.HM None
https://finance.yahoo.com/quote/T6A.HM/profile?p=T6A.HM


26Y.F None
https://finance.yahoo.com/quote/26Y.F/profile?p=26Y.F


LAG.BE None
https://finance.yahoo.com/quote/LAG.BE/profile?p=LAG.BE


1NC.HA None
https://finance.yahoo.com/quote/1NC.HA/profile?p=1NC.HA


CVC1.SG None
https://finance.yahoo.com/quote/CVC1.SG/profile?p=CVC1.SG


1NC.DU None
https://finance.yahoo.com/quote/1NC.DU/profile?p=1NC.DU


0HB2.IL None
https://finance.yahoo.com/quote/0HB2.IL/profile?p=0HB2.IL


CVC1.HM None
https://finance.yahoo.com/quote/CVC1.HM/profile?p=CVC1.HM


09B.DU None
https://finance.yahoo.com/quote/09B.DU/profile?p=09B.DU


CRIP34.SA None
https://finance.yahoo.com/quote/CRIP34.SA/profile?p=CRIP34.SA



In [77]:
# Parse the saved 'energy' pages; parse_page also writes energy.jsonl.
eng = parse_page('energy')

ALIN-PE None
https://finance.yahoo.com/quote/ALIN-PE/profile?p=ALIN-PE


ALIN-PA None
https://finance.yahoo.com/quote/ALIN-PA/profile?p=ALIN-PA


HMLP-PA None
https://finance.yahoo.com/quote/HMLP-PA/profile?p=HMLP-PA


ALIN-PB None
https://finance.yahoo.com/quote/ALIN-PB/profile?p=ALIN-PB


GMLPP None
https://finance.yahoo.com/quote/GMLPP/profile?p=GMLPP


TGP-PB None
https://finance.yahoo.com/quote/TGP-PB/profile?p=TGP-PB


NS-PA None
https://finance.yahoo.com/quote/NS-PA/profile?p=NS-PA


NS-PC None
https://finance.yahoo.com/quote/NS-PC/profile?p=NS-PC


TGP-PA None
https://finance.yahoo.com/quote/TGP-PA/profile?p=TGP-PA


GLP-PA None
https://finance.yahoo.com/quote/GLP-PA/profile?p=GLP-PA


DCP-PB None
https://finance.yahoo.com/quote/DCP-PB/profile?p=DCP-PB


DCP-PC None
https://finance.yahoo.com/quote/DCP-PC/profile?p=DCP-PC


EP-PC None
https://finance.yahoo.com/quote/EP-PC/profile?p=EP-PC


DLNG-PA None
https://finance.yahoo.com/quote/DLNG-PA/profile?p=DLNG-PA


14/255(5.49%) com

# Footnote

<a name="ft1">[1]</a>: Difference between `os.walk` and `os.listdir`: https://www.cnblogs.com/cloud-ken/p/10017093.html