# Get the Company List

Workflow:
1. read the company list `txt` file;
2. get the symbols;
3. put them together <sup>[[1]](#ft1)</sup>

In [10]:
import os
import re

def get_industry_list(data_path: str = None) -> list:
    """
    Collect the industry names available in the data directory.

    Every entry of the directory whose name ends in '.txt' contributes one
    item: its file name with the '.txt' suffix removed.

    Input:
        data_path = directory to scan; when omitted, the 'data' directory
                    located in the parent of the current working directory
    Output:
        list of industry names, in the order returned by os.listdir
    """
    if data_path is None:
        root_path = os.path.abspath(os.path.dirname(os.getcwd()))
        data_path = os.path.join(root_path, 'data')
    suffix = '.txt'
    # Slice the suffix off rather than matching `\w+` in front of it: the
    # regex only kept the last run of word characters, so 'oil-gas.txt'
    # became 'gas' and the matching data file could not be found later.
    # A file named exactly '.txt' has no stem and is skipped.
    return [
        file[:-len(suffix)]
        for file in os.listdir(data_path)
        if file.endswith(suffix) and file != suffix
    ]

def get_company_list(l: list, data_path: str = None) -> dict:
    """
    Read the ticker symbols listed in each industry's txt file.

    A symbol is the run of characters from `A-Z`, `0-9`, `-`, `.` and `&`
    that starts a line and is immediately followed by a tab. Lines that
    do not start that way (e.g. a header row) contribute nothing.

    Input:
        l = industry names; '<name>.txt' is opened for each of them
        data_path = directory holding the txt files; when omitted, the
                    'data' directory located in the parent of the current
                    working directory
    Output:
        dictionary = {industry: [symbol, ...]}
    """
    if data_path is None:
        root_path = os.path.abspath(os.path.dirname(os.getcwd()))
        data_path = os.path.join(root_path, 'data')
    # identify the symbol pattern of the company (compiled once for all files)
    pattern = re.compile(r'^[A-Z0-9.&-]+(?=\t)')
    comp = dict()
    for n in l:
        path = os.path.join(data_path, n + '.txt')
        company_list = []
        # read the txt file line by line and keep the leading symbol, if any
        with open(path, 'r') as f:
            for line in f:
                found = pattern.match(line)
                if found:
                    company_list.append(found.group())
        comp[n] = company_list
    return comp

In [12]:
# Industry names, one per '.txt' file found in the data directory.
industry = get_industry_list()
# Map every industry name to the list of ticker symbols read from its file;
# the cells below index into `comps` by industry name.
comps = get_company_list(industry)
print('data is ready.')

In [17]:
comps['agriculture']

['CTA-PB',
 'CTA-PA',
 'CTVA',
 'NTR',
 'FMC',
 'MOS',
 'SMG',
 'CF',
 'ICL',
 'MGPI',
 'UAN',
 'AVD',
 'IPI',
 'MBII',
 'SEED',
 'CGA',
 'RKDA',
 'YTEN']

In [None]:
comps['energy']

In [None]:
comps['travel']

# Get the Profile Page for Each Company

In [64]:
import re
import time
import requests
import random
        
def get_profile(inds: str, d: dict):
    """
    Download the Yahoo Finance profile page of every company in one industry.

    Each page is saved as '<symbol>.txt' inside '<parent of cwd>/webpage/<inds>'.
    Symbols whose file is already present in that directory are skipped, so an
    interrupted run can simply be started again. Progress and a time estimate
    are printed after every download.

    Input:
        inds = industry name, must be a key of `d`
        d = dictionary {industry: [symbol, ...]}
    Output:
        None
    """
    assert inds in d, 'This industry is not in the list for now.'
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    inds_path = os.path.join(web_path, inds)
    # Create the corresponding industry directory (and 'webpage' itself when
    # missing); an already existing directory is fine, any other failure is
    # raised instead of being silently swallowed.
    os.makedirs(inds_path, exist_ok=True)
    headers = {'User-agent': 'Mozilla/5.0'}
    # List the directory once and keep only the symbols that still need a
    # download; dict.fromkeys drops repeated symbols while keeping their order.
    saved = set(os.listdir(inds_path))
    pending = list(dict.fromkeys(c for c in d[inds] if c + '.txt' not in saved))
    total = len(pending)
    t1 = time.time()
    for count, c in enumerate(pending, start=1):
        file_path = os.path.join(inds_path, c + '.txt')
        url = "https://finance.yahoo.com/quote/{}/profile?p={}".format(c, c)
        num = random.randint(3, 15)
        time.sleep(num) # sleep num's for each request
        # the timeout keeps one stalled connection from blocking the whole run
        webpage = requests.get(url, headers=headers, timeout=30)
        with open(file_path, 'w') as file:
            file.write(webpage.text)
        t2 = time.time()
        print('Progress {c}/{t}.'.format(c = count, t = total))
        print('Have cost {t:.3f} seconds; average cost time {s:.3f} seconds'.format(t = t2 - t1, s = (t2-t1)/count))
        print('Estimated time to complete {t:.3f} mins.'.format(t = (total-count)*(t2-t1)/count/60))
        print('\n')
    return
    
def get_all_profile(d: dict):
    """
    Placeholder that is not implemented yet: `d` is ignored, nothing is
    downloaded and None is returned.
    """
    return None

In [None]:
get_profile('energy', comps)

Progress 1/245.
Have cost 5.006 seconds; average cost time 5.006 seconds
Estimated time to complete 20.357 mins.


Progress 2/245.
Have cost 15.952 seconds; average cost time 7.976 seconds
Estimated time to complete 32.302 mins.


Progress 3/245.
Have cost 25.897 seconds; average cost time 8.632 seconds
Estimated time to complete 34.817 mins.


Progress 4/245.
Have cost 41.889 seconds; average cost time 10.472 seconds
Estimated time to complete 42.064 mins.


Progress 5/245.
Have cost 48.828 seconds; average cost time 9.766 seconds
Estimated time to complete 39.062 mins.


Progress 6/245.
Have cost 59.889 seconds; average cost time 9.982 seconds
Estimated time to complete 39.760 mins.


Progress 7/245.
Have cost 74.839 seconds; average cost time 10.691 seconds
Estimated time to complete 42.409 mins.


Progress 8/245.
Have cost 81.810 seconds; average cost time 10.226 seconds
Estimated time to complete 40.394 mins.


Progress 9/245.
Have cost 94.670 seconds; average cost time 10.519 sec

Progress 71/245.
Have cost 736.255 seconds; average cost time 10.370 seconds
Estimated time to complete 30.072 mins.


Progress 72/245.
Have cost 751.102 seconds; average cost time 10.432 seconds
Estimated time to complete 30.079 mins.


Progress 73/245.
Have cost 766.972 seconds; average cost time 10.506 seconds
Estimated time to complete 30.119 mins.


Progress 74/245.
Have cost 780.692 seconds; average cost time 10.550 seconds
Estimated time to complete 30.067 mins.


Progress 75/245.
Have cost 788.683 seconds; average cost time 10.516 seconds
Estimated time to complete 29.795 mins.


Progress 76/245.
Have cost 798.605 seconds; average cost time 10.508 seconds
Estimated time to complete 29.597 mins.


Progress 77/245.
Have cost 803.427 seconds; average cost time 10.434 seconds
Estimated time to complete 29.216 mins.


Progress 78/245.
Have cost 810.304 seconds; average cost time 10.389 seconds
Estimated time to complete 28.915 mins.


Progress 79/245.
Have cost 826.168 seconds; aver

Progress 140/245.
Have cost 1435.318 seconds; average cost time 10.252 seconds
Estimated time to complete 17.941 mins.


Progress 141/245.
Have cost 1448.182 seconds; average cost time 10.271 seconds
Estimated time to complete 17.803 mins.


Progress 142/245.
Have cost 1454.157 seconds; average cost time 10.241 seconds
Estimated time to complete 17.580 mins.


Progress 143/245.
Have cost 1468.286 seconds; average cost time 10.268 seconds
Estimated time to complete 17.455 mins.


Progress 144/245.
Have cost 1482.321 seconds; average cost time 10.294 seconds
Estimated time to complete 17.328 mins.


Progress 145/245.
Have cost 1491.203 seconds; average cost time 10.284 seconds
Estimated time to complete 17.140 mins.


Progress 146/245.
Have cost 1504.103 seconds; average cost time 10.302 seconds
Estimated time to complete 16.998 mins.


Progress 147/245.
Have cost 1514.970 seconds; average cost time 10.306 seconds
Estimated time to complete 16.833 mins.


Progress 148/245.
Have cost 1530

Progress 208/245.
Have cost 2168.207 seconds; average cost time 10.424 seconds
Estimated time to complete 6.428 mins.


Progress 209/245.
Have cost 2179.951 seconds; average cost time 10.430 seconds
Estimated time to complete 6.258 mins.


Progress 210/245.
Have cost 2195.816 seconds; average cost time 10.456 seconds
Estimated time to complete 6.099 mins.


Progress 211/245.
Have cost 2203.762 seconds; average cost time 10.444 seconds
Estimated time to complete 5.918 mins.


Progress 212/245.
Have cost 2219.594 seconds; average cost time 10.470 seconds
Estimated time to complete 5.758 mins.


Progress 213/245.
Have cost 2223.510 seconds; average cost time 10.439 seconds
Estimated time to complete 5.567 mins.


Progress 214/245.
Have cost 2228.299 seconds; average cost time 10.413 seconds
Estimated time to complete 5.380 mins.


Progress 215/245.
Have cost 2240.105 seconds; average cost time 10.419 seconds
Estimated time to complete 5.210 mins.


Progress 216/245.
Have cost 2247.659 sec

In [None]:
get_profile('travel', comps)

# Parse the Pages

In [None]:
import bs4 as BeautifulSoup 
import re

def _first_match(patterns, text):
    """Return the first hit of the first pattern that matches `text`, or None."""
    for pattern in patterns:
        found = pattern.findall(text)
        if found:
            return found[0]
    return None


def parse_page(i: str) -> dict:
    """
    Parse the saved pages of one industry and extract the full name, location,
    website, section and industry of each company.

    The pages are read from '<parent of cwd>/webpage/<i>'; only files ending
    in '.txt' are parsed and the file name without '.txt' is the company key.

    Input:
        i = industry name
    Output:
        dictionary = {company: {name:, location:, website:, section:,
                                industry:, profile:}}
        - location is the list of all text fragments matched in the address
          element
        - website is a string, or an empty list when no link was matched
        - section / industry are None when neither known pattern matched
        - a company whose name cannot be found is reported on stdout and
          mapped to an empty dictionary
    """
    inds = i
    # access the dir
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    assert inds in os.listdir(web_path), 'This industry is not supported.'
    inds_path = os.path.join(web_path, inds)

    # patterns are the same for every page, so compile them once
    name_p = re.compile(r'(?<=data-reactid="6">).+(?=</h3>)')
    location_p = re.compile(r'(?<=-->)[\d\w\s,]+(?=<!--)')
    website_p = re.compile(r'(?<=target="_blank" title="">).+(?=</a></p>)')
    # two candidate patterns each: the first is tried before the second
    section_ps = (
        re.compile(r'(?<=data-reactid="21">).+(?=</span><br data-reactid="22"/>)'),
        re.compile(r'(?<=data-reactid="23">).+(?=</span><br data-reactid="24"/>)'),
    )
    industry_ps = (
        re.compile(r'(?<=data-reactid="25">).+(?=</span><br data-reactid="26"/>)'),
        re.compile(r'(?<=data-reactid="27">).+(?=</span><br data-reactid="28"/>)'),
    )

    # read file and parse each
    suffix = '.txt'
    result = dict()
    for f in os.listdir(inds_path):
        if not f.endswith(suffix): # avoid _DS.Store
            continue
        # Strip the suffix instead of matching `\w+` in front of it, which
        # reduced symbols such as 'CTA-PB' to 'PB'.
        c = f[:-len(suffix)] # get company name
        if not c:
            continue
        vals = {}
        url = ("https://finance.yahoo.com/quote/{}/profile?p={}".format(c,c)) # profile page
        file_path = os.path.join(inds_path, f)
        with open(file_path, 'r') as webpage:
            soup = webpage.read()

        soup = BeautifulSoup.BeautifulSoup(soup)

        # get the company full name
        name = str(soup.find(class_="Fz(m) Mb(10px)"))
        try:
            name = name_p.findall(name)[0]
        except IndexError:
            # no name on the page: report it and keep an empty record
            print(c, name)
            print(url)
            print('\n')
            result[c] = vals
            continue

        # get the company address and website address
        address = str(soup.find(class_="D(ib) W(47.727%) Pend(40px)"))
        location = location_p.findall(address)
        website = website_p.findall(address)
        if website:
            website = website[0]

        # get the section and industry along with
        info = str(soup.find(class_="D(ib) Va(t)"))
        section = _first_match(section_ps, info)
        industry = _first_match(industry_ps, info)

        vals['name'] = name
        vals['location'] = location
        vals['website'] = website
        vals['section'] = section
        vals['industry'] = industry
        vals['profile'] = url
        result[c] = vals
    return result

# Footnote

<a name="ft1">[1]</a>: Difference between `os.walk` and `os.listdir`: https://www.cnblogs.com/cloud-ken/p/10017093.html