# Get the Company List

Work flow:
1. read the company list `txt` file;
2. get the symbols;
3. put them together <sup>[[1]](#ft1)</sup>

In [1]:
import os
import re

def get_industry_list() -> list:
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = os.path.join(root_path, 'data')
    txt_pattern = re.compile('\w+(?=.txt)')
    names = []
    for file in os.listdir(data_path):
        if file.endswith('.txt'):
            file = re.search(txt_pattern, file)[0]
            names.append(file)
    return names

def get_company_list(l: list) -> dict:
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    data_path = os.path.join(root_path, 'data')
    comp = dict()
    for n in l:
        file_name = n + '.txt' 
        path = os.path.join(data_path, file_name)
        # read the txt file
        with open(path, 'r') as f:
            f = f.readlines()
        # identify the symbol pattern of the company
        pattern = re.compile('(?<=^)[A-Z0-9-.&]+(?=\t)')

        company_list = []
        # get the company list
        for line in f:
            company_list.extend(re.findall(pattern, line)) 
        comp[n] = company_list
    return comp

In [2]:
industry = get_industry_list()
comps = get_company_list(industry)
print('data is ready.')

data is ready.


In [3]:
comps['agriculture']

['CTA-PB',
 'CTA-PA',
 'CTVA',
 'NTR',
 'FMC',
 'MOS',
 'SMG',
 'CF',
 'ICL',
 'MGPI',
 'UAN',
 'AVD',
 'IPI',
 'MBII',
 'SEED',
 'CGA',
 'RKDA',
 'YTEN']

In [4]:
comps['energy']

['XOM',
 'CVX',
 'RDS-A',
 'RDS-B',
 'PTR',
 'TOT',
 'BP',
 'SNP',
 'ENB',
 'COP',
 'EQNR',
 'PBR-A',
 'PBR',
 'EPD',
 'TRP',
 'E',
 'EOG',
 'KMI',
 'SLB',
 'CNQ',
 'MPC',
 'PSX',
 'PXD',
 'SU',
 'WMB',
 'VLO',
 'MPLX',
 'EC',
 'OXY',
 'OKE',
 'ET',
 'BKR',
 'HES',
 'HAL',
 'PBA',
 'CVE',
 'DVN',
 'TS',
 'FANG',
 'TPL',
 'SSL',
 'MMP',
 'CLR',
 'MRO',
 'WES',
 'PSXP',
 'TRGP',
 'CSAN',
 'APA',
 'PAA',
 'CCJ',
 'COG',
 'XEC',
 'SHLX',
 'OVV',
 'HFC',
 'SHI',
 'VVV',
 'NOV',
 'EQT',
 'DCP',
 'NFG',
 'CHK',
 'AM',
 'UGP',
 'CHX',
 'ETRN',
 'PDCE',
 'SUN',
 'FTI',
 'ENBL',
 'CNX',
 'NS-PB',
 'MTDR',
 'VNOM',
 'HP',
 'MGY',
 'SWN',
 'ENLC',
 'DEN',
 'MUR',
 'AR',
 'REGI',
 'RRC',
 'CEQP',
 'WHD',
 'HEP',
 'INT',
 'BSM',
 'CLNE',
 'NS',
 'CVI',
 'RIG',
 'CPG',
 'CRC',
 'SM',
 'LBRT',
 'PAGP',
 'DKL',
 'RTLR',
 'EURN',
 'DK',
 'PBF',
 'USAC',
 'YPF',
 'FRO',
 'OAS',
 'AROC',
 'BPMP',
 'GLOG-PA',
 'WLL',
 'CPE',
 'TGP',
 'NBLX',
 'GLNG',
 'CLB',
 'PTEN',
 'GEL',
 'ERF',
 'CRK',
 'STNG',
 'KOS'

In [5]:
comps['travel']

['BKNG.MX',
 'SONA.JK',
 '032350.KS',
 '039130.KS',
 'TRIP.BA',
 '601888.SS',
 'BKNG34.SA',
 'EXPE.MX',
 'TUI.L',
 'RCL.MX',
 'BAYU.JK',
 'JET2.L',
 'TRN.L',
 'PDES.JK',
 'PANR.JK',
 'C1CL34.SA',
 'NCLHN.MX',
 'CRIP34.SA',
 'EXGR34.SA',
 'TRIP.MX',
 'R1CL34.SA',
 'BKNG',
 'DESP.BA',
 'PCE1.F',
 'BOOK.VI',
 'OTB.L',
 'N1CL34.SA',
 '6191.T',
 '0780.HK',
 '6030.T',
 'YELO.JK',
 'T1RI34.SA',
 'CCL',
 '9726.T',
 'CVC1.F',
 'EXPE',
 'TCOM',
 'E3X1.F',
 'EXPE.VI',
 'RCL',
 '7048.T',
 'EASEMYTRIP.NS',
 'CLV.F',
 'THOMASCOOK.NS',
 'RC8.F',
 'PGJO.JK',
 '1992.HK',
 'HSW.L',
 'THOMASCOOK.BO',
 '9085.S',
 'NCLH',
 '6561.T',
 '000796.SZ',
 '1NC.F',
 'NCLH.VI',
 'TENG.L',
 'TRIP',
 'TUIFY',
 '2731.TW',
 'TUIFF',
 'TRIP.VI',
 'T6A.DE',
 'T6A.F',
 'TRIP.MI',
 'TNL',
 '002707.SZ',
 'TUI1.DE',
 'TUI1.F',
 'TUI2.SG',
 '6548.T',
 'TUI2.F',
 '300859.SZ',
 'CVCB3.SA',
 'TEM.DU',
 '1810.SR',
 'TEM.F',
 'WD5A.F',
 'FLT.AX',
 '000888.SZ',
 'LAG.F',
 'NTHOL.IS',
 '6577.T',
 'MMB.PA',
 'DRTGF',
 'DG1.F',
 'MMYT'

# Get the Profile Page for Each Company

In [8]:
import re
import time
import requests
import random
        
def get_profile(inds: str, d: dict):
    """
    get profile for one industry
    """
    assert inds in d.keys(), 'This industry is not in the list for now.'
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    inds_path = os.path.join(web_path, inds)
    try:
        # REF: https://www.geeksforgeeks.org/create-a-directory-in-python/
        os.mkdir(inds_path) # create the corresponding industry direction
    except:
        pass
    headers = {'User-agent': 'Mozilla/5.0'}
    count = 0
    total = len(d[inds]) - len(os.listdir(inds_path))
    t1 = time.time()
    for c in d[inds]:
        file_name = c + '.txt'
        if file_name in os.listdir(inds_path):
            continue
        else:  
            count += 1
            file_path = os.path.join(inds_path, file_name)
            url = ("https://finance.yahoo.com/quote/{}/profile?p={}".format(s,s))
            num = random.randint(3, 15)
            time.sleep(num) # sleep num's for each request
            webpage = requests.get(url, headers=headers)
            with open(file_path, 'w') as file:
                file.write(webpage.text)
        t2 = time.time()
        print('Progress {c}/{t}.'.format(c = count, t = total))
        print('Have cost {t:.3f} seconds; average cost time {s:.3f} seconds'.format(t = t2 - t1, s = (t2-t1)/count))
        print('Estimated time to complete {t:.3f} mins.'.format(t = (total-count)*(t2-t1)/count/60))
        print('\n')
    return
    
def get_all_profile(d: dict):
    for k in d.keys():
        get_profile(k, d)
        print('\n\n\n {i} is done'.formati = k)
        
    print('\n\n\nall done.') 
        
        

SyntaxError: invalid syntax (<ipython-input-8-41cbefe04b6d>, line 46)

In [7]:
get_all_profile(comps)

NameError: name 'get_all_profile' is not defined

# Parse the Pages

In [None]:
import bs4 as BeautifulSoup 
import re

def parse_page(i: str) -> dict:
    """
    parse one industry pages and extract the full name, location, website, section, industry for each company
    Input:
        i = industry name
    Output:
        dictionary = {company:{name:, location:, webstion:, section:, industry:,}}
    """
    # access the dir
    root_path = os.path.abspath(os.path.dirname(os.getcwd()))
    web_path = os.path.join(root_path, 'webpage')
    assert inds in os.listdir(web_path), 'This industry is not supported.'
    inds_path = os.path.join(web_path, inds)
    
    # read file and parse each
    c_pattern = re.compile('\w+(?=.txt)')
    result = dict()
    for f in os.listdir(inds_path):
        if not f.endswith('.txt'): # avoid _DS.Store
            continue
        vals = {}
        c = re.search(c_pattern, f)[0] # get company name
        url = ("https://finance.yahoo.com/quote/{}/profile?p={}".format(c,c)) # profile page
        file_path = os.path.join(inds_path, f)
        with open(file_path, 'r') as webpage:
            soup = webpage.read()
            
        soup = BeautifulSoup.BeautifulSoup(soup)
        
        # get the company full name
        name = str(soup.find(class_="Fz(m) Mb(10px)"))
        name_p = re.compile('(?<=data-reactid="6">).+(?=</h3>)')
        try:
            name = re.findall(name_p, name)[0]
        except:
            print(c, name)
            print(url)
            print('\n')
            result[c] = vals
            continue

        # get the company address and website address
        address = str(soup.find(class_="D(ib) W(47.727%) Pend(40px)"))
        location_p = re.compile('(?<=-->)[\d\w\s,]+(?=<!--)')
        location = re.findall(location_p, address)
        website_p = re.compile('(?<=target="_blank" title="">).+(?=</a></p>)')
        website = re.findall(website_p, address)
        if website:
            website = website[0]
        
        # get the section and industry along with
        info = str(soup.find(class_="D(ib) Va(t)"))
        section_p1 = re.compile('(?<=data-reactid="21">).+(?=</span><br data-reactid="22"/>)')
        section_p2 = re.compile('(?<=data-reactid="23">).+(?=</span><br data-reactid="24"/>)')
        if re.findall(section_p1, info):
            section = re.findall(section_p1, info)[0]
        else:
            section = re.findall(section_p2, info)[0]
        industry_p1 = re.compile('(?<=data-reactid="25">).+(?=</span><br data-reactid="26"/>)')
        industry_p2 = re.compile('(?<=data-reactid="27">).+(?=</span><br data-reactid="28"/>)')
        if re.findall(industry_p1, info):
            industry = re.findall(industry_p1, info)[0]
        else:
            industry = re.findall(industry_p2, info)[0]

        vals['name'] = name
        vals['location'] = location
        vals['website'] = website
        vals['section'] = section
        vals['industry'] = industry
        vals['profile'] = url
        result[c] = vals
    return result

# Footnote

<a name="ft1">[1]</a>: Difference between `os.wal` and `os.listdir` https://www.cnblogs.com/cloud-ken/p/10017093.html