In [2]:
import time
from urllib.parse import quote

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as Soup
from tqdm import *

from models.filing_text import FilingText
from models.sls_web_httparty import SlsWebHTTParty

        

In [3]:
# Corporate-suffix tokens removed from a company name before it is used as a
# search term. Compared against upper-case tokens (titles are upper-cased by
# the caller before they reach preformat_title).
stops = [
    'PLC',
    'INC',
    'CORP',
    'LLC',
    'LP',
    'COS',
    'LTD',
    'CO',
    'II'
]

# indices = ['cik', 'link', 'summary', 'title', 'updated']

def preformat_title(cn):
    """Turn a company name into a URL-safe search term without suffix words.

    The name is split on whitespace; every token after the first that equals
    one of ``stops`` (optionally with leading/trailing periods, e.g. ``INC.``
    or ``.INC``) is dropped. The remaining tokens are joined with single
    spaces and percent-encoded, so spaces become ``%20`` and characters such
    as ``&`` or ``/`` cannot break the query string the result is embedded in.

    Args:
        cn: Company name; expected to already be upper case, since the stop
            words are matched case-sensitively.

    Returns:
        The percent-encoded name, or ``''`` when ``cn`` has no tokens.
    """
    tokens = cn.split()
    if not tokens:
        return ''
    # The first token is always kept: a stop word only counts as a suffix when
    # something precedes it, and this guarantees a non-empty search term.
    kept = tokens[:1] + [
        token for token in tokens[1:]
        if token.strip('.') not in stops
    ]
    # Filtering whole tokens (instead of deleting ' STOP ' substrings) keeps
    # the neighbouring words separated: "FOO LTD CO" -> "FOO", not "FOOCO".
    return quote(' '.join(kept), safe='')

def get_mw_soup(title):
    """Fetch the MarketWatch symbol-lookup page for ``title`` and parse it.

    ``title`` is interpolated into the lookup URL as-is, so it must already be
    URL-safe. The page is downloaded with ``SlsWebHTTParty().simple_get`` and
    parsed with BeautifulSoup's ``html.parser``.

    Returns:
        The parsed BeautifulSoup document.
    """
    lookup_url = f'https://www.marketwatch.com/tools/quotes/lookup.asp?siteID=mktw&Lookup={title}+&Country=us&Type=All'
    return Soup(SlsWebHTTParty().simple_get(lookup_url), 'html.parser')

def cik_title(val):
    """Extract ``[cik, TITLE]`` from one positional row.

    Reads the CIK from position 0 and the title from position 3 of ``val``
    and returns them as a two-item list with the title upper-cased.
    """
    company_cik, raw_title = val[0], val[3]
    return [company_cik, raw_title.upper()]

def get_mapping(soup, original_title, cn):
    """Collect the (name, symbol) suggestions found in a lookup result page.

    Every third ``<td class="bottomborder">`` cell, starting with the first,
    is read: the ``title`` attribute of its anchor (upper-cased) is taken as
    the suggested company name and the cell text as the symbol.
    NOTE(review): this presumes each result row consists of three such cells
    with the symbol cell first — confirm against the live page markup.

    Args:
        soup: Parsed document exposing ``find_all``.
        original_title: Company name used as the single key of the result.
        cn: Unused; kept so existing callers keep working.

    Returns:
        ``{original_title: [[suggested_name, symbol], ...]}``, or ``{}`` when
        the page contains no matching cells.
    """
    # Query the document once; the cell list does not change while iterating.
    cells = soup.find_all('td', {"class": "bottomborder"})
    ts_mapping = {}
    for cell in cells[::3]:
        mw_title = cell.a['title'].upper()
        symbol = cell.text
        ts_mapping.setdefault(original_title, []).append([mw_title, symbol])
    return ts_mapping



In [4]:
def get_mappings(df):
    """Look up MarketWatch suggestions for every row of ``df``.

    For each row in ``df.values`` the title is extracted with ``cik_title``,
    cleaned with ``preformat_title``, searched with ``get_mw_soup`` and the
    result page is reduced with ``get_mapping``. Progress is shown via tqdm.

    Returns:
        A list with one mapping dict per row, in row order.
    """
    def lookup(row):
        # Only the title is needed here; the CIK is ignored.
        _, original_title = cik_title(row)
        search_term = preformat_title(original_title)
        page = get_mw_soup(search_term)
        return get_mapping(page, original_title, search_term)

    return [lookup(row) for row in tqdm(df.values)]
    

In [5]:
def get_title_2_cik(df):
    """Build a lookup from upper-cased company title to CIK.

    Each row of ``df.values`` is passed through ``cik_title``; when the same
    title appears more than once, the last row wins.

    Returns:
        dict mapping title -> cik.
    """
    title_2_cik = {}
    for val in tqdm(df.values):
        cik, original_title = cik_title(val)
        title_2_cik[original_title] = cik
    # The dict must be returned: callers index the result by company name.
    return title_2_cik

def get_records(mappings, title_2_cik):
    """Turn lookup mappings into company/ticker records.

    A suggestion is accepted when every space-separated word of its
    upper-cased name occurs as a substring of the upper-cased company name.
    Each suggestion is judged independently. When several suggestions for the
    same company are accepted, the last accepted one is kept.

    Args:
        mappings: Iterable of ``{company_name: [[suggested_name, ticker], ...]}``
            dicts; empty dicts are skipped. Only the first key of each dict
            is used.
        title_2_cik: dict mapping upper-cased company name -> CIK.

    Returns:
        List of dicts with keys ``name``, ``ticker_symbol``, ``cik`` and
        ``marketwatch_name``; companies without an accepted suggestion are
        omitted.

    Raises:
        KeyError: if an accepted company name is missing from ``title_2_cik``.
    """
    records = []
    for mapping in mappings:
        if not mapping:
            continue
        first_name, results = next(iter(mapping.items()))
        company_name = first_name.upper()
        record = {}
        for suggested_name, suggested_ticker in results:
            # Evaluate the match per suggestion; a single flag shared across
            # suggestions would let one mismatch veto all later candidates.
            words = suggested_name.upper().split(' ')
            if all(word in company_name for word in words):
                record = {
                    'name': company_name,
                    'ticker_symbol': suggested_ticker,
                    'cik': title_2_cik[company_name],
                    'marketwatch_name': suggested_name,
                }
        if record:
            records.append(record)
    return records

In [6]:
# Load the filings that still lack a ticker symbol.
# NOTE(review): rows are presumably (cik, link, summary, title, updated);
# cik_title reads positions 0 and 3 — confirm against FilingText.
df = FilingText().missing_ticker_symbol_dates()
title_2_cik = get_title_2_cik(df)
mappings = get_mappings(df)

records = get_records(mappings, title_2_cik)

# Merge the newly found tickers into the existing mapping file.
new_ctm_df = pd.DataFrame.from_records(records)
ctm_df = pd.read_csv('company_ticker_mapping.csv')
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
all_ctm = pd.concat([ctm_df, new_ctm_df])
# Existing rows come first, so on a ticker clash the existing row is kept.
deduped_ctm = all_ctm.drop_duplicates(subset=['ticker_symbol'])

new_count = len(deduped_ctm) - len(ctm_df)
print(f'{new_count} new company tickers added to company_ticker_mapping.csv')

deduped_ctm.to_csv('company_ticker_mapping.csv', index=False)


1171