# Market Data

## Yahoo Finance Scrape

In [1]:
# Import dependencies
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import pymongo
import json

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

## Connect to MongoDB

In [10]:
# Use pymongo to set up mongo connection
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(conn)
db = client.otc_market_finance_data

# Link to the master DB and then look within the mongoDB COLLECTION
most_actives = db.otc_market_most_active_stocks.find({})

# Create empty list to store all symbols stored in the mongoDB
companies = []

# Appened empty list with symbols pulled from the mongoDB
for most_active in most_actives:
    companies.append(most_active['SYMBOL'])

## Scrape Yahoo Page

In [20]:
# Use this line to limit the number of companies to scrape
user_input = False
while user_input == False:
    number_of_companies_to_run = input(f"Choose the number of companies to extract data (max: {len(companies)}): ")
    try:
        if int(number_of_companies_to_run) <= 984:
            symbols = companies[:int(number_of_companies_to_run)]
            user_input = True
        else:
            symbols = companies[:len(companies)]
            user_input = True
    except ValueError:
        print("Input a integer number please.")

Choose the number of companies to extract data (max: 984): e
Input a integer number please.
Choose the number of companies to extract data (max: 984): 1000


In [33]:
# Create emtpy lists for each key metric datapoint that is being scrapped below
previous_close_lst = []
avg_volume_lst = []
market_cap_lst = []
beta_lst = []
pe_ratio_lst = []
price_estimate_lst = []
latest_news_link_lst = []
latest_news_head_lst = []
sector_lst = []
industry_lst = []
employees_lst = []
companies_counter = 1



# With browser open to the Yahoo Finance webpage loop through all ticker symbols and extract key data
for symbol in symbols:
    print(f"Scraping company number {companies_counter}: {symbol}.")
    
    # Using Selenium to open browser connection to Yahoo Finance Page
    chromeOptions = webdriver.ChromeOptions()
    # Set options to don't open images and store history cache
    prefs={"profile.managed_default_content_settings.images": 2, 'disk-cache-size': 4096 }
    chromeOptions.add_experimental_option('prefs', prefs)
    driver = webdriver.Chrome(options=chromeOptions)
    url = 'https://finance.yahoo.com/quote/' + symbol
    driver.get(url)
    soup = BeautifulSoup(driver.page_source,'lxml')
    
# Wait for page to load
    #wait = WebDriverWait(driver, 10)
    
# Scrape the webpage and collect the key metrics
    string = 'data-test'
    
# Try to extract last Close Price, or return an N/A
    try:
        previous_close = soup.find(attrs={'data-test': 'PREV_CLOSE-value'}).span.text
        previous_close = previous_close.replace(',','') 
        previous_close = float(previous_close)
    except AttributeError:
        previous_close = 'N/A'
        
# Try to extract average 3 month trading volume and remove commas, or return an N/A
    try:
        avg_volume = soup.find(attrs={'data-test': 'AVERAGE_VOLUME_3MONTH-value'}).span.text
        if avg_volume != 'N/A':
            avg_volume = float(avg_volume.replace(',',''))
    except AttributeError:
        avg_volume = 'N/A'
        
# Try to extract market cap of the company, remove commas,
# remove letters from million/billion dollar market caps,
# then multiply by million or billion to store correct market caps or store an N/A
    try:
        market_cap = soup.find(attrs={'data-test': 'MARKET_CAP-value'}).span.text
        market_cap = market_cap.replace(',','')
        if market_cap == 'N/A':
            market_cap = 'N/A'
        elif market_cap[-1] == 'M':
            market_cap = float(market_cap[:-1]) * 1000000
        elif market_cap[-1] == 'B':
            market_cap = float(market_cap[:-1]) * 1000000000
        else:
            market_cap = float(market_cap)
    except AttributeError:
        market_cap = 'N/A'
# Try to extract Beta as float or return an N/A
    try:
        beta = soup.find(attrs={'data-test': 'BETA_3Y-value'}).span.text
        beta = beta.replace(',','')
        if beta != 'N/A':
            beta = float(beta)
    except AttributeError:
        beta = 'N/A'
# Try to extract Price to Earnings ratio as float or return an N/A
    try:
        pe_ratio = soup.find(attrs={'data-test': 'PE_RATIO-value'}).span.text
        pe_ratio = pe_ratio.replace(',','')
        if pe_ratio == 'N/A':
            pe_ratio = 'N/A'
        else:
            pe_ratio = float(pe_ratio)
    except AttributeError:
        pe_ratio = 'N/A'
# Try to extract 1 year Price estimate as float or return an N/A
    try:
        price_estimate = soup.find(attrs={'data-test': 'ONE_YEAR_TARGET_PRICE-value'}).span.text
        price_estimate = price_estimate.replace(',','')
        if price_estimate == 'N/A':
            price_estimate = 'N/A'
        else:
            price_estimate = float(price_estimate)
    except AttributeError:
        price_estimate = 'N/A'
# Try to extract top news link and header of the article or return an N/A
    try:
        latest_news_link = 'https://finance.yahoo.com' + soup.find('li', class_='js-stream-content').a['href']
    except AttributeError:
        latest_news_link = 'N/A'
    try:
        latest_news_head = soup.find('li', class_='js-stream-content').a.text
    except AttributeError:
        latest_news_head = 'N/A'      
        
    # Test out printing all extracted data   
    # print(previous_close)
    # print(avg_volume)
    # print(market_cap)
    # print(beta)
    # print(pe_ratio)
    # print(price_estimate)
    # print(latest_news_link)
    # print(latest_news_head)
    
# Open Yahoo Profile page to scrape additional data
    
    url = url + '/profile?p=' + symbol
    driver.get(url)
    soup = BeautifulSoup(driver.page_source,'lxml')
    
# Wait for page to load
    #wait = WebDriverWait(driver, 10)
    
# As part of the try loop above, try to find industry, sector, employee count, if nothing found, return N/As
# Need to split each string up and separate on a : then grab second index to return actual industry/sector/employees

    try:
        string = soup.find('p', class_='D(ib) Va(t)').text
        
        sector_pos = string.find('Industry')
        sector = string[:sector_pos]
        if (sector.replace('\xa0', '').split(':')[1]) == '':
            sector = 'N/A'
        else:
            sector = (sector.replace('\xa0', '').split(':')[1])
        #print(sector)

        industry_pos = string.find('Full Time')
        industry = string[sector_pos:industry_pos]
        if (industry.replace('\xa0', '').split(':')[1]) == '':
            industry = 'N/A'
        else:
            industry = (industry.replace('\xa0', '').split(':')[1])
        #print(industry)

        employees = string[industry_pos:]
        if (employees.replace('\xa0', '').split(':')[1]) == '':
            employee_count = 'N/A'
        else:
            employees = employees.replace(',', '').split(':')[1]
            employee_count = int(employees.replace('\xa0', ''))
        #print(employees.replace('\xa0', '').split(':'))
    except AttributeError:
        sector = 'N/A'
        industry = 'N/A'
        employee_count = 'N/A'

    # Close the browser window
    driver.quit()
    
# Appened all extracted values into the empty lists
    previous_close_lst.append(previous_close)
    avg_volume_lst.append(avg_volume)
    market_cap_lst.append(market_cap)
    beta_lst.append(beta)
    pe_ratio_lst.append(pe_ratio)
    price_estimate_lst.append(price_estimate)
    latest_news_link_lst.append(latest_news_link)
    latest_news_head_lst.append(latest_news_head)
    sector_lst.append(sector)
    industry_lst.append(industry)
    employees_lst.append(employee_count)
    
    companies_counter += 1

Scraping company number 1: RHHBY.
Scraping company number 2: GBTC.
Scraping company number 3: FMCKJ.
Scraping company number 4: FNMAS.
Scraping company number 5: CURLF.
Scraping company number 6: DTEGY.
Scraping company number 7: RHHVF.
Scraping company number 8: OGRMF.
Scraping company number 9: CVSI.
Scraping company number 10: FERGY.
Scraping company number 11: JBSAY.
Scraping company number 12: ADDYY.
Scraping company number 13: BASFY.
Scraping company number 14: CHKVP.
Scraping company number 15: CRLBF.
Scraping company number 16: BNPQY.
Scraping company number 17: CWBHF.
Scraping company number 18: ACRGF.
Scraping company number 19: DANOY.
Scraping company number 20: REPYY.
Scraping company number 21: ADDDF.
Scraping company number 22: KSHB.
Scraping company number 23: AVMXY.
Scraping company number 24: KBLB.
Scraping company number 25: IMBBY.
Scraping company number 26: VGWCF.
Scraping company number 27: HOFD.
Scraping company number 28: AXAHY.
Scraping company number 29: EXPGY.

Scraping company number 235: LBUY.
Scraping company number 236: FMCCH.
Scraping company number 237: FFRMF.
Scraping company number 238: TOFB.
Scraping company number 239: VFRM.
Scraping company number 240: PGHEF.
Scraping company number 241: ETST.
Scraping company number 242: RNKLF.
Scraping company number 243: NTRB.
Scraping company number 244: ENDV.
Scraping company number 245: MPMQ.
Scraping company number 246: FSDDF.
Scraping company number 247: CNBX.
Scraping company number 248: BTCY.
Scraping company number 249: TPCS.
Scraping company number 250: FRLF.
Scraping company number 251: REVB.
Scraping company number 252: FCUUF.
Scraping company number 253: STLY.
Scraping company number 254: FTSSF.
Scraping company number 255: SFOR.
Scraping company number 256: SEDO.
Scraping company number 257: WIZP.
Scraping company number 258: IGXT.
Scraping company number 259: FMCKO.
Scraping company number 260: BIVI.
Scraping company number 261: OTCM.
Scraping company number 262: JMSB.
Scraping com

Scraping company number 467: BBSRF.
Scraping company number 468: REPCF.
Scraping company number 469: BSTG.
Scraping company number 470: VABK.
Scraping company number 471: SVBL.
Scraping company number 472: WMLLF.
Scraping company number 473: CTEQF.
Scraping company number 474: CFST.
Scraping company number 475: NHEL.
Scraping company number 476: WINRW.
Scraping company number 477: ECSIF.
Scraping company number 478: AXNVF.
Scraping company number 479: INFT.
Scraping company number 480: FMCB.
Scraping company number 481: LEMIF.
Scraping company number 482: HYHDF.
Scraping company number 483: EXMGF.
Scraping company number 484: YTROF.
Scraping company number 485: ICNAF.
Scraping company number 486: PMULF.
Scraping company number 487: BLGO.
Scraping company number 488: ANXGF.
Scraping company number 489: ACNNF.
Scraping company number 490: CRRVF.
Scraping company number 491: AERO.
Scraping company number 492: HCBC.
Scraping company number 493: NTSFF.
Scraping company number 494: RPMT.
Scr

Scraping company number 698: RRIF.
Scraping company number 699: CUEN.
Scraping company number 700: FPPP.
Scraping company number 701: OWRDF.
Scraping company number 702: RSHYY.
Scraping company number 703: RZLT.
Scraping company number 704: BLONF.
Scraping company number 705: GLGI.
Scraping company number 706: RHNO.
Scraping company number 707: AAUKF.
Scraping company number 708: SMKC.
Scraping company number 709: MCLDF.
Scraping company number 710: FACO.
Scraping company number 711: DTST.
Scraping company number 712: WCTXF.
Scraping company number 713: FMCCK.
Scraping company number 714: FTMDF.
Scraping company number 715: TSSI.
Scraping company number 716: KERMF.
Scraping company number 717: ISENF.
Scraping company number 718: GRRB.
Scraping company number 719: VCBD.
Scraping company number 720: ADMT.
Scraping company number 721: UGEIF.
Scraping company number 722: DGLF.
Scraping company number 723: PFHO.
Scraping company number 724: LVCLY.
Scraping company number 725: BRTI.
Scraping

Scraping company number 929: APAAF.
Scraping company number 930: NAUH.
Scraping company number 931: FNNZF.
Scraping company number 932: MNDJF.
Scraping company number 933: CYBXF.
Scraping company number 934: IPNFF.
Scraping company number 935: NTRU.
Scraping company number 936: LVBX.
Scraping company number 937: EMGC.
Scraping company number 938: PCQRF.
Scraping company number 939: BMNM.
Scraping company number 940: PRZFF.
Scraping company number 941: JGLDF.
Scraping company number 942: BKPPF.
Scraping company number 943: GLKIF.
Scraping company number 944: DTRC.
Scraping company number 945: LIVC.
Scraping company number 946: AMSLF.
Scraping company number 947: BFFAF.
Scraping company number 948: GPDNF.
Scraping company number 949: NSRCF.
Scraping company number 950: PETV.
Scraping company number 951: EQUR.
Scraping company number 952: CFNB.
Scraping company number 953: PMCCF.
Scraping company number 954: GTXO.
Scraping company number 955: PWVI.
Scraping company number 956: SKVI.
Scrap

## Create Dataframe to store Yahoo data

In [34]:
# Create new DataFrame with columns for each Key Metric
yahooDF = pd.DataFrame(columns = ['symbol', 'previous_close', 'avg_volume', 'market_cap', 'beta', 'pe_ratio', 'price_estimate', 'latest_news_link', 'latest_news_head', 'sector', 'industry', 'employees'])

# Assign values to columns created in DF above
yahooDF['symbol'] = symbols
yahooDF['previous_close'] = previous_close_lst 
yahooDF['avg_volume'] = avg_volume_lst 
yahooDF['market_cap'] = market_cap_lst
yahooDF['beta'] = beta_lst
yahooDF['pe_ratio'] = pe_ratio_lst
yahooDF['price_estimate'] = price_estimate_lst 
yahooDF['latest_news_link'] = latest_news_link_lst
yahooDF['latest_news_head'] = latest_news_head_lst
yahooDF['sector'] = sector_lst
yahooDF['industry'] =industry_lst 
yahooDF['employees'] = employees_lst

# Display DF
yahooDF.head()

# Write DF to a CSV file
file_path = os.path.join('data','yahoo_financial_data.csv')
yahooDF.to_csv(file_path, index = False)

## Store Yahoo data in MongoDB

In [35]:
# Drop old DB in order to create a new one
db.otc_market_yahoo_data.drop()

# Convert DF to JSON to store in mongoDB
records = json.loads(yahooDF.T.to_json()).values()

# Insert dataframe as a JSON into mongoDB
db.otc_market_yahoo_data.insert(records)

  


[ObjectId('5cd0534d0caea06b3b349a8c'),
 ObjectId('5cd0534d0caea06b3b349a8d'),
 ObjectId('5cd0534d0caea06b3b349a8e'),
 ObjectId('5cd0534d0caea06b3b349a8f'),
 ObjectId('5cd0534d0caea06b3b349a90'),
 ObjectId('5cd0534d0caea06b3b349a91'),
 ObjectId('5cd0534d0caea06b3b349a92'),
 ObjectId('5cd0534d0caea06b3b349a93'),
 ObjectId('5cd0534d0caea06b3b349a94'),
 ObjectId('5cd0534d0caea06b3b349a95'),
 ObjectId('5cd0534d0caea06b3b349a96'),
 ObjectId('5cd0534d0caea06b3b349a97'),
 ObjectId('5cd0534d0caea06b3b349a98'),
 ObjectId('5cd0534d0caea06b3b349a99'),
 ObjectId('5cd0534d0caea06b3b349a9a'),
 ObjectId('5cd0534d0caea06b3b349a9b'),
 ObjectId('5cd0534d0caea06b3b349a9c'),
 ObjectId('5cd0534d0caea06b3b349a9d'),
 ObjectId('5cd0534d0caea06b3b349a9e'),
 ObjectId('5cd0534d0caea06b3b349a9f'),
 ObjectId('5cd0534d0caea06b3b349aa0'),
 ObjectId('5cd0534d0caea06b3b349aa1'),
 ObjectId('5cd0534d0caea06b3b349aa2'),
 ObjectId('5cd0534d0caea06b3b349aa3'),
 ObjectId('5cd0534d0caea06b3b349aa4'),
 ObjectId('5cd0534d0caea0