# Market Data

## Yahoo Finance Scrape

In [1]:
# Import dependencies
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import pymongo
import json

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

## Connect to MongoDB

In [2]:
# Open a pymongo client against the local MongoDB server and pick the finance database
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(conn)
db = client['otc_market_finance_data']

# Cursor over every document in the most-active-stocks collection
most_actives = db['otc_market_most_active_stocks'].find({})

# Collect the ticker symbol stored under 'SYMBOL' in each document
symbols = [stock_doc['SYMBOL'] for stock_doc in most_actives]

ServerSelectionTimeoutError: localhost:27017: [Errno 61] Connection refused

## Scrape Yahoo Page

In [28]:
# Use this line to limit the number of companies to scrape
number_of_companies_to_run = 10
symbols = symbols[:number_of_companies_to_run]

# Seconds to wait for a page's dynamically rendered content before parsing anyway
PAGE_LOAD_TIMEOUT = 30
# Placeholder stored for any value that is missing or cannot be parsed
NOT_AVAILABLE = 'N/A'
# Multipliers for the magnitude suffix of abbreviated figures such as '12.3M'
MAGNITUDE_SUFFIXES = {'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12}
# Elements whose presence signals that a page has rendered the data parsed below
SUMMARY_READY = (By.CSS_SELECTOR, '[data-test="PREV_CLOSE-value"]')
PROFILE_READY = (By.CSS_SELECTOR, 'p[class="D(ib) Va(t)"]')


def _load_page(driver, url, ready_locator):
    """Open `url` in `driver`, wait for `ready_locator`, and return the parsed page.

    The page source is read only after the explicit wait, so content rendered
    by JavaScript has a chance to appear. A timeout is not fatal: the page is
    parsed as-is and any field that is absent is reported as 'N/A' downstream.
    """
    driver.get(url)
    try:
        WebDriverWait(driver, PAGE_LOAD_TIMEOUT).until(
            EC.presence_of_element_located(ready_locator)
        )
    except TimeoutException:
        pass
    return BeautifulSoup(driver.page_source, 'lxml')


def _quote_value(page, data_test):
    """Return the text of the first <span> inside the element tagged `data_test`, or None."""
    cell = page.find(attrs={'data-test': data_test})
    span = cell.span if cell is not None else None
    return span.text.strip() if span is not None else None


def _parse_number(text):
    """Convert text such as '1,234.5', '12.3M' or '1.2B' to a float.

    Thousands separators are removed and a trailing K/M/B/T suffix (any case)
    is applied as a multiplier. Returns 'N/A' when `text` is empty, None, or
    not numeric (for example the literal 'N/A' shown on the page).
    """
    if not text:
        return NOT_AVAILABLE
    cleaned = text.replace(',', '').strip()
    if not cleaned:
        return NOT_AVAILABLE
    multiplier = MAGNITUDE_SUFFIXES.get(cleaned[-1].upper())
    if multiplier is not None:
        cleaned = cleaned[:-1]
    try:
        value = float(cleaned)
    except ValueError:
        return NOT_AVAILABLE
    return value if multiplier is None else value * multiplier


def _latest_news(page):
    """Return (link, headline) of the first news stream item, or 'N/A' for each missing part."""
    item = page.find('li', class_='js-stream-content')
    anchor = item.a if item is not None else None
    if anchor is None:
        return NOT_AVAILABLE, NOT_AVAILABLE
    href = anchor.get('href')
    if not href:
        link = NOT_AVAILABLE
    elif href.startswith('http'):
        # Already absolute; prefixing the site root would corrupt it
        link = href
    else:
        link = 'https://finance.yahoo.com' + href
    return link, anchor.text


def _value_after_colon(segment):
    """Return the text between the first and second ':' of `segment`, or 'N/A' if blank."""
    parts = segment.replace('\xa0', '').split(':')
    value = parts[1].strip() if len(parts) > 1 else ''
    return value or NOT_AVAILABLE


def _parse_profile(page):
    """Return (sector, industry, employee_count) parsed from a profile page.

    The profile paragraph is one run of text in which the sector comes first,
    followed by the 'Industry' and 'Full Time' labelled sections. Each value
    is the text after the colon of its section. Anything that cannot be
    located or parsed is returned as 'N/A'.
    """
    block = page.find('p', class_='D(ib) Va(t)')
    if block is None:
        return NOT_AVAILABLE, NOT_AVAILABLE, NOT_AVAILABLE

    text = block.text
    industry_at = text.find('Industry')
    employees_at = text.find('Full Time')
    # str.find returns -1 for a missing label; slicing with it would silently
    # produce the wrong sections, so bail out when labels are absent or out of order
    if industry_at == -1 or employees_at < industry_at:
        return NOT_AVAILABLE, NOT_AVAILABLE, NOT_AVAILABLE

    sector = _value_after_colon(text[:industry_at])
    industry = _value_after_colon(text[industry_at:employees_at])

    employee_count = _value_after_colon(text[employees_at:])
    if employee_count != NOT_AVAILABLE:
        try:
            employee_count = int(employee_count.replace(',', ''))
        except ValueError:
            employee_count = NOT_AVAILABLE
    return sector, industry, employee_count


# Create empty lists for each key metric datapoint that is being scraped below
previous_close_lst = []
avg_volume_lst = []
market_cap_lst = []
beta_lst = []
pe_ratio_lst = []
price_estimate_lst = []
latest_news_link_lst = []
latest_news_head_lst = []
sector_lst = []
industry_lst = []
employees_lst = []
companies_counter = 1

# Loop through all ticker symbols and extract key data from Yahoo Finance
for symbol in symbols:
    print(f"Scraping company number {companies_counter}: {symbol}.")

    # Using Selenium to open a browser connection for this symbol
    driver = webdriver.Chrome('/usr/local/bin/chromedriver')
    try:
        # Summary page: price, volume, valuation metrics and the latest news item
        url = 'https://finance.yahoo.com/quote/' + symbol
        soup = _load_page(driver, url, SUMMARY_READY)

        previous_close = _parse_number(_quote_value(soup, 'PREV_CLOSE-value'))
        avg_volume = _parse_number(_quote_value(soup, 'AVERAGE_VOLUME_3MONTH-value'))
        market_cap = _parse_number(_quote_value(soup, 'MARKET_CAP-value'))
        beta = _parse_number(_quote_value(soup, 'BETA_3Y-value'))
        pe_ratio = _parse_number(_quote_value(soup, 'PE_RATIO-value'))
        price_estimate = _parse_number(_quote_value(soup, 'ONE_YEAR_TARGET_PRICE-value'))
        latest_news_link, latest_news_head = _latest_news(soup)

        # Profile page: sector, industry and full-time employee count
        url = url + '/profile?p=' + symbol
        soup = _load_page(driver, url, PROFILE_READY)
        sector, industry, employee_count = _parse_profile(soup)
    finally:
        # Always close the browser window, even if scraping this symbol failed
        driver.quit()

    # Append all extracted values to the result lists (one entry per symbol)
    previous_close_lst.append(previous_close)
    avg_volume_lst.append(avg_volume)
    market_cap_lst.append(market_cap)
    beta_lst.append(beta)
    pe_ratio_lst.append(pe_ratio)
    price_estimate_lst.append(price_estimate)
    latest_news_link_lst.append(latest_news_link)
    latest_news_head_lst.append(latest_news_head)
    sector_lst.append(sector)
    industry_lst.append(industry)
    employees_lst.append(employee_count)

    companies_counter += 1

Scraping company number 1: RHHBY.
Scraping company number 2: GBTC.
Scraping company number 3: FMCKJ.
Scraping company number 4: FNMAS.
Scraping company number 5: CURLF.
Scraping company number 6: DTEGY.
Scraping company number 7: RHHVF.
Scraping company number 8: OGRMF.
Scraping company number 9: CVSI.
Scraping company number 10: FERGY.


## Create Dataframe to store Yahoo data

In [26]:
# Build the DataFrame in one step, one column per key metric.
# All lists must have the same length as `symbols`; pandas raises ValueError otherwise.
yahooDF = pd.DataFrame({
    'symbol': symbols,
    'previous_close': previous_close_lst,
    'avg_volume': avg_volume_lst,
    'market_cap': market_cap_lst,
    'beta': beta_lst,
    'pe_ratio': pe_ratio_lst,
    'price_estimate': price_estimate_lst,
    'latest_news_link': latest_news_link_lst,
    'latest_news_head': latest_news_head_lst,
    'sector': sector_lst,
    'industry': industry_lst,
    'employees': employees_lst,
})

# Write DF to a CSV file, creating the output folder first so a fresh checkout does not fail
file_path = os.path.join('data','yahoo_financial_data.csv')
os.makedirs(os.path.dirname(file_path), exist_ok=True)
yahooDF.to_csv(file_path, index = False)

# Display DF (only the last expression of a cell is rendered, so this goes at the end)
yahooDF.head()

## Store Yahoo data in MongoDB

In [27]:
# Drop the existing collection so documents from earlier runs are not mixed with this one
db.otc_market_yahoo_data.drop()

# Round-trip the DataFrame through JSON to get a list of one plain-Python dict per row
records = json.loads(yahooDF.to_json(orient='records'))

# Collection.insert was removed in PyMongo 4; insert_many is the supported bulk call.
# It rejects an empty document list, so skip the insert when there are no rows.
inserted_ids = db.otc_market_yahoo_data.insert_many(records).inserted_ids if records else []

# Show the ids of the stored documents
inserted_ids

  


[ObjectId('5ccdc83c247ac635645edf71'),
 ObjectId('5ccdc83c247ac635645edf72'),
 ObjectId('5ccdc83c247ac635645edf73'),
 ObjectId('5ccdc83c247ac635645edf74'),
 ObjectId('5ccdc83c247ac635645edf75'),
 ObjectId('5ccdc83c247ac635645edf76'),
 ObjectId('5ccdc83c247ac635645edf77'),
 ObjectId('5ccdc83c247ac635645edf78'),
 ObjectId('5ccdc83c247ac635645edf79'),
 ObjectId('5ccdc83c247ac635645edf7a')]