# Market Data

### Yahoo Finance Scrape

In [None]:
# Import dependencies
from splinter import Browser
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import pymongo
import json

In [None]:
# Connect to the local MongoDB server through pymongo and select the
# otc_markets database
conn = "mongodb://localhost:27017"
client = pymongo.MongoClient(conn)
db = client['otc_markets']

# Sample tickers for testing: uncomment to seed the most_active collection
#db.most_active.insert({'symbols': 'ABCE'})
#db.most_active.insert({'symbols': 'ABEPF'})
#db.most_active.insert({'symbols': 'ABMT'})

# Query every document stored in the most_active collection
most_actives = db['most_active'].find({})

# Collect the value stored under the 'symbols' key of each document
symbols = [document['symbols'] for document in most_actives]

# Show the collected symbols as the cell output
symbols

In [None]:
# Create one empty list per key metric scraped below. Every completed loop
# iteration appends exactly one value to each list, so the lists stay aligned
# with `symbols`.
previous_close_lst = []
avg_volume_lst = []
market_cap_lst = []
beta_lst = []
pe_ratio_lst = []
price_estimate_lst = []
latest_news_link_lst = []
latest_news_head_lst = []
sector_lst = []
industry_lst = []
employees_lst = []

# Placeholder stored whenever a value is missing from the page or not numeric
NOT_AVAILABLE = 'N/A'

# Suffixes of abbreviated figures and the factor each one stands for
_MAGNITUDES = {'k': 1e3, 'K': 1e3, 'M': 1e6, 'B': 1e9, 'T': 1e12}


def _to_number(text):
    """Convert figure text such as '1,234', '12.5M' or '3.1B' to a float.

    Thousands separators are removed and a trailing magnitude suffix is
    expanded. Raises ValueError when the remaining text is not numeric
    (for example 'N/A' or an empty string).
    """
    text = text.replace(',', '').strip()
    if text and text[-1] in _MAGNITUDES:
        return float(text[:-1]) * _MAGNITUDES[text[-1]]
    return float(text)


def _scrape_number(soup, data_test):
    """Return the number in the element whose data-test attribute is *data_test*.

    Returns 'N/A' when the element (or its <span>) is absent, or when its text
    cannot be converted to a number.
    """
    try:
        return _to_number(soup.find(attrs={'data-test': data_test}).span.text)
    except (AttributeError, ValueError):
        # AttributeError: find() or .span returned None; ValueError: text not numeric
        return NOT_AVAILABLE


def _scrape_latest_news(soup):
    """Return (link, headline) of the first 'js-stream-content' list item.

    Each part falls back to 'N/A' when the list item, its anchor, or the
    anchor's href is missing.
    """
    from urllib.parse import urljoin

    item = soup.find('li', class_='js-stream-content')
    anchor = item.a if item is not None else None
    if anchor is None:
        return NOT_AVAILABLE, NOT_AVAILABLE
    href = anchor.get('href')
    # urljoin prefixes site-relative hrefs and leaves absolute URLs untouched
    link = urljoin('https://finance.yahoo.com', href) if href else NOT_AVAILABLE
    return link, anchor.text


def _field_after_colon(chunk):
    """Return the text after the first ':' in *chunk*, or 'N/A' when there is none."""
    value = chunk.replace('\xa0', '').partition(':')[2].strip()
    return value or NOT_AVAILABLE


def _parse_profile(text):
    """Split the profile paragraph text into (sector, industry, employee_count).

    The text is cut at the 'Industry' and 'Full Time' labels and the value
    after each label's colon is taken. Missing or empty values become 'N/A';
    the employee count is returned as an int when it is numeric.
    """
    industry_pos = text.find('Industry')
    employees_pos = text.find('Full Time')
    # find() returns -1 for a missing label; move the cut to the end of the
    # preceding section so the slices below never wrap around
    if employees_pos == -1:
        employees_pos = len(text)
    if industry_pos == -1:
        industry_pos = employees_pos

    sector = _field_after_colon(text[:industry_pos])
    industry = _field_after_colon(text[industry_pos:employees_pos])
    employees = _field_after_colon(text[employees_pos:])
    try:
        # Drop thousands separators before converting the head count
        employee_count = int(employees.replace(',', ''))
    except ValueError:
        employee_count = NOT_AVAILABLE
    return sector, industry, employee_count


# Location of the chromedriver binary; identical for every browser session
executable_path = {'executable_path': '/usr/local/bin/chromedriver'}

# Open the Yahoo Finance quote page of every ticker symbol and extract key data
for symbol in symbols:
    browser = Browser('chrome', **executable_path, headless=False)
    try:
        url = 'https://finance.yahoo.com/quote/' + symbol
        browser.visit(url)

        # Wait for page to load
        time.sleep(5)

        # Parse the quote summary page and collect the key metrics
        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        previous_close = _scrape_number(soup, 'PREV_CLOSE-value')
        avg_volume = _scrape_number(soup, 'AVERAGE_VOLUME_3MONTH-value')
        market_cap = _scrape_number(soup, 'MARKET_CAP-value')
        beta = _scrape_number(soup, 'BETA_3Y-value')
        pe_ratio = _scrape_number(soup, 'PE_RATIO-value')
        price_estimate = _scrape_number(soup, 'ONE_YEAR_TARGET_PRICE-value')
        latest_news_link, latest_news_head = _scrape_latest_news(soup)

        # Navigate to the Profile page to scrape additional data
        # NOTE(review): this call may raise when the page has no 'Profile'
        # link; the finally clause below still closes the browser
        browser.click_link_by_partial_text('Profile')

        # Wait for page to load
        time.sleep(5)

        html = browser.html
        soup = BeautifulSoup(html, 'html.parser')

        # Only parse the profile paragraph when it exists; otherwise all
        # three profile fields are N/A
        profile = soup.find('p', class_='D(ib) Va(t)')
        if profile is None:
            sector = industry = employee_count = NOT_AVAILABLE
        else:
            sector, industry, employee_count = _parse_profile(profile.text)
        print(sector)
        print(industry)
        print(employee_count)
    finally:
        # Close the browser even when scraping this symbol failed
        browser.quit()

    # Append all extracted values to the result lists
    previous_close_lst.append(previous_close)
    avg_volume_lst.append(avg_volume)
    market_cap_lst.append(market_cap)
    beta_lst.append(beta)
    pe_ratio_lst.append(pe_ratio)
    price_estimate_lst.append(price_estimate)
    latest_news_link_lst.append(latest_news_link)
    latest_news_head_lst.append(latest_news_head)
    sector_lst.append(sector)
    industry_lst.append(industry)
    employees_lst.append(employee_count)

In [None]:
# Map each output column to the list of scraped values that fills it; every
# list is expected to hold one entry per symbol, in the same order as `symbols`
metric_columns = {
    'symbol': symbols,
    'previous_close': previous_close_lst,
    'avg_volume': avg_volume_lst,
    'market_cap': market_cap_lst,
    'beta': beta_lst,
    'pe_ratio': pe_ratio_lst,
    'price_estimate': price_estimate_lst,
    'latest_news_link': latest_news_link_lst,
    'latest_news_head': latest_news_head_lst,
    'sector': sector_lst,
    'industry': industry_lst,
    'employees': employees_lst,
}

# Create the DataFrame with one column per key metric, in the order listed above
yahooDF = pd.DataFrame(metric_columns, columns=list(metric_columns))

# Preview the first rows (a notebook only renders this when it is the last
# expression of the cell)
yahooDF.head()

# Write DF to a CSV file, creating the output directory first so the write
# does not fail when 'data' does not exist yet
output_dir = 'data'
os.makedirs(output_dir, exist_ok=True)
file_path = os.path.join(output_dir, 'yahoo_financial_data.csv')
yahooDF.to_csv(file_path, index=False)

In [None]:
# Drop the old collection so it is rebuilt from the current DataFrame
db.yahoo_finance_data.drop()

# Round-trip the DataFrame through JSON to get one plain dict per row,
# holding only built-in Python values
records = json.loads(yahooDF.to_json(orient='records'))

# Insert all rows in one call; insert_many requires a non-empty list, so skip
# the insert when the DataFrame has no rows
if records:
    db.yahoo_finance_data.insert_many(records)