# <center>EDA for Yahoo Finance Webscrape</center>

In [1]:
import os.path

import requests
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import numpy
import seaborn as sns
import matplotlib.pyplot as plt

pd.options.display.float_format = '{:.0f}'.format

chrome_options = Options()
chrome_options.add_argument('--headless')

In [2]:
# Old version
#def get_all_tickers():
#    path = './data/nasdaq_tickers.csv'
#    if(os.path.isfile(path)):
#        return pd.read_csv(path)
#    else:
#        tickers1 = pd.read_csv('./data/nasdaq_screener_1617441070940.csv')
#        tickers2 = pd.read_csv('./data/nasdaq_screener_1617441128021.csv')
#        tickers3 = pd.read_csv('./data/nasdaq_screener_1617441157210.csv')
#
#        total = [symbol for symbol in pd.concat([tickers1['Symbol'], 
#                                               tickers2['Symbol'], 
#                                               tickers3['Symbol']]
#                                             )]
#        print(total)
#        total = pd.DataFrame(total)
#        total.to_csv(r'./data/nasdaq_tickers.csv', index=False)
#        return total
#get_all_tickers()

In [3]:
def get_tickers(*csvs):
    """Accepts a collection of file paths
       ----------------------------------
       File output goes to ./data/nasdaq_tickers as a csv file
       ----------------------------------
       Returns a pandas DataFrame"""
    
    path = './data/nasdaq_tickers.csv'
    if(os.path.isfile(path)):
        return pd.read_csv(path)
    else:
        total = [symbol for symbol in pd.concat([pd.read_csv(csv)['Symbol'] for csv in csvs[0]])]
        total = pd.DataFrame(total)
        total.to_csv(r'./data/nasdaq_tickers.csv', index=False)
        return total

In [4]:
tickers_raw = ['./data/nasdaq_screener_1617441070940.csv', './data/nasdaq_screener_1617441128021.csv', './data/nasdaq_screener_1617441157210.csv']
tickers = get_tickers(tickers_raw)
tickers = tickers.to_numpy().ravel()

In [5]:
def bypass_gdpr(driver, url):
    """Accepts the webdriver and the url (works only for yahoo finance)
       ----------------------------------------------------------------
       Returns None
    """
    driver.get(url)
    button = driver.find_element_by_name('agree')
    driver.execute_script('arguments[0].click()', button)

In [6]:
def get_features(features):
    result = {}
    temp = {}
    for i, item in enumerate(features):
        entries = [line.text for line in features[i].find_all('div', class_='D(tbc)')]
        if len(entries) > 0:
            temp = {entries[0]: entries[1:]}
        result.update(temp)
        temp = {}
    return result

In [7]:
def get_headers(features):
    return [header.text for header in features[0].find_all('div', class_='D(ib)')]

In [8]:
# Path to webdriver
path_to_driver = '/usr/bin/chromedriver'
# Base url for scraping
url = 'https://finance.yahoo.com/quote/'
# Initiate webdriver
chrome = webdriver.Chrome(path_to_driver, options=chrome_options)
# As the GDPR cookie consent pops up on driver start we have to deal with it
bypass_gdpr(chrome, url)

In [40]:
def get_ticker_info():
    result = pd.DataFrame(None)
    for ticker in tickers[0:1]:
        temp = {ticker:{}}
        temp_temp = {}
        is_link = url + ticker + '?p=' + ticker
        financial_url = url + ticker + '/financials?p=' + ticker 
    
        chrome.get(is_link)
    
        html = chrome.execute_script('return document.body.innerHTML;')
        soup = BeautifulSoup(html, 'lxml')
    
        close_price = [entry.text for entry in soup.find_all('span', {'class':'Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)'})]
        
        temp.update({ticker:{'close_price': close_price[0]}})
        chrome.get(financial_url)
        
        html_financial = chrome.execute_script('return document.body.innerHTML;')
        soup_financial = BeautifulSoup(html_financial, 'lxml')
        
        features_raw = soup_financial.find_all('div', class_='D(tbr)')
        headers = get_headers(features_raw)
        features = get_features(features_raw)
        
        temp_temp = {header: {} for header in headers}
        temp_temp['Breakdown'] = {feature for feature in features}
        temp_temp['ttm'] = features['Total Revenue'][0]
        temp_temp['12/31/2020'] = features['Total Revenue'][1]
        temp_temp['12/31/2019'] = features['Total Revenue'][2]
        temp_temp['12/31/2018'] = features['Total Revenue'][3]
        
        temp.update(temp_temp)
        print(temp)
            
        #result = pd.DataFrame(temp)
        
    return result
get_ticker_info().head()

{'AACG': {'close_price': '4.1900'}, 'Breakdown': {'Total Revenue', 'Basic EPS', 'Basic Average Shares', 'Tax Rate for Calcs', 'Other Income Expense', 'Net Interest Income', 'Cost of Revenue', 'Reconciled Cost of Revenue', 'Total Expenses', 'Diluted Average Shares', 'Net Non Operating Interest Income Expense', 'Normalized Income', 'Total Unusual Items Excluding Goodwill', 'Net Income Common Stockholders', 'Gross Profit', 'Net Income from Continuing Operation Net Minority Interest', 'Operating Income', 'Total Operating Income as Reported', 'Normalized EBITDA', 'Reconciled Depreciation', 'Total Unusual Items', 'Operating Expense', 'EBITDA', 'Diluted EPS', 'Net Income from Continuing & Discontinued Operation', 'Tax Provision', 'EBIT', 'Tax Effect of Unusual Items', 'Diluted NI Available to Com Stockholders', 'Pretax Income'}, 'ttm': '162,168', '12/31/2020': '162,168', '12/31/2019': '97,770', '12/31/2018': '1,339'}


In [10]:
#is_link = 'https://finance.yahoo.com/quote/AAPL?p=AAPL'
#
#chrome = webdriver.Chrome('/usr/bin/chromedriver', options=chrome_options)
#chrome.get(is_link)
#button = chrome.find_element_by_name('agree')
#chrome.execute_script('arguments[0].click()', button)
#
#html = chrome.execute_script('return document.body.innerHTML;')
#
#soup = BeautifulSoup(html, 'lxml')

In [11]:
#close_price = [entry.text for entry in soup.find_all('span', {'class':'Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)'})]

In [12]:
#print(close_price[0])

In [13]:
#financial_page = 'https://finance.yahoo.com/quote/AAPL/financials?p=AAPL'
#chrome.get(financial_page)
#html_financial = chrome.execute_script('return document.body.innerHTML;')
#soup_financial = BeautifulSoup(html_financial, 'lxml')

In [14]:
#features = soup_financial.find_all('div', class_='D(tbr)')
#headers, final = _get_features(features)
#html_financial = chrome.execute_script('return document.body.innerHTML;')
#soup_financial = BeautifulSoup(html_financial, 'lxml')
#print(headers)
#df = pd.DataFrame(final[1:])
#df.columns = headers

In [15]:
def convert_to_numeric(column):
    first_col = [i.replace(',','') for i in column]
    second_col = [i.replace('-','') for i in first_col]
    final_col = pd.to_numeric(second_col)
    
    return final_col

In [16]:
#for column in headers[1:]:
#    df[column] = convert_to_numeric(df[column])
#
#final_df = df.fillna('-')

In [17]:
#final_df