# <center>EDA for Yahoo Finance Webscrape</center>

In [1]:
import os.path

import requests
from requests.exceptions import RequestException
import pandas as pd
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException
#from selenium.webdriver.support.ui import WebDriverWait
#from selenium.webdriver.common.by import By
#from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Show floats without decimal places whenever pandas renders them
pd.set_option('display.float_format', '{:.0f}'.format)

# Options handed to the Chrome webdriver: run without a visible browser window
chrome_options = Options()
chrome_options.add_argument('--headless')

In [2]:
def bypass_gdpr(driver, url):
    """Accepts the webdriver and the url (works only for yahoo finance)
       ----------------------------------------------------------------
       Loads ``url`` and, if the page contains an element named 'agree'
       (the cookie-consent button), clicks it through JavaScript.
       When no such element exists the function returns without clicking,
       so a page served without the consent dialog does not abort the run.
       ----------------------------------------------------------------
       Returns None
    """
    driver.get(url)
    try:
        button = driver.find_element_by_name('agree')
    except NoSuchElementException:
        # No consent button on this page: there is nothing to dismiss.
        return
    # Click via JavaScript rather than a native click on the element.
    driver.execute_script('arguments[0].click()', button)

In [3]:
# Location of the chromedriver executable
path_to_driver = '/usr/bin/chromedriver'
# Prefix shared by every Yahoo Finance quote URL built in this notebook
url = 'https://finance.yahoo.com/quote/'
# Start Chrome using the options configured earlier
chrome = webdriver.Chrome(executable_path=path_to_driver, options=chrome_options)
# The GDPR cookie consent pops up on driver start, so dismiss it right away
bypass_gdpr(chrome, url)

In [4]:
def get_tickers(*csvs):
    """Accepts a collection of file paths
       ----------------------------------
       The paths may be given as one list/tuple (``get_tickers([a, b])``)
       or as separate arguments (``get_tickers(a, b)``). Every csv must
       contain a 'Symbol' column.
       ----------------------------------
       Symbols that contain a special character are skipped. Each remaining
       symbol is requested from the quote url and kept only when the
       request succeeds with a non-error HTTP status.
       ----------------------------------
       File output goes to ./data/nasdaq_tickers as a csv file. When that
       file already exists it is returned as-is and ``csvs`` is ignored.
       ----------------------------------
       Raises ValueError when there is no cache file and no csv was given
       ----------------------------------
       Returns a pandas DataFrame"""

    path = './data/nasdaq_tickers.csv'
    special_characters = "\"!@#$%^&*()-+?_=,<>/\""

    if os.path.isfile(path):
        # dtype=str / keep_default_na=False keep symbols such as "NA" as text
        # instead of letting pandas turn them into NaN.
        return pd.read_csv(path, dtype=str, keep_default_na=False)

    # Flatten the arguments: a str/PathLike is one path, anything else is a
    # collection of paths.
    csv_paths = []
    for entry in csvs:
        if isinstance(entry, (str, os.PathLike)):
            csv_paths.append(entry)
        else:
            csv_paths.extend(entry)
    if not csv_paths:
        raise ValueError('get_tickers needs at least one csv path when no cached ticker file exists')

    symbols = pd.concat(
        [pd.read_csv(csv, dtype={'Symbol': str}, keep_default_na=False)['Symbol'] for csv in csv_paths]
    )

    result = []
    for ticker in symbols:
        ticker = ticker.strip()
        if not ticker or any(c in special_characters for c in ticker):
            continue
        is_link = url + ticker + '?p=' + ticker
        try:
            # The timeout keeps one unresponsive request from stalling the loop.
            response = requests.get(is_link, timeout=10)
        except RequestException:
            continue
        # requests does not raise on 4xx/5xx by itself, so check the status.
        if not response.ok:
            continue
        result.append(ticker)
        print('ticker: ' + ticker)

    # A named column guarantees a header line, so even an empty result can be
    # read back from the cache file.
    result = pd.DataFrame(result, columns=['Symbol'])
    os.makedirs(os.path.dirname(path), exist_ok=True)
    result.to_csv(path, index=False)
    return result

In [5]:
# Screener csv exports that supply the candidate symbols
tickers_raw = [
    './data/nasdaq_screener_1617441070940.csv',
    './data/nasdaq_screener_1617441128021.csv',
    './data/nasdaq_screener_1617441157210.csv',
]
# Resolve the symbols and flatten the resulting frame into a 1-D array
tickers = get_tickers(tickers_raw).to_numpy().ravel()

In [6]:
def get_features(features):
    """Accepts a sequence of parsed row elements
       ----------------------------------
       For every row the text of each nested div with class 'D(tbc)' is
       collected. The first text becomes the key and the remaining texts
       become the value list. Rows without such divs are ignored; a later
       row with an already-seen key replaces the earlier values.
       ----------------------------------
       Returns a dict mapping row label -> list of cell texts"""
    table = {}
    for row in features:
        cells = [cell.text for cell in row.find_all('div', class_='D(tbc)')]
        if not cells:
            continue
        label, *values = cells
        table[label] = values
    return table

In [14]:
def get_headers(features):
    """Accepts a sequence of parsed row elements
       ----------------------------------
       Only the first row is inspected: the text of every nested div with
       class 'D(ib)' is collected in document order.
       ----------------------------------
       Returns a list of header strings"""
    header_row = features[0]
    header_cells = header_row.find_all('div', class_='D(ib)')
    return [cell.text for cell in header_cells]

In [35]:
def get_ticker_info():
    """Scrapes the financials table of the first ticker in ``tickers``
       ----------------------------------
       Uses the module-level ``chrome`` driver to load the quote page and
       then the financials page built from ``url`` and the ticker symbol.
       The rows of the financials table are parsed with ``get_features``
       and the column names with ``get_headers``.
       ----------------------------------
       A quote page without the price element, or a financials page
       without table rows, no longer aborts the scrape; the latter is
       reported with a printed message and contributes no rows.
       ----------------------------------
       Returns a pandas DataFrame with one row per table line (row label
       followed by its cell texts, all as strings)"""
    rows = []
    headers = []
    # Only the first symbol is scraped here.
    for ticker in tickers[0:1]:
        print(ticker)

        is_link = url + ticker + '?p=' + ticker
        financial_url = url + ticker + '/financials?p=' + ticker

        chrome.get(is_link)

        html = chrome.execute_script('return document.body.innerHTML;')
        soup = BeautifulSoup(html, 'lxml')

        prices = [entry.text for entry in soup.find_all('span', {'class':'Trsdu(0.3s) Fw(b) Fz(36px) Mb(-4px) D(ib)'})]
        # NOTE(review): the close price is scraped but is not part of the
        # returned frame yet; a missing price element must not stop the run.
        close_price = prices[0] if prices else None

        chrome.get(financial_url)

        html_financial = chrome.execute_script('return document.body.innerHTML;')
        soup_financial = BeautifulSoup(html_financial, 'lxml')

        features_raw = soup_financial.find_all('div', class_='D(tbr)')
        if not features_raw:
            # Without rows there is no header row to read either.
            print('no financial table found for ' + ticker)
            continue
        headers = get_headers(features_raw)
        features = get_features(features_raw)

        # One output row per table line: the label followed by its values.
        for label, values in features.items():
            rows.append([label, *values])
    return pd.DataFrame(rows, columns=headers)
# Run the scrape; as the last expression of the cell the returned DataFrame is displayed
get_ticker_info()

AACG


Unnamed: 0,Breakdown,ttm,12/31/2020,12/31/2019,12/31/2018
0,Total Revenue,162168,162168,97770,1339
1,Cost of Revenue,98521,98521,61915,4251
2,Gross Profit,63647,63647,35856,-2913
3,Operating Expense,168004,168004,144696,64879
4,Operating Income,-104358,-104358,-108840,-67792
5,Net Non Operating Interest Income Expense,1172,1172,3282,2409
6,Other Income Expense,-7667,-7667,-35703,-2671
7,Pretax Income,-110854,-110854,-141262,-68053
8,Tax Provision,-10269,-10269,-7149,0
9,Net Income Common Stockholders,-92198,-92198,-122254,854926


In [9]:
def convert_to_numeric(column):
    """Accepts an iterable of number strings such as '-104,358'
       ----------------------------------
       Thousands separators (',') are removed. A value that is empty or
       consists only of '-' is treated as missing and becomes NaN
       (presumably '-' marks an absent figure - confirm against the
       scraped pages). A leading minus sign is kept, so negative figures
       stay negative.
       ----------------------------------
       Returns the converted values as produced by pandas.to_numeric"""
    cleaned = []
    for raw in column:
        text = raw.replace(',', '').strip()
        # Only a lone dash is a placeholder; a dash in front of digits is the sign.
        cleaned.append(float('nan') if text in ('', '-') else text)

    return pd.to_numeric(cleaned)

In [10]:
#for column in headers[1:]:
#    df[column] = convert_to_numeric(df[column])
#
#final_df = df.fillna('-')

In [11]:
#final_df