# __WEB SCRAPING__

### Objective:
* Extract NASDAQ data from a financial website
    
### Steps:
1. Research websites with financial data
2. List all the links from which to get data.
3. Write a scraping algorithm to extract selected information.
4. Store data into a dataframe
5. Combine all dataframes
6. Export as a CSV file 

### Description:
* In this project I researched financial websites including __[U.S. Securities and Exchange Commission (SEC)](https://www.sec.gov/)__ and __[Yahoo Finance](https://finance.yahoo.com/)__, which are well-known websites with financial data. After the research I found www.advfn.com to be an accurate and up-to-date data source.
* I collected all financial data from each NASDAQ company and merged it into a large data set for personal financial analysis.

### Disclaimers:
* This project is not monetized, nor is it part of any commercial activity. It complies with the "Copyright And Limited Reproduction Notices" from www.advfn.com

<br><br><br><br>

# Web Scraping Code

In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import urllib.request
from datetime import datetime


# Part 1 / 3 List of financial websites

##### Set up header variables

In [2]:
# HTTP request headers attached to every page download in this notebook.
# The User-Agent value is a contact e-mail address.
user_agent = 'SaintWhoza@protonmail.com'
headers = {
    'User-Agent': user_agent,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
    'Accept-Encoding': 'none',
    'Accept-Language': 'en-US,en;q=0.8',
    'DNT': '1',
    'Connection': 'keep-alive',
}


##### List with initial websites to be scraped

In [3]:
# Index pages that list companies alphabetically: one base URL per exchange,
# with the letter appended as the value of the `companies` query parameter.
base_url = [
    "https://www.advfn.com/nasdaq/nasdaq.asp?companies=",
    "https://www.advfn.com/nyse/newyorkstockexchange.asp?companies=",
    "https://www.advfn.com/amex/americanstockexchange.asp?companies=",
]
letters = list(map(chr, range(ord("A"), ord("Z") + 1)))  # 'A' .. 'Z'
# Every base URL combined with every letter (base-major order).
ls_websites = [exchange + letter for exchange in base_url for letter in letters]

##### FUNCTION: input url and return its html (soup)

In [4]:
# download html from url
def getSoup(url):
    """Download *url* and return its HTML parsed with BeautifulSoup.

    The request is sent with the module-level ``headers`` dict.

    Args:
        url: address of the page to download.

    Returns:
        A ``BeautifulSoup`` object built with the 'html.parser' backend.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for this request;
        nothing is caught here.
    """
    request = urllib.request.Request(url, None, headers)
    # The context manager closes the connection even when read() fails;
    # previously the response object was never closed.
    with urllib.request.urlopen(request) as response:
        data = response.read()
    return BeautifulSoup(data, 'html.parser')

##### FUNCTION: input soup and return all href

In [5]:
# store financial links
def getLinks(soup, list_):
    """Append every href in *soup* that ends with "financials" to *list_*.

    Args:
        soup: parsed page; only ``soup.select('a')`` is used.
        list_: list that receives the matching hrefs (mutated in place).

    Returns:
        None. The result is delivered through *list_*.
    """
    hrefs = (anchor.get('href') for anchor in soup.select('a'))
    # Anchors without an href attribute yield None; skip them instead of
    # failing on None.endswith().
    list_.extend(href for href in hrefs if href and href.endswith("financials"))

##### LOOP: visit every index url and collect the list of websites to scrape

In [6]:
# Takes from 1m:40s to 2m:30s
# Download every index page in ls_websites and gather the hrefs that
# getLinks() keeps (those ending in "financials") into one flat list.
# `ls` is filled in place by getLinks() and is later passed to build_dataframe().
ls = []
for url in ls_websites:
    soup = getSoup(url)
    getLinks(soup, ls)
# len(ls) = 12575,12594  (counts noted by the author; presumably from separate runs)


# Part 2/3
# Extract Individual Company Information

##### FUNCTION: input url, validate that it was not redirected, and return soup.

In [7]:
# download html from url ; only those that have "financials" at the end of the url
def getSoupProfile(url):
    """Download a company "financials" page, rejecting redirected requests.

    The request is sent with the module-level ``headers`` dict. After the
    request completes, the response's final URL is checked: if it no longer
    ends with "financials" the page is treated as a redirect and skipped.

    Args:
        url: address of the financials page to download.

    Returns:
        A ``BeautifulSoup`` object (parsed with 'html.parser') when the
        final URL ends with "financials", otherwise the integer ``0``.

    Raises:
        Whatever ``urllib.request.urlopen`` raises for this request;
        nothing is caught here.
    """
    request = urllib.request.Request(url, None, headers)
    # The context manager closes the connection on every path, including
    # the redirect branch where the body is never read.
    with urllib.request.urlopen(request) as response:
        if not response.url.endswith("financials"):
            return 0  # redirected away from the financials page
        data = response.read()
    return BeautifulSoup(data, 'html.parser')


#### FUNCTION: input soup and return pd df with column names. It is a single row.

In [8]:
# KEYS in profile

def getNamesInProfile(soup_financials):
    """Return the profile column names as a single-row DataFrame.

    Names are appended to the module-level list ``name_ls`` (it must exist
    before this is called): first five fixed descriptive names, then the
    text of the first <td> of every <tr class="border-bottom"> found in
    *soup_financials*.

    Returns:
        A DataFrame with one row labelled 'name' holding everything that is
        in ``name_ls`` after the appends.
    """
    name_ls.extend(['Company Name', 'Ticket', 'Description', 'Source Link', 'Stock Group'])

    for row in soup_financials.find_all('tr', {'class': "border-bottom"}):
        first_cell = row.find_all('td')[0]
        name_ls.append(first_cell.get_text())

    # Build one column called 'name', then flip it so the names lie in a row.
    return pd.DataFrame(data={'name': name_ls}).T


#### FUNCTION: input soup and return pd df with values. It is a single row.

In [9]:
# VALUES in profile

def getValuesInProfile(soup_financials, website):
    """Return one company's profile values as a single-row DataFrame.

    The row holds, in order: company title, ticker, description, the source
    URL, the stock group (third-from-last path segment of *website*), and
    then the text of the second <td> of every <tr class="border-bottom">,
    converted to ``int`` where the text parses as an integer.

    Args:
        soup_financials: parsed financials page of one company.
        website: URL the page was downloaded from.

    Returns:
        A DataFrame with one row labelled 'value'.

    Raises:
        IndexError: if the heading <div>, the description <td>, or the second
            cell of a profile row is missing, or *website* has fewer than
            three '/'-separated parts.
    """
    # The heading <div> carries both the title and the ticker; fetch it once
    # instead of re-running the same search for every slice.
    heading = soup_financials.find_all('div', {'class': 'mx-0 px-0'})[0].get_text()

    # NOTE(review): the fixed offsets (1, -21, -15) presumably strip a leading
    # character and a constant trailing suffix of the heading -- confirm
    # against the live page markup.
    title = heading[1:-21]
    ticket = heading[heading.rfind("(") + 1:-15]

    description = soup_financials.find_all('td', {'class': 'text-left'})[0].get_text()

    value_ls = [title, ticket, description, website, website.split('/')[-3]]

    for row in soup_financials.find_all('tr', {'class': "border-bottom"}):
        profileItem = row.find_all('td')[1].get_text()
        try:
            value_ls.append(int(profileItem))
        except (ValueError, TypeError):
            # Not an integer: keep the raw text. Catching only conversion
            # errors lets KeyboardInterrupt etc. propagate.
            value_ls.append(profileItem)

    # Build one column called 'value', then flip it so the values lie in a row.
    return pd.DataFrame(data={'value': value_ls}).T

#### Build list with column names, then build a pd df, then add df into df list

In [10]:
# Builds the heading (first row) of the combined dataFrame
# dataFrame_ls collects single-row DataFrames: the header row built here,
# then one row per company appended by build_dataframe().
dataFrame_ls = []
# name_ls is the module-level list that getNamesInProfile() appends to; it
# must be (re)created here before that function is called.
name_ls = []

anyWebsite = "https://ih.advfn.com/stock-market/NASDAQ/tesla-TSLA/financials" # used as a template website to extract KEYS headings
# The template page is fetched with getSoup(), i.e. without the redirect check.
soupWebiste = getSoup(anyWebsite)
names = getNamesInProfile(soupWebiste)
dataFrame_ls.append(names)


#### FUNCTION: input list of root links (with company information), then append a df with company profile information values to the df list. It is a single row per company.

In [11]:
# open each profile link and store its info in the list of dataFrames
def build_dataframe(link_name):
    """Scrape every URL in *link_name* and collect one DataFrame per company.

    For each URL the page is fetched with getSoupProfile(). A redirected
    page (signalled by the value 0) is only counted; otherwise the values
    from getValuesInProfile() are appended to the module-level list
    ``dataFrame_ls``. A progress line is printed for every URL.

    Args:
        link_name: list of company "financials" URLs.

    Returns:
        None. Results accumulate in ``dataFrame_ls``.
    """
    remaining = len(link_name)  # URLs not yet processed
    redirected = 0
    downloaded = 0

    for website in link_name:
        page = getSoupProfile(website)

        if page == 0:
            # Redirected away from the financials page: nothing to store.
            redirected += 1
            remaining -= 1
            print("Rederected.", redirected, "\t"*4, remaining, " left.", "\t"*4, website.split('/')[-3])
            continue

        dataFrame_ls.append(getValuesInProfile(page, website))
        remaining -= 1
        downloaded += 1
        print(downloaded, " downloads.", "\t"*4, remaining, " left. --------------- SUCCESS", "\t"*1, website.split('/')[-3])

In [12]:
# RUN ALL PROFILE FUNCTION ------- takes 9.4 hrs.
# `ls` is the list of financials links gathered earlier by the getLinks() loop;
# every page that is not redirected adds one row DataFrame to dataFrame_ls.
build_dataframe(ls)

# TRACKING
print("DONE")

Rederected. 1 				 12574  left. 				 NASDAQ
1  downloads. 				 12573  left. --------------- SUCCESS 	 NASDAQ
Rederected. 2 				 12572  left. 				 NASDAQ
2  downloads. 				 12571  left. --------------- SUCCESS 	 NASDAQ
3  downloads. 				 12570  left. --------------- SUCCESS 	 NASDAQ
4  downloads. 				 12569  left. --------------- SUCCESS 	 NASDAQ
Rederected. 3 				 12568  left. 				 NASDAQ
Rederected. 4 				 12567  left. 				 NASDAQ
5  downloads. 				 12566  left. --------------- SUCCESS 	 NASDAQ
Rederected. 5 				 12565  left. 				 NASDAQ
6  downloads. 				 12564  left. --------------- SUCCESS 	 NASDAQ
7  downloads. 				 12563  left. --------------- SUCCESS 	 NASDAQ
Rederected. 6 				 12562  left. 				 NASDAQ
8  downloads. 				 12561  left. --------------- SUCCESS 	 NASDAQ
9  downloads. 				 12560  left. --------------- SUCCESS 	 NASDAQ
10  downloads. 				 12559  left. --------------- SUCCESS 	 NASDAQ
11  downloads. 				 12558  left. --------------- SUCCESS 	 NASDAQ
12  downloads. 				

In [13]:
# Stack the header row and all company rows (row-wise, axis 0) into one frame.
# NOTE(review): the same statement is executed again at the top of the next cell.
dataFrame_finalScrape = pd.concat(dataFrame_ls, axis = 0)

# Part 3/3
# Shaping the DataFrame

#### Convert to df and save as csv

In [14]:
# Stack the header row and all company rows (row-wise, axis 0) into one frame.
dataFrame_finalScrape = pd.concat(dataFrame_ls, axis = 0)

# Timestamp (day-month-year hourminutesecond) makes every export file name unique.
date = datetime.now().strftime("%d-%m-%Y %H%M%S")
nameFile = "InvestorsHub " + date + " - Webscrape raw.csv"
# NOTE(review): absolute, machine-specific path -- the export fails on any
# other machine unless this directory exists; the cleaned export further down
# uses a relative path instead.
filePath = "/Users/pedrosanhueza/EXOXY/Personal Projects/Programming/Web Scraping/Yahoo Finance (py)/InvestorsHub - Historical Data/" + nameFile

# index=False: the row labels are not written to the file.
dataFrame_finalScrape.to_csv(filePath, index=False)

In [15]:
# Work on a copy so the raw scrape in dataFrame_finalScrape stays untouched.
dataFrame = dataFrame_finalScrape.copy()

# Promote the first row (the scraped column names) to the column header,
# then remove that row from the data.
dataFrame.columns = dataFrame.iloc[0]
dataFrame = dataFrame[1:]

# Renumber the rows 0..n-1 and discard the old row labels in one step.
# drop=True avoids the temporary 'index' column that reset_index() would
# insert; dropping that column by name afterwards would delete a real data
# column if one of the scraped names happened to be 'index'.
dataFrame = dataFrame.reset_index(drop=True)


In [16]:
# Drop the column found at position 90 (described by the author as a
# duplicated column).
# NOTE(review): drop() works on the label, so every column sharing that label
# is removed -- confirm this is intended and that position 90 is stable
# across scrapes.
dataFrame.drop(dataFrame.columns[90], axis=1, inplace=True)

# Collect the columns whose values start with "$" (chr(36)).
# Row 0 is probed first; if indexing its first character fails, row 1 is
# probed; if that fails too, the whole row-1 value is compared with "$".
colNameEdit_ls = []
for e in dataFrame.columns.values:
    try:
        if dataFrame[e].iloc[0][0] == chr(36):
            colNameEdit_ls.append(e)
    except Exception:  # narrowed from a bare except so Ctrl-C still interrupts
        try:
            if dataFrame[e].iloc[1][0] == chr(36):
                colNameEdit_ls.append(e)
        except Exception:
            # Last resort; this comparison is unguarded and may itself raise
            # (for example when the frame has fewer than two rows).
            if dataFrame[e].iloc[1] == chr(36):
                colNameEdit_ls.append(e)

# Strip thousands separators everywhere (regex substitution inside cells).
dataFrame = dataFrame.replace(',','', regex=True)

# Strip percent signs everywhere (regex substitution inside cells).
dataFrame = dataFrame.replace('%','', regex=True)

# Without regex=True only cells that are exactly '-' are blanked; hyphens
# inside longer values are left alone.
dataFrame = dataFrame.replace('-', '')

# For the "$" columns: cut the first two characters of every value, then
# convert the column to numbers.
# NOTE(review): assumes every value in these columns is a string that begins
# with "$" plus one more character (the author's note says "$\xa") -- confirm.
for e in colNameEdit_ls:
    dataFrame[e] = dataFrame[e].apply(lambda x: x[2:])
    dataFrame[e] = pd.to_numeric(dataFrame[e])



In [17]:
# save df
# Write the cleaned frame to a timestamped CSV; the path is relative to the
# current working directory. `nameFile` is reused by the reload cell below.

date = datetime.now().strftime("%d-%m-%Y %H%M%S")
nameFile = "InvestorsHub " + date + " - Webscrape.csv"
filePath = "../InvestorsHub - Historical Data/" + nameFile
# index=False: the row labels are not written to the file.
dataFrame.to_csv(filePath, index=False)


In [18]:
# open df
# it comes with data types adjusted from csv
# Reads back the file written by the save cell: same directory, and `nameFile`
# must still hold the name set there.
dataFrame = pd.read_csv(f'../InvestorsHub - Historical Data/{nameFile}')

# ---------- PRACTICE AREA -----------