# Installing Libraries

In [None]:
#!pip install pandas_datareader

In [None]:
#!pip install yfinance

In [None]:
#!pip install yfinance --upgrade --no-cache-dir


In [None]:
#!pip install html5lib

# Importing Libraries

In [2]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from unidecode import unidecode
import re
import os
import yfinance as yf
from time import sleep
import glob

# Gathering Data - Brazilian Companies

### List of Brazilian companies and composition of the indexes - Ibovespa, IBrX100, IBrX50, IBrA

## Selenium

Used to get the list of companies in each Brazilian index

### Configuring WebDriver

In [98]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains

### Configuring the download directory

In [99]:
# Chrome preference that points the browser's default download directory at the data folder
download_path = r'C:\Users\Pedro\OneDrive\Desktop\Ironhack\04. GitHub\stocks_project\data'
prefs = {"download.default_directory": download_path}

chromeOptions = webdriver.ChromeOptions()
chromeOptions.add_experimental_option("prefs", prefs)

In [None]:
#Runs the webdriver
driver = webdriver.Chrome(options=chromeOptions)

### Download Index File (.csv)

In [None]:
def get_index_stocks(index, wait=6, download_dir=None):
    '''
        Download the composition file of a B3 index and return the name of the downloaded file.

        Opens the index page with the notebook-level Selenium `driver`, fills the
        'segment' field with "Setor de Atuação", clicks "Download" and waits until a
        new (or modified) .csv file shows up in the download directory.

        Parameters
        ----------
        index : str
            Index code used in the B3 URL (e.g. 'ibov'); it is upper-cased.
        wait : int or float, default 6
            Timeout, in seconds, used both for locating page elements and as the
            maximum time to wait for the downloaded file to appear.
        download_dir : str, optional
            Directory Chrome saves downloads into. Defaults to the notebook-level
            `download_path` used to configure the Chrome preferences.

        Returns
        -------
        str
            Bare file name (without directory) of the downloaded .csv. The working
            directory is changed to the download directory, so the name can be
            opened directly.

        Raises
        ------
        FileNotFoundError
            If no new or modified .csv appears in the download directory in time.
    '''

    target_dir = download_path if download_dir is None else download_dir
    csv_pattern = os.path.join(target_dir, '*.csv')

    def _csv_mtimes():
        # Map every .csv currently in the download directory to its modification time
        return {name: os.path.getmtime(name) for name in glob.glob(csv_pattern)}

    # The implicit wait is a driver-wide timeout for find_element, not a pause,
    # so it only needs to be set once
    driver.implicitly_wait(wait)

    # Chrome WebDriver opens the index website
    url = f'https://sistemaswebb3-listados.b3.com.br/indexPage/day/{index.upper()}?language=pt-br'
    driver.get(url)

    driver.find_element(By.ID, 'segment').send_keys("Setor de Atuação")

    # Snapshot taken right before the click, so a file left over from a previous
    # download is never mistaken for the one requested now
    before = _csv_mtimes()
    driver.find_element(By.LINK_TEXT, 'Download').click()

    # Kept from the original behaviour: the returned bare name is opened relative to the cwd
    os.chdir(target_dir)

    # Poll for the download instead of sleeping a fixed time and hoping it finished
    poll_interval = 0.5
    waited = 0
    while True:
        after = _csv_mtimes()
        fresh = [name for name, mtime in after.items() if before.get(name) != mtime]
        if fresh:
            break
        if waited >= wait:
            raise FileNotFoundError(
                f'No new .csv file for index {index.upper()} appeared in {target_dir} '
                f'within {wait} seconds'
            )
        sleep(poll_interval)
        waited += poll_interval

    # Most recently modified file among the new ones
    newest = max(fresh, key=after.get)
    return os.path.basename(newest)

### Create the Index DataFrame

In [None]:
def create_df(file):
    '''
        Read an index composition .csv file and return it as a DataFrame with normalized column names.

        Parameters
        ----------
        file : str
            Path of the .csv file to read (e.g. the name returned by `get_index_stocks`).

        Returns
        -------
        pandas.DataFrame
            Contents of the file. Column names are lower-cased, transliterated to
            ASCII, have spaces replaced by '_' and the characters '.', '(' and ')'
            removed.
    '''

    index_df = pd.read_csv(
        file,
        encoding='ISO-8859-1',
        header=1,            # row 1 (0-based) holds the column names
        sep=';',
        decimal=',',
        thousands='.',
        skipfooter=2,        # drop the last 2 lines of the file
        engine='python',     # skipfooter is not supported by the C engine
        index_col=False,     # do not use the first column as the index
    )

    # Normalizing columns (raw string: '\.' is an invalid escape in a regular string literal)
    index_df.columns = [
        re.sub(r'[\.()]', '', unidecode(column.lower()).replace(' ', '_'))
        for column in index_df.columns
    ]

    return index_df

## Creating Index's DataFrames

In [3]:
# Defining file's path
path = r'C:\Users\Pedro\OneDrive\Desktop\Ironhack\04. GitHub\stocks_project\data/'

In [None]:
# Names assigned, in this order, to the columns of each index DataFrame
col_names = ['sector', 'ticker', 'name', 'type',
             'amount', 'percentage', 'percentage_acum']

### Ibovespa

In [None]:
ibov = create_df(get_index_stocks('ibov'))
len(ibov)

In [None]:
ibov.columns = col_names

In [None]:
# Save index file
ibov.to_csv(f'{path}IBOV.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.',
                    index=False)

### IBrX100

In [None]:
ibrx = create_df(get_index_stocks('ibxx'))
len(ibrx)

In [None]:
ibrx.columns = col_names

In [None]:
# Save index file
ibrx.to_csv(f'{path}IBRX100.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.',
                    index=False)

### IBrX50

In [None]:
ibrx50 = create_df(get_index_stocks('ibxl'))
len(ibrx50)

In [None]:
ibrx50.columns = col_names

In [None]:
# Save index file
ibrx50.to_csv(f'{path}IBRX50.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.',
                    index=False)

### IBrA

In [None]:
ibra = create_df(get_index_stocks('ibra'))
len(ibra)

In [None]:
ibra.columns = col_names

In [None]:
# Save index file
ibra.to_csv(f'{path}IBRA.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.',
                    index=False)

### Transform Sector - Obsoleto?

In [None]:
ibra = pd.read_csv(f'{path}IBRA.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.')

In [None]:
len(ibra)

In [None]:
ibra

In [None]:
# Split on the first '/' only, and force exactly two result columns, so the assignment
# works even when a sector has more than one '/' or no row contains a '/'
ibra[['sector_aux', 'sub_sector_aux']] = (
    ibra['sector']
    .str.split('/', n=1, expand=True)
    .reindex(columns=[0, 1])
)

In [None]:
ibra['sector_aux'] = ibra['sector_aux'].str.strip()
ibra['sub_sector_aux'] = ibra['sub_sector_aux'].str.strip()

In [None]:
len(ibra['sector_aux'].unique())

In [None]:
len(ibra['sub_sector_aux'].unique())

## Verificando empresas que divergem nos dois índices - Obsoleto

In [None]:
# Unique tickers of IBrX100; the index DataFrames were renamed with col_names,
# so the ticker column is 'ticker' (the old 'codigo' name no longer exists)
emp_ibrx = ibrx['ticker']
set_ibrx = set(emp_ibrx)
len(set_ibrx)

In [None]:
# Unique tickers of Ibovespa; the index DataFrames were renamed with col_names,
# so the ticker column is 'ticker' (the old 'codigo' name no longer exists)
emp_ibov = ibov['ticker']
set_ibov = set(emp_ibov)
len(set_ibov)

In [None]:
# Unique tickers of IBrA; the index DataFrames were renamed with col_names,
# so the ticker column is 'ticker' (the old 'codigo' name no longer exists)
emp_ibra = ibra['ticker']
set_ibra = set(emp_ibra)
len(set_ibra)

In [None]:
print(len(set_ibra.difference(set_ibrx)))
print(set_ibra.difference(set_ibrx))

In [None]:
print(len(set_ibov.difference(set_ibrx)))
print(set_ibov.difference(set_ibrx))

In [None]:
print(len(set_ibrx.difference(set_ibov)))
print(set_ibrx.difference(set_ibov))

### Verificando quais empresas da lista completa não existem em 2021

In [None]:
# NOTE(review): `lista_codigo` and `empresas_2021` are not defined in any cell visible
# in this notebook - presumably left over from deleted cells; confirm before running.
# Codes present in the full list but missing from the 2021 companies
set_codigo = set(lista_codigo)
set_empresas_2021 = set(empresas_2021)

print(len(set_codigo.difference(set_empresas_2021)))
print(set_codigo.difference(set_empresas_2021))

## Criando DataFrame unindo os dois índices - Obsoleto

In [None]:
# Union of IBrX100 and Ibovespa with one row per ticker and without the weight columns.
# Uses the column names assigned from col_names ('ticker', 'amount', ...); the previous
# names ('codigo', 'qtde_teorica', 'part_%', 'part_%acum') no longer exist after the rename
df_empresas = (
    pd.concat([ibrx, ibov])
    .drop_duplicates(subset='ticker', ignore_index=True)
    .drop(['amount', 'percentage', 'percentage_acum'], axis=1)
)

# Gathering Data - Brazilian Companies Historical Dataset

### Create data set concatenating historical datasets downloaded from B3 website with data from the companies listed in IBRA Index

## Criando lista com o código dos principais ativos

In [None]:
## Use the IBrA ticker list, taken straight from the DataFrame so this cell does not
## depend on `set_ibra` from the obsolete set-comparison cells
codigo = ibra['ticker'].unique().tolist()

## Unzip files

In [None]:
import zipfile

# Directory holding the yearly COTAHIST_A<year>.zip archives
path = r'C:\Users\Pedro\OneDrive\Desktop\Ironhack\04. GitHub\data_visualization\data/'

# Extract every yearly archive from 1986 to 1999 into the same directory
for i in range(1986, 2000):
    with zipfile.ZipFile(f'{path}COTAHIST_A{i}.zip', 'r') as zip_ref:
        zip_ref.extractall(path)



## Rename files

In [None]:
os.listdir()

In [None]:
# Lower-case the extension of the yearly files from 2000 to 2021.
# The names are relative, so this acts on the current working directory.
for i in range(2000, 2022):
    os.rename(f'COTAHIST_A{i}.TXT', f'COTAHIST_A{i}.txt')

## Setting configurations to read B3 historical files

In [None]:
# Show up to 500 columns when a DataFrame is displayed
pd.set_option('display.max_columns', 500)

In [None]:
# Directory with the B3 historical .TXT files
path = r'C:\Users\Pedro\OneDrive\Desktop\Ironhack\05. Dados\B3\txt/'

# Most recent year of the historical series
year = 2022

# Width of each field of the fixed-width records, one entry per name in col_names
widths = [2,8,2,12,3,12,10,3,4,13,13,13,13,13,13,13,5,18,18,13,1,8,7,13,12,3]

# Column names of the fixed-width records, in file order
col_names = [
"tipo_registro",
"data_pregao",
"cod_bdi",
"cod_negociacao",
"tipo_mercado",
"nome_empresa",
"especificacao_papel",
"prazo_dias_merc_termo",
"moeda_referencia",
"preco_abertura",
"preco_maximo",
"preco_minimo",
"preco_medio",
"preco_ultimo_negocio",
"preco_melhor_oferta_compra",
"preco_melhor_oferta_venda",
"numero_negocios",
"quantidade_papeis_negociados",
"volume_total_negociado",
"preco_exercicio",
"indicador_correcao_precos",   # fixed: the name previously started with a stray accented 'ì'
"data_vencimento" ,
"fator_cotacao",
"preco_exercicio_pontos",
"codigo_isin",
"num_distribuicao_papel"]

# Columns that the (currently disabled) concatenation cell divides by 100
decimal_config=[
"preco_abertura",
"preco_maximo",
"preco_minimo",
"preco_medio",
"preco_ultimo_negocio",
"preco_melhor_oferta_compra",
"preco_melhor_oferta_venda",
"volume_total_negociado",
"preco_exercicio",
"preco_exercicio_pontos"
]


# Columns kept in the final dataset by the (currently disabled) concatenation cell
remains = [
"data_pregao",
"cod_negociacao",
"tipo_mercado",
"nome_empresa",
"preco_abertura",
"preco_maximo",
"preco_minimo",
"preco_medio",
"preco_ultimo_negocio",
]

In [None]:
dataset = pd.DataFrame()

In [None]:
'''     Concatenate DataFrames
year = 2022
while year > 1985:
    df = pd.read_fwf(f'{path}COTAHIST_A{year}.TXT',
                                    encoding='ISO-8859-1',
                                    header=0,
                                    widths=widths,
                                    skipfooter=1,
                                    engine='python',
                                    parse_dates=[1],
                                    infer_datetime_format=True,
                                    index_col=False)
    year -= 1

    #Definindo nomes das colunas
    df.columns = col_names

    #Corrigindo casas decimais
    for col in decimal_config:
        df[col]=df[col]/100

    #Selecionando colunas
    df = df[remains]
    
    #Mascara de empresas desejadas
    mask_empresas = df.cod_negociacao.isin(codigo)

    df = df[mask_empresas]

    dataset = pd.concat([dataset, df], ignore_index=True)
'''

## Export DF - Historical Series

In [None]:
# The concatenation cell is disabled, so `dataset` may still be the empty DataFrame
# created above; skip the export in that case instead of overwriting an existing
# dataset_IBRA.csv with an empty file
if not dataset.empty:
    dataset.to_csv(f'{path}dataset_IBRA.csv',
                   encoding='ISO-8859-1',
                   sep=';',
                   decimal='.',
                   index=False)

In [4]:
pd.read_csv(f'{path}dataset_IBRA.csv',
                encoding='ISO-8859-1',
                sep=';',
                decimal='.',
                index_col=False
                )

Unnamed: 0,data_pregao,cod_negociacao,tipo_mercado,nome_empresa,preco_abertura,preco_maximo,preco_minimo,preco_medio,preco_ultimo_negocio
0,2022-01-03,RRRP3,10,3R PETROLEUM,34.14,34.78,32.36,33.81,34.15
1,2022-01-03,TTEN3,10,3TENTOS,9.60,9.76,8.97,9.28,9.17
2,2022-01-03,ABCB4,10,ABC BRASIL,15.82,16.19,15.48,15.75,15.55
3,2022-01-03,AERI3,10,AERIS,6.58,6.82,6.50,6.65,6.55
4,2022-01-03,AESB3,10,AES BRASIL,11.07,11.17,10.79,10.88,10.90
...,...,...,...,...,...,...,...,...,...
521334,1998-12-30,POMO4,10,MARCOPOLO,1.05,1.10,0.97,0.98,1.10
521335,1998-12-30,RAPT4,10,RANDON PART,0.18,0.18,0.18,0.18,0.18
521336,1997-08-18,VULC3,20,VULCABRAS,40.00,40.00,40.00,40.00,40.00
521337,1997-12-23,KEPL3,20,KEPLER WEBER,36.75,36.75,36.75,36.75,36.75


# Gathering Data From Yahoo Finance

### Get historical series of Brazilian and American indexes
### Get historical series of Gold, Bitcoin and Ethereum
### Get historical series of American companies

## Yahoo Finance

In [8]:
# Yahoo Finance settings

# Tickers grouped by market
symbol_list_br = ['^BVSP', '^IBX50']
symbol_list_eua = ['^DJI', '^IXIC', '^GSPC', 'GC=F']
symbol_list_crypto = ['BTC-USD', 'ETH-USD']

# Display name for each Yahoo Finance ticker
name_dict = dict([
    ('^BVSP', 'Ibovespa'),
    ('^IBX50', 'IBrX50'),
    ('^DJI', 'Dow Jones'),
    ('^IXIC', 'NASDAQ'),
    ('^GSPC', 'S&P 500'),
    ('GC=F', 'Ouro ($)'),
    ('BTC-USD', 'Bitcoin ($)'),
    ('ETH-USD', 'Ethereum ($)'),
])

# Column names given to the historical-series DataFrames
col_names = ['date', "open", "high", "low", "close", 'cod_yfinance']

# Columns removed from the Yahoo Finance history
drop_col = ['Volume', 'Dividends', 'Stock Splits']

# Start dates
start = '2000-01-01'
start_br = '2004-11-03'

In [27]:
# Create empty indexes DataFrames
'''
df_br = pd.DataFrame()
df_eua = pd.DataFrame()
df_crypto = pd.DataFrame()
'''

In [28]:
# Fill brazilian indexes historical series dataset
'''
for ativo in symbol_list_br:
        chamada_api = yf.Ticker(ativo).history(period='max')
        chamada_api['cod_yfinace'] = ativo
        df_br = pd.concat([df_br, chamada_api])
'''

In [31]:
# Move the date from the index into column 0
'''
df_br = df_br.drop(drop_col, axis = 1)
df_br.reset_index(inplace=True)
df_br['Date'] = df_br['Date'].dt.date
df_br.columns = col_names
df_br
'''

Unnamed: 0,date,open,high,low,close,cod_yfinance
0,1993-04-27,24.799999,25.400000,24.500000,24.500000,^BVSP
1,1993-04-28,24.500000,24.600000,23.700001,24.299999,^BVSP
2,1993-04-29,24.299999,24.799999,23.700001,23.700001,^BVSP
3,1993-04-30,23.700001,24.200001,23.700001,24.100000,^BVSP
4,1993-05-03,24.100000,24.400000,23.799999,24.100000,^BVSP
...,...,...,...,...,...,...
9795,2022-12-05,18877.189453,18924.589844,18459.519531,18481.359375,^IBX50
9796,2022-12-06,18481.669922,18722.189453,18468.750000,18520.919922,^IBX50
9797,2022-12-07,18626.300781,18630.419922,18335.419922,18450.730469,^IBX50
9798,2022-12-08,18399.380859,18460.849609,18056.960938,18110.419922,^IBX50


In [42]:
# Put indexes names in the DF
df_br['name'] = df_br['cod_yfinance'].map(name_dict)

In [43]:
# Reorder columns
df_br = df_br[['date','name','open','high','low','close','cod_yfinance']]

In [44]:
# Save DataFrame
df_br.to_csv(f'{path}index_br.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.',
                    index=False)

In [45]:
# Read DataFrame
index_br = pd.read_csv(f'{path}index_br.csv',
                    sep = ';',
                    decimal = '.',
                    encoding='ISO-8859-1',
                    index_col=False
)

In [46]:
index_br

Unnamed: 0,date,name,open,high,low,close,cod_yfinance
0,1993-04-27,Ibovespa,24.799999,25.400000,24.500000,24.500000,^BVSP
1,1993-04-28,Ibovespa,24.500000,24.600000,23.700001,24.299999,^BVSP
2,1993-04-29,Ibovespa,24.299999,24.799999,23.700001,23.700001,^BVSP
3,1993-04-30,Ibovespa,23.700001,24.200001,23.700001,24.100000,^BVSP
4,1993-05-03,Ibovespa,24.100000,24.400000,23.799999,24.100000,^BVSP
...,...,...,...,...,...,...,...
9795,2022-12-05,IBrX50,18877.189453,18924.589844,18459.519531,18481.359375,^IBX50
9796,2022-12-06,IBrX50,18481.669922,18722.189453,18468.750000,18520.919922,^IBX50
9797,2022-12-07,IBrX50,18626.300781,18630.419922,18335.419922,18450.730469,^IBX50
9798,2022-12-08,IBrX50,18399.380859,18460.849609,18056.960938,18110.419922,^IBX50


In [39]:
# Fill american indexes and gold historical series dataset
'''
for ativo in symbol_list_eua:
        chamada_api = yf.Ticker(ativo).history(period='max')
        chamada_api['cod_yfinance'] = ativo
        df_eua = pd.concat([df_eua, chamada_api])
'''

In [40]:
# Move the date from the index into column 0
'''
df_eua = df_eua.drop(drop_col, axis = 1)
df_eua.reset_index(inplace=True)
df_eua['Date'] = df_eua['Date'].dt.date
df_eua.columns = col_names
df_eua
'''

Unnamed: 0,date,open,high,low,close,cod_yfinance
0,1992-01-02,3152.100098,3172.629883,3139.310059,3172.399902,^DJI
1,1992-01-03,3172.399902,3210.639893,3165.919922,3201.500000,^DJI
2,1992-01-06,3201.500000,3213.330078,3191.860107,3200.100098,^DJI
3,1992-01-07,3200.100098,3210.199951,3184.479980,3204.800049,^DJI
4,1992-01-08,3204.800049,3229.199951,3185.820068,3203.899902,^DJI
...,...,...,...,...,...,...
50309,2022-12-05,1795.699951,1808.000000,1764.300049,1767.400024,GC=F
50310,2022-12-06,1768.599976,1779.400024,1767.900024,1769.300049,GC=F
50311,2022-12-07,1769.300049,1790.300049,1769.300049,1785.500000,GC=F
50312,2022-12-08,1782.000000,1790.900024,1782.000000,1788.699951,GC=F


In [47]:
# Put indexes names in the DF
df_eua['name'] = df_eua['cod_yfinance'].map(name_dict)

In [48]:
# Reorder columns
df_eua = df_eua[['date','name','open','high','low','close','cod_yfinance']]

In [49]:
# Save DataFrame
df_eua.to_csv(f'{path}index_eua.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.',
                    index=False)

In [50]:
# Read DataFrame
index_eua = pd.read_csv(f'{path}index_eua.csv',
                    sep = ';',
                    decimal = '.',
                    encoding='ISO-8859-1',
                    index_col=False)

Unnamed: 0,date,name,open,high,low,close,cod_yfinance
0,1992-01-02,Dow Jones,3152.100098,3172.629883,3139.310059,3172.399902,^DJI
1,1992-01-03,Dow Jones,3172.399902,3210.639893,3165.919922,3201.500000,^DJI
2,1992-01-06,Dow Jones,3201.500000,3213.330078,3191.860107,3200.100098,^DJI
3,1992-01-07,Dow Jones,3200.100098,3210.199951,3184.479980,3204.800049,^DJI
4,1992-01-08,Dow Jones,3204.800049,3229.199951,3185.820068,3203.899902,^DJI
...,...,...,...,...,...,...,...
50309,2022-12-05,Ouro ($),1795.699951,1808.000000,1764.300049,1767.400024,GC=F
50310,2022-12-06,Ouro ($),1768.599976,1779.400024,1767.900024,1769.300049,GC=F
50311,2022-12-07,Ouro ($),1769.300049,1790.300049,1769.300049,1785.500000,GC=F
50312,2022-12-08,Ouro ($),1782.000000,1790.900024,1782.000000,1788.699951,GC=F


In [None]:
index_eua

In [51]:
# Fill crypto historical series dataset
'''
for ativo in symbol_list_crypto:
        chamada_api = yf.Ticker(ativo).history(period='max')
        chamada_api['cod_yfinace'] = ativo
        df_crypto = pd.concat([df_crypto, chamada_api])
'''

In [52]:
# Move the date from the index into column 0
'''
df_crypto = df_crypto.drop(drop_col, axis = 1)
df_crypto.reset_index(inplace=True)
df_crypto['Date'] = df_crypto['Date'].dt.date
df_crypto.columns = col_names
df_crypto
'''

Unnamed: 0,date,open,high,low,close,cod_yfinance
0,2014-09-17,465.864014,468.174011,452.421997,457.334015,BTC-USD
1,2014-09-18,456.859985,456.859985,413.104004,424.440002,BTC-USD
2,2014-09-19,424.102997,427.834991,384.532013,394.795990,BTC-USD
3,2014-09-20,394.673004,423.295990,389.882996,408.903992,BTC-USD
4,2014-09-21,408.084991,412.425995,393.181000,398.821014,BTC-USD
...,...,...,...,...,...,...
4858,2022-12-05,1279.998901,1302.237549,1252.472412,1259.676758,ETH-USD
4859,2022-12-06,1259.854248,1271.923218,1247.629883,1271.653809,ETH-USD
4860,2022-12-07,1271.553101,1272.694092,1224.447998,1232.437500,ETH-USD
4861,2022-12-08,1232.451782,1286.229736,1226.358521,1281.116333,ETH-USD


In [54]:
# Put indexes names in the DF
df_crypto['name'] = df_crypto['cod_yfinance'].map(name_dict)

In [55]:
# Reorder columns
df_crypto = df_crypto[['date','name','open','high','low','close','cod_yfinance']]

In [56]:
df_crypto.to_csv(f'{path}crypto.csv',
                    encoding='ISO-8859-1',
                    sep=';',
                    decimal='.',
                    index=False)

In [57]:
crypto = pd.read_csv(f'{path}crypto.csv',
                    sep = ';',
                    decimal = '.',
                    encoding='ISO-8859-1',
                    index_col=False)


In [58]:
crypto

Unnamed: 0,date,name,open,high,low,close,cod_yfinance
0,2014-09-17,Bitcoin ($),465.864014,468.174011,452.421997,457.334015,BTC-USD
1,2014-09-18,Bitcoin ($),456.859985,456.859985,413.104004,424.440002,BTC-USD
2,2014-09-19,Bitcoin ($),424.102997,427.834991,384.532013,394.795990,BTC-USD
3,2014-09-20,Bitcoin ($),394.673004,423.295990,389.882996,408.903992,BTC-USD
4,2014-09-21,Bitcoin ($),408.084991,412.425995,393.181000,398.821014,BTC-USD
...,...,...,...,...,...,...,...
4858,2022-12-05,Ethereum ($),1279.998901,1302.237549,1252.472412,1259.676758,ETH-USD
4859,2022-12-06,Ethereum ($),1259.854248,1271.923218,1247.629883,1271.653809,ETH-USD
4860,2022-12-07,Ethereum ($),1271.553101,1272.694092,1224.447998,1232.437500,ETH-USD
4861,2022-12-08,Ethereum ($),1232.451782,1286.229736,1226.358521,1281.116333,ETH-USD


# Gathering Data - American Companies

### List of American companies and composition of the indexes - S&P500, Dow Jones, Nasdaq

## S&P500

In [59]:
url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'

data = pd.read_html(url)

In [60]:
data[0].head()

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
3,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ABMD,Abiomed,reports,Health Care,Health Care Equipment,"Danvers, Massachusetts",2018-05-31,815094,1981


In [61]:
# Select the columns by name instead of by position, so a change in the layout of the
# scraped table raises a KeyError instead of silently picking different columns;
# .copy() keeps sp500 independent of the parsed table
sp500 = data[0][['Symbol', 'Security', 'GICS Sector', 'GICS Sub-Industry']].copy()

In [62]:
sp500.columns = ['ticker', 'name', 'sector', 'sub_industry']

In [63]:
sp500

Unnamed: 0,ticker,name,sector,sub_industry
0,MMM,3M,Industrials,Industrial Conglomerates
1,AOS,A. O. Smith,Industrials,Building Products
2,ABT,Abbott,Health Care,Health Care Equipment
3,ABBV,AbbVie,Health Care,Pharmaceuticals
4,ABMD,Abiomed,Health Care,Health Care Equipment
...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment
501,ZION,Zions Bancorporation,Financials,Regional Banks


In [64]:
sp500.to_csv(f'{path}SP500.csv',
                    encoding='UTF-8',
                    sep=';',
                    decimal='.',
                    index=False)

In [65]:
pd.read_csv(f'{path}SP500.csv',
                    encoding='UTF-8',
                    sep=';',
                    decimal='.')

Unnamed: 0,ticker,name,sector,sub_industry
0,MMM,3M,Industrials,Industrial Conglomerates
1,AOS,A. O. Smith,Industrials,Building Products
2,ABT,Abbott,Health Care,Health Care Equipment
3,ABBV,AbbVie,Health Care,Pharmaceuticals
4,ABMD,Abiomed,Health Care,Health Care Equipment
...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment
501,ZION,Zions Bancorporation,Financials,Regional Banks


## Nasdaq

In [66]:
url = 'https://en.wikipedia.org/wiki/Nasdaq-100'

data = pd.read_html(url)

In [74]:
# Table 4 of the page holds the index components; copy it so that renaming the
# columns of `nasdaq` does not also rename the columns of data[4]
nasdaq = data[4].copy()
nasdaq

Unnamed: 0,Company,Symbol,GICS Sector,GICS Sub-Industry
0,Activision Blizzard,ATVI,Communication Services,Interactive Home Entertainment
1,Adobe Inc.,ADBE,Information Technology,Application Software
2,ADP,ADP,Information Technology,Data Processing & Outsourced Services
3,Airbnb,ABNB,Consumer Discretionary,Internet & Direct Marketing Retail
4,Align Technology,ALGN,Health Care,Health Care Supplies
...,...,...,...,...
97,Walgreens Boots Alliance,WBA,Consumer Staples,Drug Retail
98,"Workday, Inc.",WDAY,Information Technology,Application Software
99,Xcel Energy,XEL,Utilities,Multi-Utilities
100,Zoom Video Communications,ZM,Information Technology,Application Software


In [75]:
nasdaq.columns = ['name','ticker', 'sector', 'sub_industry']

In [77]:
nasdaq.to_csv(f'{path}NASDAQ.csv',
                    encoding='UTF-8',
                    sep=';',
                    decimal='.',
                    index=False)

In [78]:
pd.read_csv(f'{path}NASDAQ.csv',
                    encoding='UTF-8',
                    sep=';',
                    decimal='.')

Unnamed: 0,name,ticker,sector,sub_industry
0,Activision Blizzard,ATVI,Communication Services,Interactive Home Entertainment
1,Adobe Inc.,ADBE,Information Technology,Application Software
2,ADP,ADP,Information Technology,Data Processing & Outsourced Services
3,Airbnb,ABNB,Consumer Discretionary,Internet & Direct Marketing Retail
4,Align Technology,ALGN,Health Care,Health Care Supplies
...,...,...,...,...
97,Walgreens Boots Alliance,WBA,Consumer Staples,Drug Retail
98,"Workday, Inc.",WDAY,Information Technology,Application Software
99,Xcel Energy,XEL,Utilities,Multi-Utilities
100,Zoom Video Communications,ZM,Information Technology,Application Software


## Dow Jones

In [79]:
url = 'https://en.wikipedia.org/wiki/Dow_Jones_Industrial_Average'

data = pd.read_html(url)

In [84]:
# Select the columns by name instead of by position, so a change in the layout of the
# scraped table raises a KeyError instead of silently picking different columns;
# .copy() keeps dow_jones independent of the parsed table
dow_jones = data[1][['Company', 'Symbol', 'Industry']].copy()
dow_jones

Unnamed: 0,Company,Symbol,Industry
0,3M,MMM,Conglomerate
1,American Express,AXP,Financial services
2,Amgen,AMGN,Biopharmaceutical
3,Apple,AAPL,Information technology
4,Boeing,BA,Aerospace and defense
5,Caterpillar,CAT,Construction and Mining
6,Chevron,CVX,Petroleum industry
7,Cisco,CSCO,Information technology
8,Coca-Cola,KO,Drink industry
9,Dow,DOW,Chemical industry


In [85]:
dow_jones.columns = ['name','ticker', 'sector']

In [86]:
dow_jones.to_csv(f'{path}DOW_JONES.csv',
                    encoding='UTF-8',
                    sep=';',
                    decimal='.',
                    index=False)

In [87]:
pd.read_csv(f'{path}DOW_JONES.csv',
                    encoding='UTF-8',
                    sep=';',
                    decimal='.')

Unnamed: 0,name,ticker,sector
0,3M,MMM,Conglomerate
1,American Express,AXP,Financial services
2,Amgen,AMGN,Biopharmaceutical
3,Apple,AAPL,Information technology
4,Boeing,BA,Aerospace and defense
5,Caterpillar,CAT,Construction and Mining
6,Chevron,CVX,Petroleum industry
7,Cisco,CSCO,Information technology
8,Coca-Cola,KO,Drink industry
9,Dow,DOW,Chemical industry


# Gathering Data - Companies Sector and Sub-Sector

### Get sector and sub-sector of all companies using web scraping on the Yahoo Finance website

In [92]:
url = 'https://finance.yahoo.com/quote/MSFT/profile?p=MSFT'


In [100]:
#Runs the webdriver
driver = webdriver.Chrome(options=chromeOptions)

In [101]:
# Timeout (seconds) used for the driver's implicit wait
wait = 5

# Open the Yahoo Finance profile page of MSFT
url = 'https://finance.yahoo.com/quote/MSFT/profile?p=MSFT'
driver.get(url)
driver.implicitly_wait(wait)


In [104]:
sector = driver.find_element(By.XPATH, '//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[2]')
driver.implicitly_wait(wait)

In [106]:
sector.text

'Technology'

In [107]:
industry = driver.find_element(By.XPATH, '//*[@id="Col1-0-Profile-Proxy"]/section/div[1]/div/div/p[2]/span[4]')
driver.implicitly_wait(wait)

In [108]:
industry.text

'Software—Infrastructure'

In [None]:
# Set the directory
os.chdir(r'C:\Users\Pedro\OneDrive\Desktop\Ironhack\04. GitHub\stocks_project\data')
sleep(wait)