In [1]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd
import numpy as np

In [2]:
# Sashares page whose share-list table is scraped in the cells below
url = 'https://sashares.co.za/shares-list/#gs.jxh54k'

In [3]:
# Get stock data from Sashares.
# The timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly on an HTTP error instead of letting an error
# page flow into the parser as if it were the share list.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [4]:
# Parse the downloaded HTML into a navigable tree using the 'html.parser' backend
soup = BeautifulSoup(markup=data, features='html.parser')

In [5]:
# Collect every <table> element present in the parsed page
html_tables = soup.find_all(name='table')
# How many tables did we find?
len(html_tables)

2

In [6]:
# We only want the first table containing stock data
# (index 0 of the tables collected from the page)
stock_table = html_tables[0]

In [7]:
# One empty list per scraped column: JSE code, share name and company name
jse_code, share, company = [], [], []

In [8]:
# Grab every <tr> in the table, then skip the first one (the table headings)
all_rows = stock_table.find_all('tr')
table_rows = all_rows[1:]

In [9]:
# Check row data - what does it look like?
# Displaying the first row shows which <td> cell holds which field.
table_rows[0]

<tr id="table_1_row_0">
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/">4SI</a></td>
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/">4SIGHT</a></td>
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/">4Sight Holdings Ltd.</a></td>
<td style="">0.23</td>
<td style="">0.00</td>
<td style="">0.00</td>
<td style="">151,770,000</td>
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/" target="_self"><button class="">View</button></a></td>
<td style="">24/12/2021 11:37 AM</td>
<td style="">24/12/2021 12:00 PM</td>
<td style="">LISTED</td>
</tr>

In [10]:
# Iterate over the rows and extract the stock symbol, share name and company
# name from the first three cells of each row.
# Start from empty lists so that re-running this cell does not append the
# same rows a second time.
jse_code, share, company = [], [], []
for row in table_rows:
    cells = row.find_all('td')
    # A row without at least three <td> cells carries no share record
    if len(cells) < 3:
        continue
    # get_text() returns a plain str (never None, even when a cell has several
    # child nodes) instead of a NavigableString that keeps a reference to the
    # whole parse tree; strip=True trims surrounding whitespace.
    jse_code.append(cells[0].get_text(strip=True))
    share.append(cells[1].get_text(strip=True))
    company.append(cells[2].get_text(strip=True))

In [11]:
# Assemble the three scraped lists into one DataFrame, one column per list
scraped_columns = {"JSE_code": jse_code, "Ticker": share, "Company": company}
stock_data = pd.DataFrame(scraped_columns)

In [12]:
# Check first few rows to confirm the columns were populated as expected
stock_data.head()

Unnamed: 0,JSE_code,Ticker,Company
0,4SI,4SIGHT,4Sight Holdings Ltd.
1,ABETNC,FNBETNABC,Exchange Traded Notes
2,ABETNQ,FNBETNABQ,Exchange Traded Notes
3,ABG,ABSA,Absa Group Ltd.
4,ABSGEA,UBS ABSGE,Exchange Traded Notes


In [13]:
# Check last few rows - the end of a scraped table may hold non-data rows
stock_data.tail()

Unnamed: 0,JSE_code,Ticker,Company
523,WWETNQ,FNBETNWWQ,Exchange Traded Notes
524,YRK,YORK,York Timber Holdings Ltd.
525,YYLBEE,YEBOYETHU,YeboYethu Ltd.
526,ZED,ZEDER,Zeder Investments Ltd.
527,JSE Code,Share,Company Name


To clean the data we need to perform the following operations:
* Drop the column Ticker - it doesn't provide any useful information
* Drop the last row (527) which contains table headings
* Select stocks only - exclude Exchange Traded Notes and Exchange Traded Funds
* Reset the index

In [14]:
# Clean the scraped table. The steps are written as one chain that builds a
# new frame (no inplace=True), so the cell can be re-run without raising
# KeyError on the already-dropped column/row.
NON_STOCK_LABELS = ['Exchange Traded Notes', 'Exchange Traded Funds']
stock_data = (
    stock_data
    # Drop column Ticker, not really useful; errors='ignore' makes a re-run a no-op.
    .drop(columns=['Ticker'], errors='ignore')
    # Drop the repeated heading row by its content rather than by a hard-coded
    # index (527), which stops being correct once the number of listings changes.
    .loc[lambda df: df['JSE_code'] != 'JSE Code']
    # Remove Exchange Traded Notes and Exchange Traded Funds
    .loc[lambda df: ~df['Company'].isin(NON_STOCK_LABELS)]
    # Reset the index
    .reset_index(drop=True)
)
# Now let's take a look
stock_data.iloc[40:60]

Unnamed: 0,JSE_code,Company
40,ART,Argent Industrial Ltd.
41,ASC,Ascendis Health Ltd.
42,ATI,Afristrat Investment Holdings Ltd.
43,ATID,Afristrat Investment Holdings Ltd.
44,ATIG,Afristrat Investment Holdings Ltd.
45,ATT,Attacq Ltd.
46,AVI,AVI Ltd.
47,AVL,Advanced Health Ltd.
48,AVV,Alviva Holdings Ltd.
49,AYO,AYO Technology Solutions Ltd.


In [15]:
# (rows, columns) of stock_data at this point in the notebook
stock_data.shape

(355, 2)

Next, we need to add additional information about the companies, such as market capitalisation and sector.

In [16]:
# Load the company information from the local CSV file
companies_csv = 'companies_list.csv'
company_data = pd.read_csv(companies_csv)
# Display the data
company_data.head()

Unnamed: 0,Code,Market Cap,Last trade,Change,%Change,Sector
0,PRX,2509610000000,1220.33,10.05,0.83,Technology
1,ANH,1647670000000,931.1,-17.36,-1.83,Consumer Goods
2,BTI,1418320000000,578.2,0.87,0.15,Consumer Goods
3,CFR,1185100000000,226.44,-0.59,-0.26,Consumer Goods
4,GLN,1131740000000,75.51,-2.08,-2.68,Basic Materials


In [17]:
# Build the combined frame: a new DataFrame with stock_data's columns plus
# empty placeholder columns for market capitalisation and sector
stock_list = stock_data.assign(Market_cap='', Sector='')

In [18]:
# Let's take a look at the new DataFrame
# (Market_cap and Sector are still empty placeholders here)
stock_list.head()

Unnamed: 0,JSE_code,Company,Market_cap,Sector
0,4SI,4Sight Holdings Ltd.,,
1,ABG,Absa Group Ltd.,,
2,ABSP,Absa Bank Ltd.,,
3,ACL,ArcelorMittal South Africa Ltd.,,
4,ACS,Acsion Ltd.,,


Next, we need to match `JSE_code` in `stock_list` to `Code` in `company_data`. By matching the two, we can extract the market cap and sector info for each available stock from `company_data` and add it to `stock_list`. We'll try to do it in a computationally inexpensive way.

In [19]:
# Create lists containing the data we want to compare and extract.
jse_code_list = stock_list['JSE_code'].tolist()
comp_codes = company_data['Code'].tolist()
comp_marks = company_data['Market Cap'].tolist()
comp_sectors = company_data['Sector'].tolist()

# Build a code -> (market cap, sector) lookup once, up front. This replaces
# rebuilding a set and rescanning every company row for each stock symbol.
# setdefault keeps the FIRST row for a code, so a code that appears more than
# once in company_data still yields exactly one entry per stock and the
# result lists stay aligned with stock_list.
company_lookup = {}
for code, cap, sec in zip(comp_codes, comp_marks, comp_sectors):
    company_lookup.setdefault(code, (cap, sec))

# Lists to store the stock symbol, extracted market cap and sector data
symbl_list = []
mkt_list = []
sector_list = []
# Walk stock_list in order - maintaining order and length is important
# because these lists are later added to stock_list as columns.
for symbl in jse_code_list:
    # Symbols missing from company_data get NaN for both values
    cap, sec = company_lookup.get(symbl, (np.nan, np.nan))
    symbl_list.append(symbl)
    mkt_list.append(cap)
    sector_list.append(sec)

Let's verify our operation. The length of each list must be the same as the number of rows in `stock_list`.

In [20]:
# Print the length of each result list, followed by the shape of stock_list
for matched in (symbl_list, mkt_list, sector_list):
    print(len(matched))
print(stock_list.shape)

355
355
355
(355, 4)


Let's add the lists to `stock_list`

In [21]:
# Replace the placeholder columns with the matched market caps and sectors
stock_list = stock_list.assign(Market_cap=mkt_list, Sector=sector_list)

# Preview the first 20 rows of the result
stock_list.head(20)

Unnamed: 0,JSE_code,Company,Market_cap,Sector
0,4SI,4Sight Holdings Ltd.,138570000.0,Financials
1,ABG,Absa Group Ltd.,126205000000.0,Financials
2,ABSP,Absa Bank Ltd.,,
3,ACL,ArcelorMittal South Africa Ltd.,10014900000.0,Basic Materials
4,ACS,Acsion Ltd.,2567240000.0,Financials
5,ACT,AfroCentric Investment Corporation Ltd.,3045840000.0,Health Care
6,ACZ,Arden Capital Ltd,2345330.0,Financials
7,ADH,ADvTECH Ltd.,9924830000.0,Consumer Services
8,ADI,Adapt IT Holdings Ltd.,976731000.0,Technology
9,ADR,Adcorp Holdings Ltd.,571764000.0,Industrials


Now let's save this as `stock_list.csv`. We will use this in the next stage of the project to select stock data.

In [22]:
# Write the combined table to disk, leaving out the DataFrame index
stock_list.to_csv(path_or_buf='stock_list.csv', index=False)