In [1]:
import sys

import requests
from bs4 import BeautifulSoup
import re
import unicodedata
import pandas as pd

In [2]:
# Sashares page that lists the shares to scrape
url = 'https://sashares.co.za/shares-list/#gs.jxh54k'

In [3]:
# Get stock data from Sashares.
# The timeout keeps the notebook from hanging forever if the site does not
# respond, and raise_for_status() fails loudly on an HTTP error instead of
# letting an error page be parsed as if it were the share list.
response = requests.get(url, timeout=30)
response.raise_for_status()
data = response.text

In [4]:
# Parse the downloaded HTML into a navigable tree using Python's built-in parser
soup = BeautifulSoup(data, features='html.parser')

In [5]:
# Collect every <table> element present in the page
html_tables = soup.find_all(name='table')
# How many tables did the page contain?
len(html_tables)

2

In [6]:
# We only want the first table containing stock data
# (two tables were found on the page; index 0 is the one whose rows are parsed below)
stock_table = html_tables[0]

In [7]:
# One empty list per column we intend to scrape:
# JSE code, share name and company name
jse_code, share, company = [], [], []

In [8]:
# Grab every <tr> of the stock table ...
all_rows = stock_table.find_all('tr')
# ... and skip the first one, which holds the table headings
table_rows = all_rows[1:]

In [9]:
# Check row data - what does it look like?
# Inspect the first data row to see which <td> cells hold the fields we need
table_rows[0]

<tr id="table_1_row_0">
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/">4SI</a></td>
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/">4SIGHT</a></td>
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/">4Sight Holdings Ltd.</a></td>
<td style="">0.23</td>
<td style="">0.01</td>
<td style="">4.55</td>
<td style="">145,170,000</td>
<td style=""><a data-wpel-link="internal" href="https://sashares.co.za/4sight-holdings-shares/" target="_self"><button class="">View</button></a></td>
<td style="">23/12/2021 11:41 AM</td>
<td style="">23/12/2021 12:00 PM</td>
<td style="">LISTED</td>
</tr>

In [10]:
# Extract the stock symbol, share name and company name (the first three <td>
# cells) from each row.
def _cell_text(cell):
    """Return the text of a table cell as a plain ``str``.

    ``Tag.string`` yields a ``NavigableString``, which keeps a reference to the
    whole parse tree; converting to ``str`` keeps that tree out of the
    DataFrame built later. ``None`` (a cell without a single text child) is
    passed through unchanged.
    """
    text = cell.string
    return None if text is None else str(text)


# Rebuild the lists from scratch instead of appending to them, so re-running
# this cell does not duplicate every row.
row_cells = [row.find_all('td') for row in table_rows]
jse_code = [_cell_text(cells[0]) for cells in row_cells]
share = [_cell_text(cells[1]) for cells in row_cells]
company = [_cell_text(cells[2]) for cells in row_cells]

In [11]:
# Assemble the three scraped lists into a DataFrame, one column per list
stock_data = pd.DataFrame(
    data={
        "JSE_code": jse_code,
        "Ticker": share,
        "Company": company,
    }
)

In [12]:
# Peek at the first five rows
stock_data.head(n=5)

Unnamed: 0,JSE_code,Ticker,Company
0,4SI,4SIGHT,4Sight Holdings Ltd.
1,ABETNC,FNBETNABC,Exchange Traded Notes
2,ABETNQ,FNBETNABQ,Exchange Traded Notes
3,ABG,ABSA,Absa Group Ltd.
4,ABSGEA,UBS ABSGE,Exchange Traded Notes


In [13]:
# Peek at the last five rows
stock_data.tail(n=5)

Unnamed: 0,JSE_code,Ticker,Company
523,WWETNQ,FNBETNWWQ,Exchange Traded Notes
524,YRK,YORK,York Timber Holdings Ltd.
525,YYLBEE,YEBOYETHU,YeboYethu Ltd.
526,ZED,ZEDER,Zeder Investments Ltd.
527,JSE Code,Share,Company Name


To clean the data we need to perform the following operations:
* Drop the column Ticker - it doesn't provide any useful information
* Drop the last row (527) which contains table headings
* Select stocks only - exclude Exchange Traded Notes and Exchange Traded Funds
* Reset the index

In [14]:
# Clean the scraped table without mutating it in place, so that re-running
# this cell gives the same result instead of failing on the second run.

# The table headings are repeated as a data row; find that row by its content
# rather than by a hard-coded index, which breaks when the number of listed
# shares changes.
is_heading_row = stock_data['JSE_code'] == 'JSE Code'
# Exchange Traded Notes and Exchange Traded Funds are not stocks
is_note_or_fund = stock_data['Company'].isin(
    ['Exchange Traded Notes', 'Exchange Traded Funds']
)

stock_data = (
    stock_data
    .loc[~is_heading_row & ~is_note_or_fund]
    # Drop column Ticker, not really useful; errors='ignore' tolerates a
    # re-run where the column has already been removed
    .drop(columns=['Ticker'], errors='ignore')
    # Reset the index
    .reset_index(drop=True)
)
# Now let's take a look
stock_data.iloc[40:60]

Unnamed: 0,JSE_code,Company
40,ART,Argent Industrial Ltd.
41,ASC,Ascendis Health Ltd.
42,ATI,Afristrat Investment Holdings Ltd.
43,ATID,Afristrat Investment Holdings Ltd.
44,ATIG,Afristrat Investment Holdings Ltd.
45,ATT,Attacq Ltd.
46,AVI,AVI Ltd.
47,AVL,Advanced Health Ltd.
48,AVV,Alviva Holdings Ltd.
49,AYO,AYO Technology Solutions Ltd.


In [15]:
# Size of the cleaned data as (rows, columns)
stock_data.shape

(355, 2)