In [4]:
import nltk
import numpy
import string
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import pandas as pd

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /Users/20790627/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/20790627/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
# these are the companies listed on the Johannesburg Stock Exchange
listed_companies = pd.read_excel('Complete-List-of-Listed-Companies-on-South-Africa-Johannesburg-Stock-Exchange-Jan-2021.xlsx')
listed_companies.head()

Unnamed: 0,S.No.,Company Name,Ticker,Sector
0,1,4Sight Holdings Limited,4SI,Financial Services
1,2,ABSA Bank Limited,ABSP,Banks
2,3,Barclays Africa Group Limited,ABG.ZA,Banks
3,4,Accelerate Property Fund Limited,APF,Real Estate Investment Trusts
4,5,Accentuate Limited,ACE,Chemicals


In [6]:
companies = list(listed_companies['Company Name'].unique())

In [7]:
# load the Zondo Comission Into State Capture reports
zondo_reports = pd.read_csv('zondo_reports.csv')
zondo_reports = zondo_reports.drop(columns=['Unnamed: 0'])
zondo_reports.head()

Unnamed: 0,report,text
0,OCR version - State Capture Commission Report ...,dicial \nn of Ir• \nS a \nI I eport: Part 1 \n...
1,OCR version - State Capture Commission Report ...,Judicial Commission \nOf \n Inquiry into...
2,OCR version - State Capture Commission Report ...,\n \nJudicial Commission \nof \nInquiry into...
3,OCR version - State Capture Commission Report ...,Chairperson: Justice RtM Zondo \nActing Chief ...
4,OCR version - State Capture Commission Report ...,\n \nJudicial Commission \nof \nInquiry into...


In [8]:
# amalgamate the content of all report volumes into a list
content = zondo_reports['text'].to_list()

In [9]:
content[0][0:1000]

'dicial \nn of Ir• \nS a \nI I eport: Part 1 \n: South African Airways and its \nAssociated Companies \n\\ \\ \nChairperson: Justice RMM Zondo \nActing Chief Justice of the Republic of South Africa e report of the Judicial Commission of Inquiry into allegations of \nure, Corruption and Fraud in the Public Sector including organs of \nlso known to the public and the media as the Zondo Commission JudicialJudicial\nCommission of Commission of Inquiry into Inquiry into \nState Capture State Capture \nReport: Part 1Report: Part 1\nVol. 1:  South African Airways and its Vol. 1:  South African Airways and its \nAssociated CompaniesAssociated Companies\nChairperson: Justice RMM ZondoActing Chief Justice of the Republic of South Africa\nReport of the Judicial Commission of Report of the Judicial Commission of Inquiry into State Capture: Part 1:  Vol.  1 Inquiry into State Capture: Part 1:  Vol.  1 \nThis is the report of the Judicial Commission of Inquiry into allegations of State Capture, Corr

In [10]:
# join items in content list as a literal string, then tokenize
all_content = " ".join(content[:])
all_content_tokens = word_tokenize(all_content) 

In [11]:
# now let's lowercase the tokens
lower_content_tokens = [word.lower() for word in all_content_tokens] 
print(lower_content_tokens[0:50])

['dicial', 'n', 'of', 'ir•', 's', 'a', 'i', 'i', 'eport', ':', 'part', '1', ':', 'south', 'african', 'airways', 'and', 'its', 'associated', 'companies', '\\', '\\', 'chairperson', ':', 'justice', 'rmm', 'zondo', 'acting', 'chief', 'justice', 'of', 'the', 'republic', 'of', 'south', 'africa', 'e', 'report', 'of', 'the', 'judicial', 'commission', 'of', 'inquiry', 'into', 'allegations', 'of', 'ure', ',', 'corruption']


In [12]:
# and filter out the stops
just_tokens = ['mr','ms','dr','p','``', '\'s','’','’’','\'\'','“','”','................................','l']

removables = set(stopwords.words('English') + list(string.punctuation) + list(string.digits) + just_tokens)

filtered_tokens = [token for token in lower_content_tokens if token not in removables]

In [13]:
from nltk.probability import FreqDist

In [14]:
# a look at the frequencies
frequencies = FreqDist(filtered_tokens)

frequencies.most_common(50)

[('would', 5289),
 ('transcript', 4920),
 ('day', 4037),
 ('evidence', 3753),
 ('said', 3236),
 ('agrizzi', 3236),
 ('president', 3113),
 ('board', 3109),
 ('also', 3062),
 ('state', 2869),
 ('minister', 2802),
 ('commission', 2774),
 ('eskom', 2521),
 ('line', 2372),
 ('meeting', 2329),
 ('made', 2326),
 ('bosasa', 2288),
 ('report', 2270),
 ('affidavit', 2232),
 ('testified', 2177),
 ('contract', 2066),
 ('one', 2056),
 ('page', 2040),
 ('could', 2034),
 ('zuma', 1944),
 ('public', 1898),
 ('2015', 1771),
 ('2020', 1761),
 ('time', 1669),
 ('may', 1642),
 ('gupta', 1640),
 ('national', 1542),
 ('department', 1534),
 ('2019', 1488),
 ('exhibit', 1486),
 ('project', 1441),
 ('members', 1369),
 ('committee', 1360),
 ('however', 1339),
 ('anc', 1336),
 ('process', 1329),
 ('para', 1287),
 ('watson', 1271),
 ('investigation', 1256),
 ('terms', 1236),
 ('business', 1225),
 ('told', 1218),
 ('whether', 1170),
 ('2016', 1168),
 ('van', 1163)]

In [15]:
# we can get a candidate list of companies mentioned in the findings 
# by finding the frequencies for the first word of each company's name 

companies_only_names = []

for company in companies:
    company = company.lower()
    name_parts = company.split(sep=' ')
    companies_only_names.append(name_parts[0])
  
comp_freqs = {}

for company_name in companies_only_names:
    comp_freqs[company_name] = frequencies[company_name]

In [16]:
comp_freqs

{'4sight': 0,
 'absa': 64,
 'barclays': 2,
 'accelerate': 4,
 'accentuate': 0,
 'acsion': 0,
 'adapt': 3,
 'adcock': 0,
 'adcorp': 0,
 'adrenna': 0,
 'advanced': 59,
 'advtech': 0,
 'aeci': 0,
 'aep': 0,
 'african': 449,
 'afrimat': 0,
 'afrocentric': 0,
 'ah-vest': 0,
 'alaris': 0,
 'alert': 21,
 'alexander': 12,
 'allied': 3,
 'alviva': 0,
 'amalgamated': 8,
 'anchor': 4,
 'andulela': 0,
 'anglo': 1,
 'anglogold': 0,
 'ab': 11,
 'arb': 0,
 'arcelormittal': 2,
 'argent': 0,
 'arrowhead': 0,
 'ascendis': 0,
 'ascension': 0,
 'aspen': 0,
 'assore': 0,
 'astoria': 0,
 'astral': 0,
 'astrapak': 0,
 'atlantic': 1,
 'atlatsa': 0,
 'attacq': 0,
 'aveng': 0,
 'avi': 0,
 'avior': 0,
 'awethu': 0,
 'ayo': 0,
 'balwin': 0,
 'barloworld': 0,
 'basil': 0,
 'bauba': 0,
 'beige': 0,
 'bell': 9,
 'bhp': 2,
 'bid': 364,
 'bk': 3,
 'blue': 15,
 'bonatla': 0,
 'bowler': 1,
 'brainworks': 0,
 'brait': 0,
 'brikor': 0,
 'brimstone': 0,
 'british': 7,
 'bsi': 0,
 'buffalo': 1,
 'buildmax': 0,
 'bytes': 0,


In [65]:
# sort these by frequency descending and filter out non-mentioned companies
mentioned_companies = {k: v for k, v in comp_freqs.items() if v>0}
companies_sorted = sorted(mentioned_companies.items(), key=lambda item: item[1], reverse=True)
mentioned_companies

{'absa': 64,
 'barclays': 2,
 'accelerate': 4,
 'adapt': 3,
 'advanced': 59,
 'african': 449,
 'alert': 21,
 'alexander': 12,
 'allied': 3,
 'amalgamated': 8,
 'anchor': 4,
 'anglo': 1,
 'ab': 11,
 'arcelormittal': 2,
 'atlantic': 1,
 'bell': 9,
 'bhp': 2,
 'bid': 364,
 'bk': 3,
 'blue': 15,
 'bowler': 1,
 'british': 7,
 'buffalo': 1,
 'capital': 228,
 'cargo': 4,
 'caxton': 3,
 'central': 54,
 'chemical': 6,
 'city': 147,
 'clientele': 1,
 'combined': 17,
 'command': 17,
 'conduit': 7,
 'consolidated': 8,
 'discovery': 6,
 'distribution': 30,
 'e': 875,
 'eastern': 59,
 'echo': 5,
 'efficient': 57,
 'eoh': 108,
 'exxaro': 11,
 'firstrand': 1,
 'freedom': 27,
 'glencore': 190,
 'global': 48,
 'go': 427,
 'gold': 20,
 'grand': 3,
 'great': 80,
 'group': 814,
 'harmony': 3,
 'hospitality': 6,
 'huge': 36,
 'impala': 1,
 'imperial': 2,
 'international': 127,
 'investec': 14,
 'isa': 3,
 'jse': 3,
 'kap': 2,
 'kumba': 2,
 'lewis': 3,
 'liberty': 1,
 'life': 77,
 'london': 13,
 'marshall': 

In [66]:
# clearly some of these results may just be from normal sentences that 
# coincidentally have the same words as in a company name, 
# like 'south' in 'South Ocean Holdings Limited'. So let's see which 
# of the mentioned companies' names are also english words

nltk.download('words')
from nltk.corpus import words
setofwords = set(words.words())

companies_that_are_words = []
for key in mentioned_companies:
    if key in setofwords:
        companies_that_are_words.append(key)

companies_that_are_words

[nltk_data] Downloading package words to /Users/20790627/nltk_data...
[nltk_data]   Package words is already up-to-date!


['accelerate',
 'adapt',
 'advanced',
 'alert',
 'allied',
 'anchor',
 'atlantic',
 'bell',
 'bid',
 'blue',
 'bowler',
 'buffalo',
 'capital',
 'cargo',
 'central',
 'chemical',
 'city',
 'clientele',
 'combined',
 'command',
 'conduit',
 'consolidated',
 'discovery',
 'distribution',
 'e',
 'eastern',
 'echo',
 'efficient',
 'freedom',
 'global',
 'go',
 'gold',
 'grand',
 'great',
 'group',
 'harmony',
 'hospitality',
 'huge',
 'impala',
 'imperial',
 'international',
 'lewis',
 'liberty',
 'life',
 'mas',
 'master',
 'middle',
 'mine',
 'mix',
 'net',
 'new',
 'old',
 'pan',
 'pick',
 'premier',
 'prescient',
 'quantum',
 'rand',
 'rare',
 'resource',
 'rex',
 'royal',
 'sa',
 'south',
 'standard',
 'sun',
 'super',
 'taste',
 'total',
 'tower',
 'transaction',
 'union',
 'universal',
 'value',
 'visual',
 'w',
 'wilderness',
 'york']

In [92]:
# for each of these 'gray-area' cases, we create a list of tuples to 
# include the first two words in the companyies' names
companies_tuples_list = []
for company in companies:
    words = company.lower().split()
    # print('words:', words)
    if words[0] in companies_that_are_words:
        # print('word found:', words[0])
        companies_tuples_list.append((words[0], words[1]))

companies_tuples_list

[('accelerate', 'property'),
 ('adapt', 'it'),
 ('advanced', 'health'),
 ('alert', 'steel'),
 ('allied', 'electronics'),
 ('anchor', 'group'),
 ('atlantic', 'leaf'),
 ('bell', 'equipment'),
 ('bid', 'corp'),
 ('blue', 'financial'),
 ('blue', 'label'),
 ('bowler', 'metcalf'),
 ('buffalo', 'coal'),
 ('capital', '&'),
 ('capital', '&'),
 ('capital', 'appreciation'),
 ('cargo', 'carriers'),
 ('central', 'rand'),
 ('chemical', 'specialities'),
 ('city', 'lodge'),
 ('clientele', 'limited'),
 ('combined', 'motor'),
 ('command', 'holdings'),
 ('conduit', 'capital'),
 ('consolidated', 'infrastructure'),
 ('discovery', 'limited'),
 ('distribution', 'and'),
 ('e', 'media'),
 ('eastern', 'platinum'),
 ('echo', 'polska'),
 ('efficient', 'group'),
 ('freedom', 'property'),
 ('global', 'asset'),
 ('go', 'life'),
 ('gold', 'brands'),
 ('gold', 'fields'),
 ('grand', 'parade'),
 ('great', 'basin'),
 ('group', 'five'),
 ('harmony', 'gold'),
 ('hospitality', 'property'),
 ('huge', 'group'),
 ('impala', 'p

In [94]:
# if any of the tuples appear in the reports then we can say that the company is mentioned

bigrams = list(nltk.bigrams(filtered_tokens)) 

for tuple in companies_tuples_list:
    if tuple in bigrams:
        print('Appeared in text:', tuple)


Appeared in text: ('blue', 'label')
Appeared in text: ('city', 'lodge')
Appeared in text: ('harmony', 'gold')
Appeared in text: ('impala', 'platinum')
Appeared in text: ('middle', 'east')
Appeared in text: ('rand', 'merchant')
Appeared in text: ('royal', 'bafokeng')
Appeared in text: ('standard', 'bank')


In [None]:
# of the companies with english words as names, the above are mentioned in the report