In [2]:
import nltk
import numpy
import string
import matplotlib.pyplot as plt
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import pandas as pd

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /Users/20790627/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/20790627/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# these are the companies listed on the Johannesburg Stock Exchange
listed_companies = pd.read_excel('Complete-List-of-Listed-Companies-on-South-Africa-Johannesburg-Stock-Exchange-Jan-2021.xlsx')
listed_companies.head()

Unnamed: 0,S.No.,Company Name,Ticker,Sector
0,1,4Sight Holdings Limited,4SI,Financial Services
1,2,ABSA Bank Limited,ABSP,Banks
2,3,Barclays Africa Group Limited,ABG.ZA,Banks
3,4,Accelerate Property Fund Limited,APF,Real Estate Investment Trusts
4,5,Accentuate Limited,ACE,Chemicals


In [4]:
companies = list(listed_companies['Company Name'].unique())

In [5]:
# load the Zondo Comission Into State Capture reports
zondo_reports = pd.read_csv('zondo_reports.csv')
zondo_reports = zondo_reports.drop(columns=['Unnamed: 0'])
zondo_reports.head()

Unnamed: 0,report,text
0,OCR version - State Capture Commission Report ...,dicial \nn of Ir• \nS a \nI I eport: Part 1 \n...
1,OCR version - State Capture Commission Report ...,Judicial Commission \nOf \n Inquiry into...
2,OCR version - State Capture Commission Report ...,\n \nJudicial Commission \nof \nInquiry into...
3,OCR version - State Capture Commission Report ...,Chairperson: Justice RtM Zondo \nActing Chief ...
4,OCR version - State Capture Commission Report ...,\n \nJudicial Commission \nof \nInquiry into...


In [6]:
# amalgamate the content of all report volumes into a list
content = zondo_reports['text'].to_list()

In [7]:
content[0][0:1000]

'dicial \nn of Ir• \nS a \nI I eport: Part 1 \n: South African Airways and its \nAssociated Companies \n\\ \\ \nChairperson: Justice RMM Zondo \nActing Chief Justice of the Republic of South Africa e report of the Judicial Commission of Inquiry into allegations of \nure, Corruption and Fraud in the Public Sector including organs of \nlso known to the public and the media as the Zondo Commission JudicialJudicial\nCommission of Commission of Inquiry into Inquiry into \nState Capture State Capture \nReport: Part 1Report: Part 1\nVol. 1:  South African Airways and its Vol. 1:  South African Airways and its \nAssociated CompaniesAssociated Companies\nChairperson: Justice RMM ZondoActing Chief Justice of the Republic of South Africa\nReport of the Judicial Commission of Report of the Judicial Commission of Inquiry into State Capture: Part 1:  Vol.  1 Inquiry into State Capture: Part 1:  Vol.  1 \nThis is the report of the Judicial Commission of Inquiry into allegations of State Capture, Corr

In [8]:
# join items in content list as a literal string, then tokenize
all_content = " ".join(content[:])
all_content_tokens = word_tokenize(all_content) 

In [59]:
# and filter out the stops
just_tokens = ['mr','ms','dr','p','``', '\'s','’','’’','\'\'','“','”','................................','l']

removables = set(stopwords.words('English') + list(string.punctuation) + list(string.digits) + just_tokens)

filtered_tokens = [token for token in all_content_tokens if token not in removables]

In [62]:
# create list of company names
# remove stop words from company names
# split into single word, double word, triple words
stops = ['Holdings', 'Limited', 'Ltd', 'Plc', 'plc', 'Group', 'Company', 'Fund', 'Corporation', 'Corp', 'Investments', 'Compagnie Financiere', 'Ld', 'International', 'NV']
one_word_comps = []
two_word_comps = []
three_word_comps = []
four_word_comps = []
for name in listed_companies['Company Name']:
    for stop in stops:
        name = name.replace(stop, '')
        name = name.strip()
    if len(name.split()) == 1:
        one_word_comps.append(name)
    elif len(name.split()) == 2:
        name = tuple(name.split(' '))
        two_word_comps.append(name)
    elif len(name.split()) == 3:
        name = tuple(name.split(' '))
        three_word_comps.append(name)
    else:
        first_three_words = ' '.join(name.split()[:3])
        first_three_words = tuple(first_three_words.split(' '))
        three_word_comps.append(first_three_words)


# create bigrams and trigrams
# search for words in their relevant set
# add found companies to final list

bigrams = list(nltk.bigrams(filtered_tokens)) 
trigrams = list(nltk.trigrams(filtered_tokens)) 
final_list = []

for comp in one_word_comps:
    if comp in filtered_tokens:
        print('1 word company appeared in text:', comp)
        final_list.append(comp)

for comp in two_word_comps:
    if comp in bigrams:
        print('2 word company appeared in text:', comp)
        final_list.append(comp)

for comp in three_word_comps:
    if comp in trigrams:
        print('3 word company appeared in text:', comp)
        final_list.append(comp)

print('Final list of companies to reconsider:', final_list)


1 word company appeared in text: Bid
1 word company appeared in text: Command
1 word company appeared in text: Efficient
1 word company appeared in text: EOH
1 word company appeared in text: Firstrand
1 word company appeared in text: Glencore
1 word company appeared in text: Five
1 word company appeared in text: Imperial
1 word company appeared in text: Hotel
1 word company appeared in text: Investec
1 word company appeared in text: Investec
1 word company appeared in text: ISA
1 word company appeared in text: JSE
1 word company appeared in text: Lewis
1 word company appeared in text: Liberty
1 word company appeared in text: Moneyweb
1 word company appeared in text: Mpact
1 word company appeared in text: MultiChoice
1 word company appeared in text: Naspers
1 word company appeared in text: Nedbank
1 word company appeared in text: PPC
1 word company appeared in text: PSV
1 word company appeared in text: RMB
1 word company appeared in text: Sasfin
1 word company appeared in text: Sasol
1 

In [None]:
# other things to add: 
# maybe manually move Efficient Group and Five Group to the two_name_comps 
# create a final dataframe with the full company names
# if this isn't enough we can add frequencies