In [65]:
# Imports
import re
import pdfplumber
import nltk
from nltk.tree import Tree
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('treebank')

[nltk_data] Downloading package punkt_tab to /Users/mj/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/mj/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mj/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/mj/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     /Users/mj/nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to /Users/mj/nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package punkt to /Users/mj/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_p

True

### 1. Extract all texts from the given pdf file.

In [16]:
riskpdf = "/Users/mj/Desktop/DSCI 314 (Text Mining)/CAS_ERM_overview.pdf"

# Open and read the PDF
with pdfplumber.open(riskpdf) as pdf:
    output = []
    for page in pdf.pages:
        # Extract text from each page
        output.append(page.extract_text())
        
# Concatenate all text
alltexts = ' '.join(output)

#Lower case the string
alltexts = alltexts.lower()
# Remove \n from the texts
alltexts = re.sub(r'\n', '', alltexts)
# Remove punctuation from the texts
alltexts = re.sub(r'[^\w\s]','',alltexts)
# Remove number from texts
num = r'[0-9]'
# Match all digits in the string and replace them by empty string
alltexts = re.sub(num , '', alltexts)

### 2. Extract all the tokens from the texts.

In [17]:
from nltk.tokenize import word_tokenize

# Tokenize by splitting into words
tokens = word_tokenize(alltexts)
print(tokens[:100])

['overview', 'of', 'enterprise', 'risk', 'managementcasualty', 'actuarial', 'societyenterprise', 'risk', 'management', 'committeemay', 'overview', 'of', 'enterprise', 'risk', 'managementtable', 'of', 'contentspagei', 'executive', 'summary', 'ii', 'the', 'erm', 'evolution', 'iii', 'erm', 'definition', 'and', 'conceptual', 'framework', 'iv', 'erm', 'language', 'measures', 'models', 'and', 'tools', 'v', 'erm', 'case', 'studies', 'vi', 'practical', 'considerations', 'in', 'implementing', 'erm', 'appendicesa', 'riskrelated', 'regulatory', 'rating', 'agency', 'and', 'corporate', 'governanceguidelines', 'and', 'requirements', 'b', 'a', 'continuum', 'of', 'risk', 'modeling', 'methods', 'c', 'erm', 'bibliography', 'overview', 'of', 'enterprise', 'risk', 'managementi', 'executive', 'summarythis', 'document', 'is', 'intended', 'primarily', 'to', 'further', 'the', 'risk', 'management', 'education', 'ofcandidates', 'for', 'membership', 'in', 'the', 'casualty', 'actuarial', 'society', 'cas', 'curren

### 3. Perform Stemming on the texts.

In [18]:
# Stemming various words related to management
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
words = ["managementi", "management", "managementtable", "managementcasualty"]
for stemmedword in words:
    print(stemmedword, ": ", stemmer.stem(stemmedword))

managementi :  managementi
management :  manag
managementtable :  managementt
managementcasualty :  managementcasualti


### 4. Perform Lemmatization on the texts.

In [24]:
# Lemmatize various words related to management
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
words = ["managementi", "management", "managementtable", "managementcasualty"]
for lemword in words:
    print(lemword, ": ", lemmatizer.lemmatize(lemword))

managementi :  managementi
management :  management
managementtable :  managementtable
managementcasualty :  managementcasualty


### 5. Remove all the default stop words in NLTK from the texts.

In [29]:
# Import stopwords
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# Remove stopwords
without_stopword = [w for w in tokens if not w in stop_words]
print(without_stopword[:100])

['overview', 'enterprise', 'risk', 'managementcasualty', 'actuarial', 'societyenterprise', 'risk', 'management', 'committeemay', 'overview', 'enterprise', 'risk', 'managementtable', 'contentspagei', 'executive', 'summary', 'ii', 'erm', 'evolution', 'iii', 'erm', 'definition', 'conceptual', 'framework', 'iv', 'erm', 'language', 'measures', 'models', 'tools', 'v', 'erm', 'case', 'studies', 'vi', 'practical', 'considerations', 'implementing', 'erm', 'appendicesa', 'riskrelated', 'regulatory', 'rating', 'agency', 'corporate', 'governanceguidelines', 'requirements', 'b', 'continuum', 'risk', 'modeling', 'methods', 'c', 'erm', 'bibliography', 'overview', 'enterprise', 'risk', 'managementi', 'executive', 'summarythis', 'document', 'intended', 'primarily', 'risk', 'management', 'education', 'ofcandidates', 'membership', 'casualty', 'actuarial', 'society', 'cas', 'current', 'members', 'ofthe', 'cas', 'well', 'risk', 'management', 'professionals', 'also', 'find', 'material', 'ofinterestin', 'cha

### 6. Customize the stop words in NLTK by adding "language" and "processing" to the stop words and removing "most" from the default stop words.

In [36]:
# Add words "language" and "processing" to to stop words
add_stopwords = ['language','processing']
new_stopwords_list = stop_words + add_stopwords

# Removing "most" from stopwords
delete_stopwords = {'most'} 
custom_stop_words = set([word for word in new_stopwords_list if word not in delete_stopwords])

# Print without new stop words
without_custom_stopword = [w for w in tokens if not w in custom_stop_words]
print(without_custom_stopword[:100])

['overview', 'enterprise', 'risk', 'managementcasualty', 'actuarial', 'societyenterprise', 'risk', 'management', 'committeemay', 'overview', 'enterprise', 'risk', 'managementtable', 'contentspagei', 'executive', 'summary', 'ii', 'erm', 'evolution', 'iii', 'erm', 'definition', 'conceptual', 'framework', 'iv', 'erm', 'measures', 'models', 'tools', 'v', 'erm', 'case', 'studies', 'vi', 'practical', 'considerations', 'implementing', 'erm', 'appendicesa', 'riskrelated', 'regulatory', 'rating', 'agency', 'corporate', 'governanceguidelines', 'requirements', 'b', 'continuum', 'risk', 'modeling', 'methods', 'c', 'erm', 'bibliography', 'overview', 'enterprise', 'risk', 'managementi', 'executive', 'summarythis', 'document', 'intended', 'primarily', 'risk', 'management', 'education', 'ofcandidates', 'membership', 'casualty', 'actuarial', 'society', 'cas', 'current', 'members', 'ofthe', 'cas', 'well', 'risk', 'management', 'professionals', 'also', 'find', 'material', 'ofinterestin', 'chapter', 'ii',

### 7. Perform the part of speech tagging for the texts.

In [40]:
# Speech tagging the text and printing the first 100 words
pos = nltk.pos_tag(tokens)
print(pos[:100])

[('overview', 'NN'), ('of', 'IN'), ('enterprise', 'NN'), ('risk', 'NN'), ('managementcasualty', 'NN'), ('actuarial', 'JJ'), ('societyenterprise', 'NN'), ('risk', 'NN'), ('management', 'NN'), ('committeemay', 'NN'), ('overview', 'NN'), ('of', 'IN'), ('enterprise', 'NN'), ('risk', 'NN'), ('managementtable', 'NN'), ('of', 'IN'), ('contentspagei', 'JJ'), ('executive', 'NN'), ('summary', 'NN'), ('ii', 'VBD'), ('the', 'DT'), ('erm', 'JJ'), ('evolution', 'NN'), ('iii', 'NN'), ('erm', 'JJ'), ('definition', 'NN'), ('and', 'CC'), ('conceptual', 'JJ'), ('framework', 'NN'), ('iv', 'JJ'), ('erm', 'JJ'), ('language', 'NN'), ('measures', 'NNS'), ('models', 'NNS'), ('and', 'CC'), ('tools', 'NNS'), ('v', 'VBP'), ('erm', 'JJ'), ('case', 'NN'), ('studies', 'NNS'), ('vi', 'VBP'), ('practical', 'JJ'), ('considerations', 'NNS'), ('in', 'IN'), ('implementing', 'VBG'), ('erm', 'JJ'), ('appendicesa', 'NN'), ('riskrelated', 'VBD'), ('regulatory', 'JJ'), ('rating', 'NN'), ('agency', 'NN'), ('and', 'CC'), ('corpo

### 8. Perform the named entities recognization for the texts.

In [67]:
chunks = nltk.ne_chunk(pos, binary = True)
for chunk in chunks:
   if hasattr(chunk, 'label'):
       print(' '.join(c[0] for c in chunk), chunk.label())