In [31]:
import PyPDF2
import string
import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import re

# Loading and Preprocessing the Corpus

## Extracting the Data

In [16]:
def extract_pdf(file_path,from_page_number,to_page_number):
    extracted_data = ""
    pdf = PyPDF2.PdfReader(file_path)
    for i in range(from_page_number,to_page_number):
        page = pdf.pages[i]
        extracted_data += page.extract_text()
    return extracted_data

In [32]:
extracted_data_pmbok_6 = extract_pdf("../input/PMBOK6-2017.pdf",308,355)
extracted_data_PMI = extract_pdf("../input/practice-standard-project-risk-management.pdf",12,123)

In [21]:
print(extracted_data_PMI)

1©2009 Project Management Institute. Practice Standard for Project Risk Management1
  CHAPTER 1 
 INTRODUCTION 
 Project Management Institute (PMI) practice standards are guides to the use of a tool, technique, or process 
identiﬁ  ed in  A Guide to the Project Management Body of Knowledge   ( PMBOK   ®  Guide  – Fourth Edition) or 
other PMI standards. Practice standards are targeted at audiences who participate in the management of 
projects. This includes project managers, project personnel, contract personnel, supervisors, and other project stakeholders. 
 A PMI practice standard describes processes, activities, inputs, and outputs for a speciﬁ  c Knowledge Area. 
It provides information on what the signiﬁ  cant process, tool, or technique is, what it does, why it is signiﬁ  cant, 
when it should be performed or executed, and, if necessary for further clariﬁ  cation, who should perform the process. A practice standard does not prescribe how the process is to be implemented, leaving

# Cleaning PMI

In [33]:
def cleaning_pmi(pmi_corpus,tokenize=False):
    # Define a list of patterns and substitutions
    patterns_substitutions = [
        (r'ﬁ\s\s', 'fi'),  # Replace 'fi' with 'fi'
        (r'\d©2009 Project Management Institute\. Practice Standard for Project Risk Management\d', ''),  # Remove footer
        (r'©2009 Project Management Institute. Practice Standard for Project Risk Management', ''),  # Remove the copyright notice
        (r'[^a-zA-Z0-9 .]+', ''),  # Remove punctuation and unwanted characters
        (r'\s+', ' ') #remove extra spaces
    ]

    # Apply all patterns and substitutions in sequence
    for pattern, substitution in patterns_substitutions:

        pmi_corpus = re.sub(pattern, substitution, pmi_corpus)
    if tokenize :
        tokens = word_tokenize(pmi_corpus)

        # Stopword removal
        stopwords_set = set(stopwords.words('english'))
        filtered_tokens = [token for token in tokens if token not in stopwords_set]
    

        return filtered_tokens
    return pmi_corpus


In [34]:
print(cleaning_pmi(extracted_data_PMI,tokenize=True))
tokenized_pmi = cleaning_pmi(extracted_data_PMI,tokenize=True)



In [35]:
import spacy

# Load spaCy's English NLP model
nlp = spacy.load("en_core_web_sm")

text = " ".join(tokenized_pmi)

doc = nlp(text)

# Access POS tags for each token in the Doc
for token in doc:
    print(f"{token.text}: {token.pos_}")

CHAPTER: NOUN
1: NUM
INTRODUCTION: NOUN
Project: PROPN
Management: PROPN
Institute: PROPN
PMI: PROPN
practice: NOUN
standards: NOUN
guides: NOUN
use: VERB
tool: NOUN
technique: NOUN
process: NOUN
identified: VERB
A: DET
Guide: PROPN
Project: PROPN
Management: PROPN
Body: PROPN
Knowledge: PROPN
PMBOK: PROPN
Guide: PROPN
Fourth: PROPN
Edition: PROPN
PMI: PROPN
standards: NOUN
.: PUNCT
Practice: NOUN
standards: NOUN
targeted: VERB
audiences: NOUN
participate: VERB
management: NOUN
projects: NOUN
.: PUNCT
This: PRON
includes: VERB
project: NOUN
managers: NOUN
project: NOUN
personnel: NOUN
contract: NOUN
personnel: NOUN
supervisors: NOUN
project: VERB
stakeholders: NOUN
.: PUNCT
A: DET
PMI: PROPN
practice: NOUN
standard: NOUN
describes: NOUN
processes: VERB
activities: NOUN
inputs: VERB
outputs: NOUN
specific: ADJ
Knowledge: PROPN
Area: PROPN
.: PUNCT
It: PRON
provides: VERB
information: NOUN
significant: ADJ
process: NOUN
tool: NOUN
technique: NOUN
significant: ADJ
performed: VERB
executed

# Cleaning PMBOK

## Checking whole extracted data

In [34]:
print(extracted_data_pmbok_6)

Not For Distribution, Sale or Reproduction.
273Figure 8-2. Major Project Quality Management Process Interrelations
KEY C ONCEPTS FOR P ROJECT QUALI TY M ANAGEMENT
Project Quality Management addresses the management of the project and the deliverables of the project. It 
applies to all projects, regardless of the nature of their deliverables. Quality measures and techniques are speciﬁc 
to the type of deliverables being produced by the project. For example, the project quality management of software deliverables may use different approaches and measures from those used when building a nuclear power plant. In either case, failure to meet the quality requirements can have serious negative consequences for any or all of the project’s stakeholders. For example:
uuMeeting customer requirements by overworking the project team may result in decreased proﬁts and increased levels of overall project risks, employee attrition, errors, or rework.
uuMeeting project schedule objectives by rushing pla

## Removing Watermark

In [35]:
result_pmbok = extracted_data_pmbok_6.replace('Not For Distribution, Sale or Reproduction.','')

## Removing Footer

In [36]:
pattern = r'\d{3} Part 1 - Guide'
result_pmbok = re.sub(pattern, '', result_pmbok)

In [37]:
print(result_pmbok)


273Figure 8-2. Major Project Quality Management Process Interrelations
KEY C ONCEPTS FOR P ROJECT QUALI TY M ANAGEMENT
Project Quality Management addresses the management of the project and the deliverables of the project. It 
applies to all projects, regardless of the nature of their deliverables. Quality measures and techniques are speciﬁc 
to the type of deliverables being produced by the project. For example, the project quality management of software deliverables may use different approaches and measures from those used when building a nuclear power plant. In either case, failure to meet the quality requirements can have serious negative consequences for any or all of the project’s stakeholders. For example:
uuMeeting customer requirements by overworking the project team may result in decreased proﬁts and increased levels of overall project risks, employee attrition, errors, or rework.
uuMeeting project schedule objectives by rushing planned quality inspections may result in unde

## Remove ba9lewa thats represented as 'uu'

In [38]:
result_pmbok = result_pmbok.replace('uu','')

In [39]:
print(result_pmbok)


273Figure 8-2. Major Project Quality Management Process Interrelations
KEY C ONCEPTS FOR P ROJECT QUALI TY M ANAGEMENT
Project Quality Management addresses the management of the project and the deliverables of the project. It 
applies to all projects, regardless of the nature of their deliverables. Quality measures and techniques are speciﬁc 
to the type of deliverables being produced by the project. For example, the project quality management of software deliverables may use different approaches and measures from those used when building a nuclear power plant. In either case, failure to meet the quality requirements can have serious negative consequences for any or all of the project’s stakeholders. For example:
Meeting customer requirements by overworking the project team may result in decreased proﬁts and increased levels of overall project risks, employee attrition, errors, or rework.
Meeting project schedule objectives by rushing planned quality inspections may result in undetect

## Removing Mouraba3 that's represented as 'u n'

In [24]:
result_pmbok = result_pmbok.replace('u n','')
print(result_pmbok)


273Figure 8-2. Major Project Quality Management Process Interrelations
KEY C ONCEPTS FOR P ROJECT QUALI TY M ANAGEMENT
Project Quality Management addresses the management of the project and the deliverables of the project. It 
applies to all projects, regardless of the nature of their deliverables. Quality measures and techniques are speciﬁc 
to the type of deliverables being produced by the project. For example, the project quality management of software deliverables may use different approaches and measures from those used when building a nuclear power plant. In either case, failure to meet the quality requirements can have serious negative consequences for any or all of the project’s stakeholders. For example:
Meeting customer requirements by overworking the project team may result in decreased proﬁts and increased levels of overall project risks, employee attrition, errors, or rework.
Meeting project schedule objectives by rushing planned quality inspections may result in undetect

In [5]:
khtout = lambda: print(''.join([i for i in '-'*200]))
glimpse_of_text = lambda text: print('\n'.join(text.split('\n')[1:10]))

In [6]:
print('extracted_data_pmbok_6 before removing punctuation : ')
khtout()
glimpse_of_text(extracted_data_pmbok_6)
khtout()
print('extracted_data_pmbok_6 after removing punctuation : ')
khtout()
extracted_data_pmbok_6 = remove_punctuation(extracted_data_pmbok_6)
glimpse_of_text(extracted_data_pmbok_6)

extracted_data_pmbok_6 before removing punctuation : 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
274 Part 1 - GuideQuality and grade are not the same concepts. Quality as a delivered performance or result is “the degree to which a 
set of inherent characteristics fulﬁll requirements” (ISO 9000 [18].). Grade as a design intent is a category assigned to 
deliverables having the same functional use but different technical characteristics. The project manager and the project management team are responsible for managing the trade-offs associated with delivering the required levels of both quality and grade. While a quality level that fails to meet quality requirements is always a problem, a low-grade product may not be a problem. For example:
uuIt may not be a problem if a suitable low-grade product (one with a limited number of featur

In [7]:
print('extracted_data_PMI before removing punctuation : ')
khtout()
glimpse_of_text(extracted_data_PMI)
khtout()
print('extracted_data_PMI after removing punctuation : ')
khtout()
extracted_data_PMI = remove_punctuation(extracted_data_PMI)
glimpse_of_text(extracted_data_PMI)

extracted_data_PMI before removing punctuation : 
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 1.2 Project Risk Management Deﬁ  nition 
 The deﬁ  nition of Project Risk Management, as deﬁ  ned in the  PMBOK   ®  Guide  – Fourth Edition, is the basis 
for this practice standard: “Project Risk Management includes the processes concerned with conducting 
risk management planning, identiﬁ  cation, analysis, responses, and monitoring and control on a project.” The  PMBOK   
®  Guide –  Fourth Edition also states: “The objectives of Project Risk Management are to increase the 
probability and impact of positive events, and decrease the probability and impact of negative events in the 
project.” In the  PMBOK   ®  Guide  – Fourth Edition, “project risk is an uncertain event or condition that, if it occurs, 
has a positive or negative effe

## Lower Case

In [8]:
extracted_data_pmbok_6 = extracted_data_pmbok_6.lower()
extracted_data_PMI = extracted_data_PMI.lower()
glimpse_of_text(extracted_data_pmbok_6)
khtout()
glimpse_of_text(extracted_data_PMI)

274 part 1  guidequality and grade are not the same concepts quality as a delivered performance or result is “the degree to which a 
set of inherent characteristics fulﬁll requirements” iso 9000 18 grade as a design intent is a category assigned to 
deliverables having the same functional use but different technical characteristics the project manager and the project management team are responsible for managing the tradeoffs associated with delivering the required levels of both quality and grade while a quality level that fails to meet quality requirements is always a problem a lowgrade product may not be a problem for example
uuit may not be a problem if a suitable lowgrade product one with a limited number of features is of high quality no obvious defects in this example the product would be appropriate for its general purpose of use
uuit may be a problem if a highgrade product one with numerous features is of low quality many defects in essence a highgrade feature set would prove i

In [9]:
stopwords = nltk.corpus.stopwords.words('english')
stopwords[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

## Removing Stop words

In [10]:
def remove_stopwords(text):
    return " ".join([i for i in text.split() if i not in stopwords])

## Stemming 
* a text processing technique used in natural language processing (NLP) to reduce words to their root or base form. It involves removing suffixes from words to find a common form. The idea is to simplify words so that variations of the same word are treated as the same word.

Example:

Word: "Jumping"
Stemmed Form: "Jump"

In [11]:
porter_stemmer = PorterStemmer()
def stemming(text):
    return " ".join([porter_stemmer.stem(i) for i in text.split()])

## Lemmitization
* a text processing technique used in NLP, but it goes a step further than stemming. It reduces words to their base or dictionary form, known as the lemma. Lemmatization considers the context and part of speech of a word to provide a more accurate base form.

Example:

Word: "Better"
Lemma: "Good"

In [15]:
wordnet_lemmatizer = WordNetLemmatizer()
def lemmatization(text):
    return " ".join([wordnet_lemmatizer.lemmatize(i) for i in text.split()])

In [12]:
extracted_data_pmbok_6 = remove_stopwords(extracted_data_pmbok_6)
extracted_data_PMI = remove_stopwords(extracted_data_PMI)

In [13]:
extracted_data_pmbok_6 = stemming(extracted_data_pmbok_6)
extracted_data_PMI = stemming(extracted_data_PMI)

In [16]:
extracted_data_pmbok_6 = lemmatization(extracted_data_pmbok_6)
extracted_data_PMI = lemmatization(extracted_data_PMI)