In [1]:
# required package installation
!pip install docx2txt
!pip install keybert

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting docx2txt
  Downloading docx2txt-0.8.tar.gz (2.8 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: docx2txt
  Building wheel for docx2txt (setup.py) ... [?25l[?25hdone
  Created wheel for docx2txt: filename=docx2txt-0.8-py3-none-any.whl size=3980 sha256=0f8183288353ae321c6da617d084312247154af3fb026a7af5a23eece43628d0
  Stored in directory: /root/.cache/pip/wheels/55/f0/2c/81637d42670985178b77df6d41b9b6c6dc18c94818447414b9
Successfully built docx2txt
Installing collected packages: docx2txt
Successfully installed docx2txt-0.8
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting keybert
  Downloading keybert-0.7.0.tar.gz (21 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentence-transformers>=0.3.8
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)


In [2]:
# This cell imports all required packages
"""NLTK (Natural Language Toolkit) is the go-to API for NLP (Natural Language Processing) 
with Python. It is a really powerful tool to preprocess text data for further analysis 
like with ML models for instance. It helps convert text into numbers, which the model can 
then easily work with."""

"""docx2txt is a pure Python-based utility to extract text and images from docx files."""

"""KeyBERT is a minimal and easy-to-use keyword extraction technique that 
leverages BERT embeddings to create keywords and keyphrases that are most similar to a document"""

import nltk
from nltk.corpus import stopwords
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import docx2txt
from keybert import KeyBERT
from collections import Counter
from operator import itemgetter
import math
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import zipfile
import math

In [3]:
# Mount Google Drive at '/content/drive' so the zipped dataset stored there can be read
from google.colab import drive
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [4]:
# Create the "extracted_file" directory that the archive is unpacked into.
# `exist_ok=True` keeps this cell re-runnable (a plain `mkdir` fails once the
# directory exists), and the absolute path matches the one the following cells use.
os.makedirs('/content/extracted_file', exist_ok=True)

In [5]:
# Unpack every member of the zipped dataset from Drive into '/content/extracted_file'
with zipfile.ZipFile("/content/drive/MyDrive/Downloads.zip", mode="r") as archive:
    archive.extractall(path="/content/extracted_file")

In [6]:
# List the names of all entries in '/content/extracted_file', i.e. the directory
# the archive was extracted to in the previous cell (not a path under /content/drive).
extracted_file = os.listdir('/content/extracted_file')

In [7]:
# Display the extracted file names
extracted_file

['Dynamics 365 FDD_Cust-18_Print Binning tickets v0.3.docx',
 'Dynamics 365 FDD_Cust-14 Packing slip enhancements V0.2.docx',
 'Dynamics 365 FDD_Cust-02 Customer enhancements_v1.2 (2).docx',
 'Dynamics 365 FDD_Cust-01 Product enhancements_v1.6 (1).docx',
 'Dynamics 365 FDD_Cust-36 Nota Fiscal on D365 Invoice v1.2.docx',
 'Dynamics 365 FDD_Cust_35 Print a shipment label for the master box (1).docx',
 'Dynamics 365 FDD_Cust-11 Return Orders v2.1 (1).docx',
 'Dynamics 365 FDD_Cust-03 Vendor enhancements_v1.1.docx',
 'Dynamics 365 FDD_Cust-25 Packing slip Enhancements v1.6.docx',
 'Dynamics 365 FDD_Cust-05 v0.2 (1).docx',
 'Dynamics 365 FDD_Cust-07_Availability Check v1.1.docx',
 'Dynamics 365 FDD_Cust_35B Picking tickets label (1).docx',
 'Dynamics 365 FDD_Cust-09 Order hold on line level V1.1.docx']

In [8]:
# Read the first Word file of the list.
# docx2txt returns regular text, listed items, hyperlink text and table text
# together in one single string.
first_docx_path = os.path.join("/content/extracted_file", extracted_file[0])
result = docx2txt.process(first_docx_path)

# Report how many characters were extracted from that document
print('the length of returned text, listed items,... for the first .docx file in the list',
      len(result))

the length of returned text, listed items,... for the first .docx file in the list 27558


In [9]:
"""The stopwords are a list of words that are very very common but don’t provide 
useful information for most text analysis procedures."""

# Bring the NLTK names used below into scope, then fetch the stop-word corpus
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')

# English stop words, stored as a set for membership tests during cleaning
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [10]:
""" This function Remove punctuations, links, stopwords, mentions and \r\n new line characters """

def strip_all_entities(tweet_text, stopword_set=None, max_word_len=14):
    """Lowercase a text and strip line breaks, links, mentions, punctuation and stop words.

    Args:
        tweet_text: Raw text to clean.
        stopword_set: Collection of lowercase words to discard. When omitted,
            the notebook-level ``stop_words`` set is used.
        max_word_len: Exclusive length limit. Words with this many characters
            or more are discarded; with the default of 14, words of up to
            13 characters are kept.

    Returns:
        The remaining words joined by single spaces.
    """
    if stopword_set is None:
        # Looked up only when needed, so callers may pass their own set instead
        stopword_set = stop_words

    # Drop carriage returns, turn newlines into spaces, and lowercase
    tweet_text = tweet_text.replace('\r', '').replace('\n', ' ').lower()
    # Remove @mentions and http(s) links, each up to the next whitespace
    tweet_text = re.sub(r"(?:\@|https?\://)\S+", "", tweet_text)
    # Remove every character outside the 7-bit ASCII range
    tweet_text = re.sub(r'[^\x00-\x7f]', r'', tweet_text)
    # Delete all characters listed in string.punctuation
    tweet_text = tweet_text.translate(str.maketrans('', '', string.punctuation))
    # Single pass: discard stop words and over-long words
    kept_words = [
        word
        for word in tweet_text.split()
        if word not in stopword_set and len(word) < max_word_len
    ]
    return ' '.join(kept_words)

""" This function removes contractions"""
def decontract(tweet_text):
    """Expand common English contractions (e.g. "can't" -> "can not").

    Args:
        tweet_text: Text that may contain contractions.

    Returns:
        The text with contractions expanded. Replacement words are lowercase.

    Notes:
        * "'s" is always expanded to " is", so possessives are expanded too
          ("customer's" -> "customer is").
        * Matching ignores case, because this function is applied before the
          text is lowercased.
    """
    # Word documents commonly use the typographic apostrophe (U+2019);
    # normalise it so the patterns below match it as well.
    tweet_text = tweet_text.replace("\u2019", "'")

    # Order matters: the irregular forms must be handled before the generic "n't"
    rules = (
        (r"\bwon't", "will not"),
        (r"\bcan't", "can not"),
        (r"n't", " not"),
        (r"'re", " are"),
        (r"'s", " is"),
        (r"'d", " would"),
        (r"'ll", " will"),
        (r"'t", " not"),
        (r"'ve", " have"),
        (r"'m", " am"),
    )
    for pattern, replacement in rules:
        tweet_text = re.sub(pattern, replacement, tweet_text, flags=re.IGNORECASE)
    return tweet_text

""" This function blanks out every space-separated word that contains the special characters "&" or "$" (the whole word is replaced by an empty string). """
def filter_chars(a):
    """Blank out every space-separated token of ``a`` that contains "$" or "&".

    Matching tokens are replaced by empty strings rather than removed, so the
    separators around them are kept and the result may contain consecutive spaces.
    """
    tokens = a.split(' ')
    kept = ['' if ('$' in token or '&' in token) else token for token in tokens]
    return ' '.join(kept)

""" This function Remove multiple sequential spaces """
def remove_mult_spaces(tweet_text):
    """Collapse every run of two or more whitespace characters into one space.

    A lone whitespace character (for example a single tab or newline) is left unchanged.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape sequence
    return re.sub(r"\s\s+", " ", tweet_text)

""" This function apply all aforementioned functions to the text """
def deep_clean(tweet_text):
    """Run the full cleaning pipeline on a text and return the cleaned string.

    The steps are applied in this order: decontract, strip_all_entities,
    filter_chars, remove_mult_spaces.
    """
    # NOTE(review): strip_all_entities deletes all of string.punctuation, which
    # includes "$" and "&", so filter_chars appears to have nothing left to filter
    # at this position - confirm whether it was meant to run earlier.
    for step in (decontract, strip_all_entities, filter_chars, remove_mult_spaces):
        tweet_text = step(tweet_text)
    return tweet_text

In [11]:
# text preprocessing: run the deep_clean pipeline on the raw text of the first document
texts_clean = deep_clean(result)

In [12]:
# Create a KeyBERT instance and name the embedding model to use, here 'all-mpnet-base-v2'
kw_model = KeyBERT(model='all-mpnet-base-v2')

# extract_keywords takes the text plus several parameters; the ones used here are:
#   keyphrase_ngram_range : range of token counts considered for a candidate keyphrase
#   stop_words            : stop-word list applied to the candidates
#   use_maxsum            : Max Sum Similarity; set to True (together with an integer
#                           nr_candidates) to diversify the output
#   nr_candidates         : number of candidates used by Max Sum Similarity
#   top_n                 : number of keyphrases we would like to receive
keywords_with_diversity = kw_model.extract_keywords(
    texts_clean,
    top_n=10,
    nr_candidates=20,
    use_maxsum=True,
    stop_words='english',
    keyphrase_ngram_range=(3, 3),
)

# The output is a list of (keyphrase, score) tuples: the first item is the keyphrase
# string and the second is a score between 0 and 1 that reflects the model's
# certainty, with higher values being more certain.
print(keywords_with_diversity)

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

[('business process process', 0.4933), ('summary functional design', 0.5019), ('work template v2', 0.5028), ('warehouse work template', 0.5079), ('fdd custom work', 0.5127), ('design document mcs', 0.5131), ('executive summary functional', 0.5326), ('intended functionality developed', 0.5327), ('implement requirements identified', 0.5414), ('design document cust18', 0.5624)]


In [13]:
# Display the diverse keyphrases extracted in the previous cell.
# (The name was misspelled as `keywords_with_diersity`, which is never defined
# at notebook level and raises NameError on a fresh kernel.)
keywords_with_diversity

[('business process process', 0.4933),
 ('summary functional design', 0.5019),
 ('work template v2', 0.5028),
 ('warehouse work template', 0.5079),
 ('fdd custom work', 0.5127),
 ('design document mcs', 0.5131),
 ('executive summary functional', 0.5326),
 ('intended functionality developed', 0.5327),
 ('implement requirements identified', 0.5414),
 ('design document cust18', 0.5624)]

In [14]:
# Extract the ten best single-word keywords; no stop-word list is applied here
Simple_keywords = kw_model.extract_keywords(
    texts_clean,
    top_n=10,
    stop_words=None,
    keyphrase_ngram_range=(1, 1),
)
Simple_keywords

[('design', 0.4019),
 ('implement', 0.4004),
 ('document', 0.3654),
 ('ssrs', 0.3638),
 ('development', 0.3565),
 ('management', 0.3387),
 ('customisation', 0.3363),
 ('ssis', 0.3362),
 ('organization', 0.3355),
 ('documents', 0.335)]

In [15]:
""" This function generate Simple and Diverse keywords and save the results in extrated_keyword.csv file """


def Simple_diverse_keywords_generation(file_path, output_csv='extrated_keyword.csv'):
    """Extract simple and diverse keywords for every .docx file in a folder and save them as CSV.

    Args:
        file_path: Directory that contains the .docx files to process.
        output_csv: Path of the CSV file to write. Defaults to
            'extrated_keyword.csv' in the working directory.

    Returns:
        None. The CSV holds one row per document with the columns 'File Name',
        'Diversed_Key_words' and 'Simple_Key_words'; each keyword cell contains
        the list of (keyword, score) tuples returned by KeyBERT.
    """
    # Read the folder that was actually passed in instead of relying on the
    # notebook-level `extracted_file` list. Entries that are not .docx files are
    # skipped, and names are sorted so the row order is reproducible.
    docx_names = sorted(
        name for name in os.listdir(file_path) if name.lower().endswith('.docx')
    )

    # Create the model once; building it inside the loop reloads it for every file.
    # No model name is passed, so KeyBERT's default embedding model is used.
    kw_model = KeyBERT()

    keywords_with_simplicity = []
    keywords_with_diversity = []

    for file_name in docx_names:
        raw_text = docx2txt.process(os.path.join(file_path, file_name))
        texts_clean = deep_clean(raw_text)

        # Single-word keywords, no stop-word list
        simple_keywords = kw_model.extract_keywords(
            texts_clean, keyphrase_ngram_range=(1, 1), stop_words=None, top_n=10)
        # Three-word keyphrases, diversified with Max Sum Similarity
        diverse_keywords = kw_model.extract_keywords(
            texts_clean, keyphrase_ngram_range=(3, 3), stop_words='english',
            use_maxsum=True, nr_candidates=20, top_n=10)

        keywords_with_simplicity.append(simple_keywords)
        keywords_with_diversity.append(diverse_keywords)

    # One row per processed document
    df = pd.DataFrame({
        'File Name': docx_names,
        'Diversed_Key_words': keywords_with_diversity,
        'Simple_Key_words': keywords_with_simplicity,
    })

    # Save the table as CSV
    df.to_csv(output_csv)
    print('DataFrame is written to Excel File successfully.')

In [16]:
# Run the keyword extraction defined above on the extraction folder;
# the function writes its results to 'extrated_keyword.csv'.
Simple_diverse_keywords_generation(file_path='/content/extracted_file')

Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

DataFrame is written to Excel File successfully.
