# Reading MS Word document

We read the document and create a .csv file with the list of paragaphs and their lengths

In [1]:
docTitle = "22104-j10.docx"

## Solution 2: docx

This is a library that understands MS Word styles

In [2]:
# !pip install -q docx2python
from docx2python import docx2python
from transformers import pipeline, BartForConditionalGeneration, BartTokenizer, PegasusTokenizer, PegasusForConditionalGeneration


2024-02-28 13:40:03.959412: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# extract docx content
doc_result = docx2python(docTitle, paragraph_styles = True, html=True)

In [4]:
print(doc_result)

DocxContent(docx_reader=DocxReader(), docx2python_kwargs={'docx_filename': '22104-j10.docx', 'image_folder': None, 'html': True, 'paragraph_styles': True, 'extract_image': None, 'duplicate_merged_cells': False, 'docx_context': DocxReader()})


In [5]:
strSectionTitle = ""
dictSections = {}
listLatency = []
listLatencyLines = []
for oneLine in doc_result.text.split('\n'):
  if "<h" in oneLine:
    strSectionTitle = oneLine
    dictSections[strSectionTitle] = []

  if strSectionTitle != "":  
    dictSections[strSectionTitle].append(oneLine)

  if "latency" in oneLine: 
    listLatency.append(strSectionTitle)
    listLatencyLines.append(oneLine)

In [6]:
for title in listLatency:
  if title in dictSections.keys():
    print(dictSections[title])

['<h1>', '1\tScope</h1>', '', 'The present document provides Stage 1 normative service requirements for 5G systems, in particular service requirements for cyber-physical control applications in vertical domains and requirements for auxiliary applications. In the context of the present document, cyber-physical systems are to be understood as systems that include engineered, interacting networks of physical and computational components; control applications are to be understood as applications that control physical processes. Examples for auxiliary applications are distributed sensing and asset monitoring.', '', 'Communication services supporting cyber-physical control applications need to be ultra-reliable and, in some cases, the end-to-end latency must be very low. Communication for cyber-physical control applications supports operation in various vertical domains, for instance industrial automation, Smart Grid . ', '', 'The aspects addressed in the present document include:', '', '-\t

In [7]:
model_name_bart = "knkarthick/MEETING-SUMMARY-BART-LARGE-XSUM-SAMSUM-DIALOGSUM-AMI"
tokenizer_bart = BartTokenizer.from_pretrained(model_name_bart)
model_bart = BartForConditionalGeneration.from_pretrained(model_name_bart).to('cuda')

def getSummaryBART(line):

    inputs = tokenizer_bart.encode("summarize: " + line, return_tensors="pt", max_length=1024, truncation=True).to('cuda')
    summary_ids = model_bart.generate(inputs, max_length=150, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer_bart.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [8]:
model_name_pegassus = "google/pegasus-xsum"
tokenizer_pegassus = PegasusTokenizer.from_pretrained(model_name_pegassus)
model_pegassus = PegasusForConditionalGeneration.from_pretrained(model_name_pegassus).to('cuda:0')

def getSummaryPegasus(line):
    inputs = tokenizer_pegassus.encode("summarize: " + line, return_tensors="pt", max_length=1024, truncation=True).to('cuda:0')
    summary_ids = model_pegassus.generate(inputs, max_length=150, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer_pegassus.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [9]:
model_name_fb = "nickmuchi/fb-bart-large-finetuned-trade-the-event-finance-summarizer"
tokenizer_fb = BartTokenizer.from_pretrained(model_name_fb)
model_fb = BartForConditionalGeneration.from_pretrained(model_name_fb).to('cuda:1')

def getSummaryBart2(line):
    inputs = tokenizer_fb.encode("summarize: " + line, return_tensors="pt", max_length=1024, truncation=True).to('cuda:1')
    summary_ids = model_fb.generate(inputs, max_length=150, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer_fb.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [10]:
model_name_db = "sshleifer/distilbart-cnn-12-6"
tokenizer_db = BartTokenizer.from_pretrained(model_name_db)
model_db = BartForConditionalGeneration.from_pretrained(model_name_db).to('cuda')

def getSummaryDistilBart(line):
    inputs = tokenizer_db.encode("summarize: " + line, return_tensors="pt", max_length=1024, truncation=True).to('cuda')
    summary_ids = model_db.generate(inputs, max_length=150, min_length=5, length_penalty=2.0, num_beams=4, early_stopping=True)
    summary = tokenizer_db.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [11]:

dictResults = {}
iCounter = 0

for eachLine in listLatencyLines:
    iCounter += 1
    summary = getSummaryBART(eachLine)
    summary2 = getSummaryDistilBart(eachLine)
    summary3 = getSummaryBart2(eachLine)
    summary4 = getSummaryPegasus(eachLine)
    # print(f'-- line: {eachLine} \nsummary Bart: {summary}\nsummary distBart: {summary2}\nsummary bart2: {summary3} \n')

    # print the number of items processed every 10 items
    if iCounter % 10 == 0:
        print(f'-- processed {iCounter} items')
        
    dictResults[eachLine] = [summary, summary2, summary3, summary4]

-- processed 10 items
-- processed 20 items
-- processed 30 items
-- processed 40 items
-- processed 50 items
-- processed 60 items
-- processed 70 items
-- processed 80 items


In [13]:
# number of keys in the dictionary is 
print(f'-- number of keys in the dictionary is {len(dictResults.keys())}')

-- number of keys in the dictionary is 62


In [14]:
import Levenshtein as lev
import pandas as pd

# Function to calculate Jaccard similarity
def jaccard_similarity(list1, list2):
    s1 = set(list1)
    s2 = set(list2)
    return len(s1.intersection(s2)) / len(s1.union(s2))

dictResult = {} 

for key in dictResults.keys():
    # Levenshtein distances
    lev1 = lev.distance(dictResults[key][0], dictResults[key][1])
    lev2 = lev.distance(dictResults[key][0], dictResults[key][2])
    lev3 = lev.distance(dictResults[key][0], dictResults[key][3])
    lev4 = lev.distance(dictResults[key][1], dictResults[key][2])
    lev5 = lev.distance(dictResults[key][1], dictResults[key][3])
    lev6 = lev.distance(dictResults[key][2], dictResults[key][3])

    # Jaccard similarities
    jac1 = jaccard_similarity(dictResults[key][0], dictResults[key][1])
    jac2 = jaccard_similarity(dictResults[key][0], dictResults[key][2])
    jac3 = jaccard_similarity(dictResults[key][0], dictResults[key][3])
    jac4 = jaccard_similarity(dictResults[key][1], dictResults[key][2])
    jac5 = jaccard_similarity(dictResults[key][1], dictResults[key][3])
    jac6 = jaccard_similarity(dictResults[key][2], dictResults[key][3])

    print(f'-- key: {key} \nlev1: {lev1} \nlev2: {lev2} \nlev3: {lev3} \nlev4: {lev4} \nlev5: {lev5} \nlev6: {lev6} \njac1: {jac1} \njac2: {jac2} \njac3: {jac3} \njac4: {jac4} \njac5: {jac5} \njac6: {jac6} \n')

    # make it into a dictionary
    dictResult[key.replace('$', '_')] = [lev1, lev2, lev3, lev4, lev5, lev6, jac1, jac2, jac3, jac4, jac5, jac6]

# make the dictionary into a dataframe
df = pd.DataFrame.from_dict(dictResult, orient='index', columns=['lev1', 'lev2', 'lev3', 'lev4', 'lev5', 'lev6', 'jac1', 'jac2', 'jac3', 'jac4', 'jac5', 'jac6'])

# name the index line
df.index.name = 'line'

# save the dataframe to a csv file with separator $
df.to_csv('levenshtein_jaccard.csv', sep='$')

-- key: B.6	Unacceptable deviation from target end-to-end latency	82 
lev1: 18 
lev2: 49 
lev3: 146 
lev4: 55 
lev5: 141 
lev6: 131 
jac1: 0.7777777777777778 
jac2: 0.6774193548387096 
jac3: 0.6129032258064516 
jac4: 0.7575757575757576 
jac5: 0.5555555555555556 
jac6: 0.6216216216216216 

-- key: C.4.2	Network latency requirement formulated by use of timeliness	91 
lev1: 22 
lev2: 14 
lev3: 130 
lev4: 12 
lev5: 129 
lev6: 134 
jac1: 0.8275862068965517 
jac2: 0.8275862068965517 
jac3: 0.5555555555555556 
jac4: 0.9259259259259259 
jac5: 0.4864864864864865 
jac6: 0.5277777777777778 

-- key: Communication services supporting cyber-physical control applications need to be ultra-reliable and, in some cases, the end-to-end latency must be very low. Communication for cyber-physical control applications supports operation in various vertical domains, for instance industrial automation, Smart Grid .  
lev1: 157 
lev2: 78 
lev3: 103 
lev4: 177 
lev5: 192 
lev6: 108 
jac1: 0.7931034482758621 
jac

In [16]:
# make this a dataframe
import pandas as pd

dfResults = pd.DataFrame.from_dict(dictResults, orient='index', columns=['Bart', 'DistilBart', 'Bart2', 'Pegasus'])

# name the index line
dfResults.index.name = 'Line'

# save to csv with a $ delimiter including the index
dfResults.to_csv('results.csv', sep='$')