In [1]:
def __get_sentences( article: str) -> 'list[str]':
    '''Gets individual sentences for text chunking.'''
    sentences = article.split('<eos>')

    return sentences

def __chunk_text(sentences: 'list[str]') -> 'list[str]':
    '''Chunks text for each chunk to be less than the max length.'''
    current_chunk = 0
    chunks = []

    for sentence in sentences:
        if len(chunks) == current_chunk + 1:
            # Check if the chunk is less than max_chunk
            if len(chunks[current_chunk]) + len(sentence.split()) <= 250:
                chunks[current_chunk].extend(sentence.split())
            # Next chunk
            else:
                current_chunk += 1
                chunks.append(sentence.split())
        else:
            chunks.append(sentence.split())

    for chunk_id in range (len(chunks)):
        chunks[chunk_id] = ' '.join(chunks[chunk_id])

    return chunks

def __add_tokens(text: str) -> str:
    '''Adds tokens to text for easier processing.'''
    text = text.replace('.', '.<eos>')
    text = text.replace('!', '!<eos>')
    text = text.replace('?', '?<eos>')
    return text

In [69]:
import pdfplumber

article = ''
with pdfplumber.open('../program/uploads/SupportLetter.pdf') as pdf:
    for page in pdf.pages:
        article += ' '.join(((page.extract_text(layout=False)).replace('\n', '')).split())

article = __add_tokens(text=article)

In [70]:
sentences = __get_sentences(article=article)
chunks = __chunk_text(sentences=sentences)

In [71]:
from transformers import pipeline
from transformers import BartForConditionalGeneration, AutoModelForSeq2SeqLM
from transformers import AutoTokenizer
from transformers.pipelines.base import Pipeline
from summarizer import Summarizer
import os
import pickle

checkpoint = 'sshleifer/distilbart-cnn-12-6'
model = BartForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

summarizer = pipeline(
            'summarization', model=model, tokenizer=tokenizer)

In [84]:
article = ''
with open('../program/uploads/sample1.txt', 'r', encoding='utf-8') as f:
    article += ' '.join(((f.read())).replace('\n', ' ').split())

article = __add_tokens(text=article)

In [85]:
sentences = __get_sentences(article=article)
chunks = __chunk_text(sentences=sentences)

In [86]:
results = summarizer(chunks, return_text='True')

In [87]:
results

[{'summary_text': ' The day I picked my dog up from the pound was one of the happiest days of both of our lives . "Looking for houses was supposed to be a fun and exciting process. Unfortunately, none of the ones that we saw seemed to match the specifications that we had established. They were too small, too impersonal, too close to the neighbors. After days of finding nothing even close, we began to wonder: was there really a perfect house out there for us? "'}]

In [74]:
results[0]

{'summary_text': ' Industrial Training Programme offered by the Faculty of Computing and Information Technology, Tunku Abdul Rahman University College (TAR UC) The main objective of the industrial training programme is to provide students with practical training opportunities in one or more of the following areas . We believe with the expert guidance and experience of your esteemed organisation, our students will acquire relevant practical skills and experience which would be valuable to the students later in their working life .'}

In [82]:
title_model = pickle.load(open('../models/title-generator-t5-arxiv-16-4.pkl', 'rb'))
title_summarizer = title_model.predict

In [7]:
def __transpose_dict(dict_: dict) -> dict:
    '''Transposes the keys and values of the dictionary object. Based on the assumption that all keys and values are unique.'''
    return {y:x for x, y in dict_.items()}

In [83]:
title_summarizer("Industrial Training Programme offered by the Faculty of Computing and Information Technology, Tunku Abdul Rahman University College (TAR UC) The main objective of the industrial training programme is to provide students with practical training opportunities in one or more of the following areas . We believe with the expert guidance and experience of your esteemed organisation, our students will acquire relevant practical skills and experience which would be valuable to the students later in their working life .")

Generating outputs: 100%|██████████| 65/65 [00:06<00:00, 10.20it/s]
Decoding outputs: 100%|██████████| 65/65 [00:14<00:00,  4.50it/s]


['Industri',
 'Al Train al Train',
 'Prog ing Prog ing Prog ing Prog ing Prog',
 'Ramme of ramme of ramme of',
 'Fered by fered by fered by fered by fered by',
 'Facing the Facc Fac',
 'ulty of ulty of ulty of',
 'Computin Computin',
 'In g and Infrared',
 'Formatio',
 'Technological n Technological n Technological',
 'logy, Tu Tu logy, Tu Tu Tu Tu Tu Tu Tu Tu Tu Tu Tu',
 'Abdudu nku Abdu Abdu nku Abdu Abdu',
 'Rahman Rahman',
 "Universit's Universit'es Universit'e",
 'ity Coll Coll Coll Coll',
 'Ege (TAR): a ege (TAR)',
 'UC Theorems',
 'Main obstructor ob',
 'jective jective jive',
 'I',
 'ndustria ndustria',
 'Traini l traini',
 'ng progr ng progr ng progr ng prog',
 'Amme isotropic and amme isotropic',
 'Proviate Proviant Proviant Proviant',
 'De stude em es em es em',
 'nts with nts with nts',
 'Practic practicum',
 'Al train',
 'Oppospos oppos oppos oppos oppos oppos',
 'Rtunitie rtunitie',
 'One s s s a s a s',
 'Or more: A note on the number of adobes',
 'The',
 'Followin follo

In [88]:
new_results2 = []
# results

for i in range(len(results)):
    body = results[i]['summary_text']
    new_results2.append({title_summarizer(body)[0]: body})

print(new_results2)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and the tokenizer under the `as_target_tokenizer` context manager to prepare
your targets.

Here is a short example:

model_inputs = tokenizer(src_texts, ...)
with tokenizer.as_target_tokenizer():
    labels = tokenizer(tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

Generating outputs: 100%|██████████| 56/56 [00:05<00:00, 10.31it/s]
Decoding outputs: 100%|██████████| 56/56 [00:15<00:00,  3.71it/s]


[{'The day of the day': ' The day I picked my dog up from the pound was one of the happiest days of both of our lives . "Looking for houses was supposed to be a fun and exciting process. Unfortunately, none of the ones that we saw seemed to match the specifications that we had established. They were too small, too impersonal, too close to the neighbors. After days of finding nothing even close, we began to wonder: was there really a perfect house out there for us? "'}]


In [44]:
new_results = []
for dict_ in results:
    # Now the body is the key and the title is the value
    dict_ = __transpose_dict(dict_)
    for body in dict_:
        # Summarize the given text
        dict_[body] = title_summarizer(body)[0] # Returns a list so take only the first element
        pass
    dict_ = __transpose_dict(dict_)
    new_results.append(dict_)
print('new_results:', new_results)

Generating outputs: 100%|██████████| 50/50 [00:04<00:00, 10.92it/s]
Decoding outputs: 100%|██████████| 50/50 [00:17<00:00,  2.86it/s]
Generating outputs: 100%|██████████| 44/44 [00:03<00:00, 11.08it/s]
Decoding outputs: 100%|██████████| 44/44 [00:14<00:00,  2.97it/s]
Generating outputs: 100%|██████████| 44/44 [00:04<00:00, 10.07it/s]
Decoding outputs: 100%|██████████| 44/44 [00:14<00:00,  2.96it/s]
Generating outputs: 100%|██████████| 41/41 [00:03<00:00, 10.96it/s]
Decoding outputs: 100%|██████████| 41/41 [00:14<00:00,  2.77it/s]
Generating outputs: 100%|██████████| 39/39 [00:03<00:00, 10.36it/s]
Decoding outputs: 100%|██████████| 39/39 [00:14<00:00,  2.62it/s]
Generating outputs: 100%|██████████| 53/53 [00:04<00:00, 10.64it/s]
Decoding outputs: 100%|██████████| 53/53 [00:15<00:00,  3.53it/s]
Generating outputs: 100%|██████████| 49/49 [00:05<00:00,  9.40it/s]
Decoding outputs: 100%|██████████| 49/49 [00:15<00:00,  3.24it/s]
Generating outputs: 100%|██████████| 54/54 [00:04<00:00, 11.08

new_results: [{'Using the e-commerce method to analyze the use of a p-value': ' The user has requested enhancement of the downloaded file . See discussions, stats, and author profiles for this publication at: https://www.researchgate.net/publication/357876127Artiﬁcial Intelligence Art: Attitudes and Perceptions Toward Human Versus Artificial Intelligence Artworks . The research is a study on the young generation views and acceptance of Artificial Intelligence (AI) art .'}, {'Ternary Ternoids': ' The term Artificial Intelligence (AI) originated in the 1950’s in modelling human cognition . Nowadays, the term has evolved to refer to application that rely on deep neural networks . AI art refers to artwork made by collaboration between AI algorithms and human artists . 54% of respondents did not correctly identify emotions in AI artworks .'}, {'A pamphlet on the paternal pamphlet': ' This paper is constructed in four main sections: Literature Review, Research Methodology, Research Results a