In [14]:
from transformers import pipeline
import os
import spacy
import tensorflow as tf
# Load summarization pipeline
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
nlp = spacy.load("en_core_web_sm")


All PyTorch model weights were used when initializing TFBartForConditionalGeneration.

All the weights of TFBartForConditionalGeneration were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBartForConditionalGeneration for predictions without further training.


In [15]:
def check_word_validity(word):
    if word.is_alpha and word.text.lower() not in nlp.Defaults.stop_words:
        return True
    else:
        return False


def extract_sample_from_file(file_name, character_count, start_position=0):
    possible_encodings = ['utf-8', 'latin-1', 'windows-1252']
    unprocessed_text = ''
    for encoding in possible_encodings:
        try:
            with open(file_name, 'r', encoding=encoding) as f:
                f.seek(start_position)
                text = f.read(character_count)
                unprocessed_text += text
            doc = nlp(text)
            # Check and remove the first token if it's not a valid word
            if check_word_validity(doc[0]):
                print("removing first token: ", doc[0])
                doc = doc[1:]

            # Check and remove the last token if it's not a valid word
            if check_word_validity(doc[-1]):
                print("removing last token: ", doc[-1])
                doc = doc[:-1]
            return doc, unprocessed_text
        except UnicodeDecodeError:
            continue

In [16]:



the_lindsays_sample, unprocessed_the_lindsays_sample  = extract_sample_from_file(os.path.join("data","the_lindsays.txt"), character_count=2000, start_position=10000)

# Generate summary
# summary = summarizer(unprocessed_the_lindsays_sample, max_length=100, min_length=30, do_sample=False)[0]

summary = summarizer(unprocessed_the_lindsays_sample, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)

    

removing first token:  n
removing last token:  ho


In [17]:
print("Original Text: \n", unprocessed_the_lindsays_sample)

Original Text: 
 n, and as I bade good-night
to the cross-questioning farmer, I observed a grim smile of triumph on
his firmly compressed lips. He evidently knew the dog-cart, and would
now be able to trace the mysterious stranger.

I and my portmanteau were finally left on the side of the road, and
the young man in the dog-cart civilly turned the vehicle round (with
some difficulty on account of the narrow road), and drew up beside me,
to save my carrying my luggage a dozen yards. At first I was a little
uncertain whether I had one of my third (or fourth, which is it?)
cousins before me, or simply a young man from Mr. Lindsay’s farm. He
was dressed in very coarse tweeds, and his hands were rough, and spoke
of manual labour, and he breathed the incense of the farm-yard; but I
thought his finely-cut features and sensitive lips bespoke him to be of
gentle blood, and, luckily, I made a hit in the right direction.

‘You are one of Mr. Lindsay’s sons, I think--that is to say, one of my
cous

In [20]:
# # Turn key sentences into a single string for summarization
# key_sentences = [sentence.text for sentence in doc.sents if len(sentence.text.split()) > 5]
# input_text = " ".join(key_sentences)

# # Use the pre-trained LLM for summarization
# summary = summarizer(input_text, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)

print("Generated Summary:")
print(summary[0]["summary_text"])


Generated Summary:
The young man in the dog-cart civilly turned the vehicle round (with some difficulty on account of the narrow road), and drew up beside me. At first I was a littleuncertain whether I had one of my third (or fourth, which is it?) grotesquecousins before me, or simply a young man from Mr. Lindsay’s farm. But I could see, even as we drove together along that solitary lane, that his was a frank, ingenuous nature.
