In [32]:
import json
import os

import nltk
import pandas as pd
import numpy as np

from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BartForConditionalGeneration, BartTokenizer, Trainer, TrainingArguments
from datasets import Dataset


The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [4]:
def remove_invalid_lines(file_path):
    valid_lines = []
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    for line in lines:
        try:
            json.loads(line)
            valid_lines.append(line)
        except json.JSONDecodeError:
            continue

    with open(file_path, 'w') as file:
        file.writelines(valid_lines)

In [5]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return tokens

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ice1s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
for file_name in os.listdir("data"):
    if file_name.startswith(("test_", "train_", "dev_")):
        print(file_name)
        remove_invalid_lines(os.path.join("data", file_name))

dev_part_aa
dev_part_ab
dev_part_ac
dev_part_ad
dev_part_ae
dev_part_af
test_part_aa
test_part_ab
test_part_ac
test_part_ad
test_part_ae
test_part_af
train_part_aa
train_part_ab
train_part_ac
train_part_ad
train_part_ae
train_part_af
train_part_ag
train_part_ah
train_part_ai
train_part_aj
train_part_ak
train_part_al
train_part_am
train_part_an
train_part_ao
train_part_ap
train_part_aq
train_part_ar
train_part_as
train_part_at
train_part_au
train_part_av
train_part_aw
train_part_ax
train_part_ay
train_part_az
train_part_ba
train_part_bb
train_part_bc
train_part_bd
train_part_be
train_part_bf
train_part_bg
train_part_bh
train_part_bi
train_part_bj
train_part_bk
train_part_bl
train_part_bm
train_part_bn
train_part_bo
train_part_bp
train_part_bq
train_part_br
train_part_bs
train_part_bt
train_part_bu
train_part_bv


In [7]:
schema = ["url", "archive", "title", "date", "text", "summary", "compression", "coverage", "density", "compression_bin", "coverage_bin", "density_bin"]
records = pd.DataFrame(columns=schema)

for file_name in os.listdir("data"):
    if file_name.startswith(("test_", "train_", "dev_")):
        path = os.path.join("data", file_name)
        
        with open(path, 'r', encoding='utf-8') as file:
            print(file_name)
            jsonObj = pd.read_json(path_or_buf=path, lines=True)
            records = pd.concat([records, jsonObj], ignore_index=True)

data = pd.DataFrame(records)
data.head()

dev_part_aa


  records = pd.concat([records, jsonObj], ignore_index=True)


dev_part_ab
dev_part_ac
dev_part_ad
dev_part_ae
dev_part_af
test_part_aa
test_part_ab
test_part_ac
test_part_ad
test_part_ae
test_part_af
train_part_aa
train_part_ab
train_part_ac
train_part_ad
train_part_ae
train_part_af
train_part_ag
train_part_ah
train_part_ai
train_part_aj
train_part_ak
train_part_al
train_part_am
train_part_an
train_part_ao
train_part_ap
train_part_aq
train_part_ar
train_part_as
train_part_at
train_part_au
train_part_av
train_part_aw
train_part_ax
train_part_ay
train_part_az
train_part_ba
train_part_bb
train_part_bc
train_part_bd
train_part_be
train_part_bf
train_part_bg
train_part_bh
train_part_bi
train_part_bj
train_part_bk
train_part_bl
train_part_bm
train_part_bn
train_part_bo
train_part_bp
train_part_bq
train_part_br
train_part_bs
train_part_bt
train_part_bu
train_part_bv


Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin
0,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,"Despite U.S. Aid, Coca Cultivation On Rise in ...",2033-08-19 18:51:59,"""The prices of oranges, mandarins, coffee and ...","COROICO, Bolivia -- Benito Cocarico admits tha...",11.625,0.625,1.0,low,low,abstractive
1,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,2.5 Million Indians Stranded by Floods,2033-08-19 18:51:59,"NEW DELHI, Sept. 2 -- Nearly 2.5 million India...","World news headlines from the Washington Post,...",13.065217,0.347826,0.347826,low,low,abstractive
2,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,Grassley Seeks Information About Hospital With...,2033-08-19 18:51:59,The ranking Republican on the Senate Finance C...,Latest news on the US federal government. Inf...,26.083333,0.5,0.555556,medium,low,abstractive
3,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,Gilchrest Breaks With the GOP In Md. 1st Distr...,2033-08-19 18:51:59,"U.S. Rep. Wayne T. Gilchrest, who lost a bitte...","U.S. Rep. Wayne T. Gilchrest, who lost a bitte...",19.475,1.0,40.0,medium,high,extractive
4,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,'Raising the Bar' and Clearing It in the Ratings,2033-08-19 18:51:59,Steven Bochco set a new record Monday when his...,Steven Bochco set a new record Monday when his...,16.555556,1.0,45.0,medium,high,extractive


In [8]:
data = pd.DataFrame(records)
data.head()

Unnamed: 0,url,archive,title,date,text,summary,compression,coverage,density,compression_bin,coverage_bin,density_bin
0,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,"Despite U.S. Aid, Coca Cultivation On Rise in ...",2033-08-19 18:51:59,"""The prices of oranges, mandarins, coffee and ...","COROICO, Bolivia -- Benito Cocarico admits tha...",11.625,0.625,1.0,low,low,abstractive
1,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,2.5 Million Indians Stranded by Floods,2033-08-19 18:51:59,"NEW DELHI, Sept. 2 -- Nearly 2.5 million India...","World news headlines from the Washington Post,...",13.065217,0.347826,0.347826,low,low,abstractive
2,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,Grassley Seeks Information About Hospital With...,2033-08-19 18:51:59,The ranking Republican on the Senate Finance C...,Latest news on the US federal government. Inf...,26.083333,0.5,0.555556,medium,low,abstractive
3,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,Gilchrest Breaks With the GOP In Md. 1st Distr...,2033-08-19 18:51:59,"U.S. Rep. Wayne T. Gilchrest, who lost a bitte...","U.S. Rep. Wayne T. Gilchrest, who lost a bitte...",19.475,1.0,40.0,medium,high,extractive
4,http://www.washingtonpost.com/wp-dyn/content/a...,https://web.archive.org/web/2008090319id_/http...,'Raising the Bar' and Clearing It in the Ratings,2033-08-19 18:51:59,Steven Bochco set a new record Monday when his...,Steven Bochco set a new record Monday when his...,16.555556,1.0,45.0,medium,high,extractive


In [9]:
data = data[['text', 'summary']]
data

Unnamed: 0,text,summary
0,"""The prices of oranges, mandarins, coffee and ...","COROICO, Bolivia -- Benito Cocarico admits tha..."
1,"NEW DELHI, Sept. 2 -- Nearly 2.5 million India...","World news headlines from the Washington Post,..."
2,The ranking Republican on the Senate Finance C...,Latest news on the US federal government. Inf...
3,"U.S. Rep. Wayne T. Gilchrest, who lost a bitte...","U.S. Rep. Wayne T. Gilchrest, who lost a bitte..."
4,Steven Bochco set a new record Monday when his...,Steven Bochco set a new record Monday when his...
...,...,...
468775,The Federal Emergency Management Agency made $...,The Federal Emergency Management Agency made $...
468776,"SOWERBY BRIDGE, England -- During the first tw...","World news headlines from the Washington Post,..."
468777,Elizabeth Taylor has White Diamonds. Coco Chan...,"Get Washington DC, Maryland, Virginia news. In..."
468778,"BALTIMORE, May 18 -- A disease believed to be ...","Get sports news, schedules, rosters for Washin..."


In [10]:
data.describe()

Unnamed: 0,text,summary
count,468780,468780
unique,7794,5270
top,"Each Wednesday (okay, this one is a little lat...",Join live discussions from the Washington Post...
freq,120,37440


In [11]:
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
test_data.to_csv('test_data.csv')
val_data.to_csv('val_data.csv')

In [12]:
train_data.drop_duplicates()

Unnamed: 0,text,summary
313427,"» This Story:Read +|Watch +\n\n{ ""movie"":""http...",The Pentagon Memorial is to be dedicated on Se...
401856,"Sure, there were cathedrals around every corne...","Find Washington DC, Virginia and Maryland trav..."
54855,The actions far inside undisputed Georgian ter...,"OUTSIDE GORI, Georgia, Aug. 13 -- A day after ..."
227165,"The Egyptian bureau of al-Hurra, an Arabic-lan...",CAIRO First of two articles The Egyptian bure...
205776,Among those being forced to move is Sister Ang...,"SANTA BARBARA, Calif. -- In Southern Californi..."
...,...,...
26362,It is rare that a retirement announcement by a...,It is rare that a retirement announcement by a...
313833,If you had to put together the Help Wanted ad ...,A simple and deceptively tricky question: What...
394615,"In the grand narrative of World War II, the Ba...",The AK-47 has become the world's most prolific...
448182,From the opening line of his statement yesterd...,From the opening line of his statement yesterd...


In [13]:
text_len = train_data['text'].apply(len)
summary_len = train_data['summary'].apply(len)

print(f"Text:\n min: {text_len.min()}\n max: {text_len.max()}\n mean: {text_len.mean()}")
print(f"Summary:\n min: {summary_len.min()}\n max: {summary_len.max()}\n mean: {summary_len.mean()}")


Text:
 min: 603
 max: 592186
 mean: 8733.952616281653
Summary:
 min: 3
 max: 873
 mean: 205.7146103501708


In [14]:
train_data = train_data[train_data['summary'].apply(len) <= 200]
train_data

Unnamed: 0,text,summary
402312,Chris Clark didn't hesitate when the question ...,Chris Clark scores two goals as the Capitals w...
135975,Small Internet radio stations were offered a b...,Small Internet radio stations were offered a b...
79797,"Babbin is the author of three books, most rece...",Former Deputy Undersecretary of Defense Jed L....
243965,"One of the many unique things about Farid, the...",Islam's Advance on PostGlobal; blog of politic...
371316,If I told you that there was something in Wash...,The urgent issue of war and peace in Iraq shou...
...,...,...
354119,Al Gore became the third Baptist to win the No...,A conversation on religion with Jon Meacham an...
461378,Erica Williams worries about the wave of polit...,Erica Williams worries about the wave of polit...
15260,I'm always trying to crack the credit-scoring ...,I'm always trying to crack the credit-scoring ...
260263,President Bush is so predictable! He needs a w...,Miriam Leitao at PostGlobal on PostGlobal; blo...


In [15]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ice1s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\ice1s\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ice1s\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [16]:
train_data_preprocessed = pd.DataFrame()

train_data_preprocessed['text_tokens'] = train_data['text'].apply(preprocess_text)
train_data_preprocessed['summary_tokens'] = train_data['summary'].apply(preprocess_text)

train_data_preprocessed


Unnamed: 0,text_tokens,summary_tokens
402312,"[chris, clark, hesitate, question, posed, foll...","[chris, clark, score, two, goal, capital, win,..."
135975,"[small, internet, radio, station, offered, bre...","[small, internet, radio, station, offered, bre..."
79797,"[babbin, author, three, book, recently, word, ...","[former, deputy, undersecretary, defense, jed,..."
243965,"[one, many, unique, thing, farid, subject, vid...","[islam, advance, postglobal, blog, politics, c..."
371316,"[told, something, washington, called, baker, c...","[urgent, issue, war, peace, iraq, decided, peo..."
...,...,...
354119,"[al, gore, became, third, baptist, win, nobel,...","[conversation, religion, jon, meacham, sally, ..."
461378,"[erica, williams, worry, wave, political, invo...","[erica, williams, worry, wave, political, invo..."
15260,"[always, trying, crack, code, play, big, role,...","[always, trying, crack, code, play, big, role,..."
260263,"[president, bush, predictable, need, war, desp...","[miriam, leitao, postglobal, postglobal, blog,..."


In [17]:
train_data_preprocessed.to_csv('train_data.csv')

In [18]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

df = pd.read_csv('train_data.csv', index_col=0)

# Step 2: Train Word2Vec model
class Feng:
    def __init__(self, preprocessed_df: pd.DataFrame):
        self.preprocessed_df = preprocessed_df  # Store input data
        self.word2vec_model = None  # Initialize Word2Vec model
        self.all_tokens = self.preprocessed_df['text_tokens'].tolist() + self.preprocessed_df['summary_tokens'].tolist()

    def w2v_init(self):
        self.word2vec_model = Word2Vec(
            sentences=self.all_tokens,  # List of token lists
            vector_size=100,            # Dimension of vectors
            window=5,                   # Context window size
            min_count=2,                # Minimum word frequency
            workers=4                   # Number of threads
        )

        self.word2vec_model.save("word2vec.model")
        self.word2vec_model.load("word2vec.model")

    def get_w2v_vector(self, tokens):
        """
        Compute average vector for a list of tokens.
        Skip words not in Word2Vec model.
        """
        valid_vectors = [
            self.word2vec_model.wv[word]
            for word in tokens
            if word in self.word2vec_model.wv
        ]
        if valid_vectors:
            return np.mean(valid_vectors, axis=0)
        else:
            return np.zeros(self.word2vec_model.vector_size)  # Return zero vector if no valid words

    def replace_w_embeddings(self):
        """
        Replace text and summary with their Word2Vec embeddings.
        """
        self.preprocessed_df['text_vector'] = self.preprocessed_df['text_tokens'].apply(
            lambda tokens: self.get_w2v_vector(tokens)
        )
        self.preprocessed_df['summary_vector'] = self.preprocessed_df['summary_tokens'].apply(
            lambda tokens: self.get_w2v_vector(tokens)
        )

# Initialize and process data with Feng
feng_instance = Feng(preprocessed_df=df)
feng_instance.w2v_init()
feng_instance.replace_w_embeddings()
feng_instance.word2vec_model.save("word2vec.model")

# Step 3: Extractive summarization
def rank_sentences(text, model):
    """
    Rank sentences in a document based on their similarity to the document vector.
    """
    sentences = sent_tokenize(text)
    sentence_tokens = [preprocess_text(sentence) for sentence in sentences]

    # Compute sentence vectors
    sentence_vectors = [model.get_w2v_vector(tokens) for tokens in sentence_tokens]

    # Compute document vector
    document_vector = np.mean(sentence_vectors, axis=0)

    # Rank sentences by similarity to document vector
    similarities = [
        cosine_similarity([sentence_vector], [document_vector])[0][0]
        if sentence_vector.any() else 0
        for sentence_vector in sentence_vectors
    ]

    ranked_sentences = sorted(
        zip(sentences, similarities), key=lambda x: x[1], reverse=True
    )

    return [sentence for sentence, _ in ranked_sentences[:3]]  # Top 3 sentences

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ice1s\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ice1s\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ice1s\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


This is a sample document.
It has multiple sentences for testing summarization.
Summarization is important.


In [19]:
feng_instance_loaded = Feng(preprocessed_df=df)
feng_instance_loaded.word2vec_model.load("word2vec.model")

In [30]:
sample_text = test_data.sample(1).iloc[0].text
print(str(sample_text))

The future arrived at no more than 10 mph, in a white Chrysler with a gold winged hood ornament.

On a day in January, Vera Freeman drove the quarter-mile to Steve Stanley's house, passing her legendary restaurant, Vera's White Sands, and the community that has bloomed around it near Lusby.

After a half-century as proprietress, Freeman was ready to sell Vera's White Sands. She had chosen Stanley and his wife, Lisa Del Ricco, as her successors. Should they accept, they would take the wheel of her life's work, a dream project that started in 1953 on 800 acres of Calvert County wilderness.

Vera's White Sands -- yacht club, Polynesian mirage, Xanadu for the semi-famous and wholly eccentric -- has been the anomalous treasure of the county since. Tucked two miles down a dead-end road on a Patuxent River tributary, the restaurant has lost some of its luster in the past decade as Freeman entered her nineties and Solomons Island burgeoned into a serious beach destination six miles south.

It 

In [31]:
summary = rank_sentences(sample_text, feng_instance_loaded)
print("\n".join(summary))

The future arrived at no more than 10 mph, in a white Chrysler with a gold winged hood ornament.
On a day in January, Vera Freeman drove the quarter-mile to Steve Stanley's house, passing her legendary restaurant, Vera's White Sands, and the community that has bloomed around it near Lusby.
After a half-century as proprietress, Freeman was ready to sell Vera's White Sands.
