## CS365 Final Project


Introduction and Conclusion part

### Libraries

In [1]:
import os
import random, string, re, spacy
import torch
import pandas as pd
import numpy as np
import transformers
from transformers import pipeline, EncoderDecoderModel, AutoTokenizer
from datasets import load_metric
import sacrebleu
from torch.utils.data import Dataset
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset, Dataset
from torch.nn.utils.rnn import pad_sequence
import torch.nn.functional as F
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

### Data Processing

In [2]:
#cleaning
def text_cleaner(text):
    #remove words between ()
    cleaned_string = ""
    paren_depth = 0
    for c in text:
        if c == '(':
            paren_depth += 1
        elif c == ')' and paren_depth:
            paren_depth -= 1
        elif paren_depth == 0:
            cleaned_string += c

    newString = cleaned_string.lower()
    newString = re.sub('"','', newString)  
    newString = re.sub(r"'s\b","",newString)
    newString = re.sub("[^a-zA-Z]", " ", newString) 

    tokens = newString.split()
    long_words = [i for i in tokens if len(i) > 1]

    return (" ".join(long_words)).strip()


In [3]:
#training data
paper_data_lstm = []
abstract_data_lstm = []

vocab_dict = {}

#File
paper_file_path = [
    "Comp_Ling_paper_{}.txt".format(i) for i in range(1, 71)
]
abstract_file_path = [
    "abstract_{}.txt".format(i) for i in range(1, 71)
]

for i in range(len(paper_file_path)):
    with open(paper_file_path[i], "r") as f:
        paper = f.read()

    with open(abstract_file_path[i], "r") as f:
        abstract = f.read()
    
    cleaned_paper = text_cleaner(paper)
    cleaned_abstract = text_cleaner(abstract)

    #Tokenization
    nlp = spacy.load("en_core_web_sm")

    paper_tokens = [tok.text for tok in nlp(cleaned_paper)]
    abstract_tokens = ['<sos>'] + [tok.text for tok in nlp(cleaned_abstract)] + ['<eos>']

    for token in paper_tokens:
        if token not in vocab_dict:
            vocab_dict[token] = len(vocab_dict)
    for token in abstract_tokens:
        if token not in vocab_dict:
            vocab_dict[token] = len(vocab_dict)
            
    paper_data_lstm.append(paper_tokens)
    abstract_data_lstm.append(abstract_tokens)


In [4]:
#test data
test_paper = 'paper_ex.txt'
#test_paper = "Comp_Ling_paper_2.txt"
with open(test_paper, "r") as f:
    test_text = f.read()

cleaned_example = text_cleaner(test_text)
test_tokens_original = word_tokenize(cleaned_example)

for token in test_tokens_original:
        if token not in vocab_dict:
            vocab_dict[token] = len(vocab_dict)
test_tokens = test_tokens_original[:2132]
test_vocab = {token: idx for idx, token in enumerate(set(test_tokens))}
test_vocab_size  = len(test_vocab)
vocab_dict.update(test_vocab)


In [5]:
vocab_size = []
paper_input = []
abstract_input = []
for i in range(len(paper_data_lstm)):
    paper_vocab = {token: idx for idx, token in enumerate(set(paper_data_lstm[i]))}
    abstract_vocab = {token: idx for idx, token in enumerate(set(abstract_data_lstm[i]))}

    vocab_size.append([len(paper_vocab), len(abstract_vocab)])

    paper_indices = [paper_vocab[token] for token in paper_data_lstm[i]]
    abstract_indices = [abstract_vocab[token] for token in abstract_data_lstm[i]]

    paper_indices_tensor = torch.tensor(paper_indices)
    abstract_indices_tensor = torch.tensor(abstract_indices)

    paper_input.append(paper_indices_tensor)
    abstract_input.append(abstract_indices_tensor)

### LSTM

In [6]:
#LSTM Model
class MyDataset(Dataset):
    def __init__(self, paper_data, abstract_data):
        self.paper_data = paper_data
        self.abstract_data = abstract_data

    def __len__(self):
        return len(self.paper_data)

    def __getitem__(self, idx):
        return self.paper_data[idx], self.abstract_data[idx]

class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        embedded = self.dropout(self.embedding(src))
        outputs, (hidden, cell) = self.rnn(embedded)
        return hidden, cell

class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        self.output_dim = output_dim
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout=dropout, batch_first=True)
        self.fc_out = nn.Linear(hid_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        input = input.unsqueeze(1)
        embedded = self.dropout(self.embedding(input))
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        prediction = self.fc_out(output.squeeze(1))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        batch_size = trg.shape[0]
        trg_len = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim
        
        # tensor to store decoder outputs
        outputs = torch.zeros(batch_size, trg_len, trg_vocab_size).to(self.device)
        
        # last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        
        # first input to the decoder is the <sos> tokens
        input = trg[:,0]
        
        for t in range(1, trg_len):
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[:,t,:] = output
            # decide if we will use teacher forcing or not
            teacher_force = random.random() < teacher_forcing_ratio
            # get the highest predicted token from our predictions
            top1 = output.argmax(1) 
            # if teacher forcing, use actual next token as next input. If not, use predicted token
            input = trg[:,t] if teacher_force else top1
        
        return outputs


In [7]:
def validate(model, val_loader, criterion, device):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # Disable gradient computation
        for paper_data, abstract_data in val_loader:
            paper_data, abstract_data = paper_data.to(device), abstract_data.to(device)
            output = model(paper_data, abstract_data)
            output_dim = output.shape[-1]
            output = output[1:].reshape(-1, output_dim)
            target = abstract_data[1:].contiguous().view(-1)
            loss = criterion(output, target)
            total_loss += loss.item()
    return total_loss / len(val_loader)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_epochs  = 20
batch_size = 512

def my_collate_fn(batch):
    paper_batch, abstract_batch = zip(*batch)
    paper_padded = pad_sequence(paper_batch, batch_first=True, padding_value=0)
    abstract_padded = pad_sequence(abstract_batch, batch_first=True, padding_value=0)
    return paper_padded, abstract_padded

paper_train, paper_val, abstract_train, abstract_val = train_test_split(
    paper_input, abstract_input, test_size=0.2, random_state=42
)

train_dataset = MyDataset(paper_train, abstract_train)
val_dataset = MyDataset(paper_val, abstract_val)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=my_collate_fn)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, collate_fn=my_collate_fn)

INPUT_DIM = 3000
OUTPUT_DIM = 500
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

encoder = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
decoder = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(encoder, decoder, device).to(device)
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()


#Training
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch_idx, (paper_data, abstract_data) in enumerate(train_loader):
        paper_data, abstract_data = paper_data.to(device), abstract_data.to(device)
        optimizer.zero_grad()
        output = model(paper_data, abstract_data)
        output_dim = output.shape[-1]
        
        output = output[1:].reshape(-1, output_dim)
        target = abstract_data[1:].contiguous().view(-1)
        
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_loader)
    avg_val_loss = validate(model, val_loader, criterion, device)
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {avg_train_loss}, Val Loss: {avg_val_loss}')

torch.save(model.state_dict(), 'lstm_summarization_model_70_512.pth')

Epoch [1/20], Train Loss: 6.23647928237915, Val Loss: 5.957086563110352
Epoch [2/20], Train Loss: 5.827421188354492, Val Loss: 5.353848934173584
Epoch [3/20], Train Loss: 4.931849002838135, Val Loss: 4.782479286193848
Epoch [4/20], Train Loss: 4.070378303527832, Val Loss: 5.311925411224365
Epoch [5/20], Train Loss: 4.331934452056885, Val Loss: 4.750409126281738
Epoch [6/20], Train Loss: 4.0025458335876465, Val Loss: 4.276785373687744
Epoch [7/20], Train Loss: 3.6773667335510254, Val Loss: 4.165059566497803
Epoch [8/20], Train Loss: 3.6823103427886963, Val Loss: 4.106593132019043
Epoch [9/20], Train Loss: 3.6944262981414795, Val Loss: 4.049398422241211
Epoch [10/20], Train Loss: 3.614169120788574, Val Loss: 3.969383955001831
Epoch [11/20], Train Loss: 3.5549328327178955, Val Loss: 3.915844440460205
Epoch [12/20], Train Loss: 3.4680659770965576, Val Loss: 3.8978474140167236
Epoch [13/20], Train Loss: 3.41715669631958, Val Loss: 3.914144515991211
Epoch [14/20], Train Loss: 3.3750383853912

In [14]:
def generate_summary(model, tokens, paper_vocab, device):
    text_indices = [paper_vocab[token] for token in tokens]
    paper_data_tensor = torch.tensor(text_indices).unsqueeze(0).to(device)
    model.eval()
    
    with torch.no_grad():
        #seq2seq
        hidden, cell = model.encoder(paper_data_tensor)
        trg = torch.tensor([paper_vocab['<sos>']]).to(device)

        generated_tokens = []


        for _ in range(198): #max summary length
            output, hidden, cell = model.decoder(trg, hidden, cell)
            output_probs = F.softmax(output, dim=1)
            predicted_token = torch.multinomial(output_probs, 1).item()
            generated_tokens.append(predicted_token)

            if predicted_token == paper_vocab['<eos>'] or len(generated_tokens) >= 111:
                break

            trg = torch.tensor([predicted_token]).to(device)

    return generated_tokens

model = Seq2Seq(encoder, decoder, device)
model.load_state_dict(torch.load('lstm_summarization_model_70_512.pth', map_location=device))
model.eval()

summary_tokens = generate_summary(model, test_tokens, vocab_dict, device)

def get_key_from_value(dictionary, target_value):
    for key, value in dictionary.items():
        if value == target_value:
            return key
    return ' '

summary_text = ''
for token in summary_tokens:
    word  = get_key_from_value(vocab_dict, token)
    summary_text += word
    summary_text += ' '

print("Generated Summary:")
print(summary_text)

Generated Summary:
generative concise work pretrained related performance generated such responses attribution generative grounded there motivating issues section dialogue neural marked motivating classification there natural designers sources text work user negation phenomena variety responses seek negation pretrained automatic classification framework large generates performance seek known he underlying pro phenomena documents accurate closely related basis forward was attribution advanced user signaling album accurate problem large addition sources hallucinate often if interdisciplinary experimental viewpoint faithfulness generative contains system phenomena requires sources large large large large large posals posals pretrained phenomena sources user performance contextual contributing being of using reported generates evaluation for seek classification faithfulness large underlying also according using closely accurate viewpoint faithfulness system 


data_36 

music coherence music of models old annotation he showing issues faithfulness which support translation related pretrained of closely george corroboration offers cues generative closely harrison is support relations interrelationship computational related old been large tributes large discuss advanced is summarization for evaluation coherence sources attribution support support broaden evaluation large

### Evaluation

In [15]:
def calculate_rouge_scores(predictions, references):
    rouge = load_metric("rouge")
    results = rouge.compute(predictions=predictions, references=references)
    return results

#test data
test_abstract = 'abstract_ex.txt'

with open(test_paper, "r") as f:
    test_abs = f.read()

cleaned_example_abstract = text_cleaner(test_abs)

# example summary and reference summary
generated_summary = summary_text
reference_summary = cleaned_example_abstract
GPT_summary = 'The text discusses discourse relations (DRs) and their significance in understanding discourse coherence. It highlights the interdisciplinary nature of DRs, with linguistics focusing on their marking in discourse through corpus studies, while Natural Language Processing (NLP) aims at predicting discourse markers for various tasks. Despite these advancements, there is limited interdisciplinary dialogue. The study aims to address this gap by exploring the contribution of lexical semantics, specifically synonymy and antonymy, in signaling contrast and concession relations, and their interaction with different parts of speech (POS). The methodological approach involves computational modeling to analyze semantic signals in discourse relations, allowing for a transparent interpretation of results without manual coding. The study finds that adjectives, verbs, and nouns play a significant role in contrast relations compared to concession relations. Additionally, it observes differences between implicit and explicit relations. However, the approach has limitations, such as the need to consider phrasal verbs and sentence polarity in future research. Overall, the study contributes to bridging the gap between linguistic and computational approaches to discourse analysis. It offers insights into the role of lexical cues in discourse relations and proposes a method that can be automated for broader application in corpus linguistics research.'

# Calculate ROUGE scores
rouge_scores = calculate_rouge_scores([generated_summary], [reference_summary])
rouge_scores_gpt = calculate_rouge_scores([GPT_summary], [reference_summary])

# Print ROUGE scores
print("Generated Summary")
for key, value in rouge_scores.items():
    print(f"{key}: {value.mid.precision:.4f} (Precision), {value.mid.recall:.4f} (Recall), {value.mid.fmeasure:.4f} (F1 Score)")
print("GPT Summary")
for key, value in rouge_scores_gpt.items():
    print(f"{key}: {value.mid.precision:.4f} (Precision), {value.mid.recall:.4f} (Recall), {value.mid.fmeasure:.4f} (F1 Score)")




You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Generated Summary
rouge1: 0.2703 (Precision), 0.0313 (Recall), 0.0562 (F1 Score)
rouge2: 0.0000 (Precision), 0.0000 (Recall), 0.0000 (F1 Score)
rougeL: 0.1171 (Precision), 0.0136 (Recall), 0.0243 (F1 Score)
rougeLsum: 0.1171 (Precision), 0.0136 (Recall), 0.0243 (F1 Score)
GPT Summary
rouge1: 0.7879 (Precision), 0.1630 (Recall), 0.2701 (F1 Score)
rouge2: 0.2995 (Precision), 0.0617 (Recall), 0.1023 (F1 Score)
rougeL: 0.4697 (Precision), 0.0972 (Recall), 0.1610 (F1 Score)
rougeLsum: 0.4697 (Precision), 0.0972 (Recall), 0.1610 (F1 Score)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
