# Extractive Summarization with Discourse Graph

The goal of this data challenge is to build an extractive summarization system that uses the discourse graph to classify whether each utterance of a dialogue transcription is important enough to be kept in the summary.

We use a fine-tuned BERT model and an AdaBoost classifier to predict the importance of each utterance.



## Preprocessing and loading

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
from multiprocessing import Pool
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import re
%matplotlib inline

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


pd.set_option("display.max_rows", 400)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Emile\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Emile\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Emile\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
print(os.path.exists("training.csv"))

True


In [4]:
# Folder scanned by load_training_data for the per-meeting *.json and *.txt files.
training_data_folder = "training"
# JSON file loaded by load_training_data; indexed by each file's short name to get its labels.
annotations_file = "training_labels.json"
# CSV cache of the assembled DataFrame (read if present, written otherwise).
training_csv = "training.csv"

In [7]:
# Discourse relation names; load_training_data creates one 0/1 column per entry.
tags = ['Acknowledgement',
 'Alternation',
 'Background',
 'Clarification_question',
 'Comment',
 'Conditional',
 'Continuation',
 'Contrast',
 'Correction',
 'Elaboration',
 'Explanation',
 'Narration',
 'Parallel',
 'Q-Elab',
 'Question-answer_pair',
 'Result']

### Loading the training data

In [18]:
def load_training_data () -> pd.DataFrame:
    """Loads the training data from the json and txt files.

    Every ``*.json`` file of ``training_data_folder`` (in sorted order) is
    flattened with ``pd.json_normalize`` and receives two extra columns:
    ``file`` (the file name up to its first ".") and ``relevance`` (the entry of
    ``annotations_file`` stored under that short name).

    Every ``*.txt`` file is then read line by line. Each non-empty line is split
    on single spaces; the second field is used as a relation tag and the third
    as the ``index`` of the utterance that gets a 1 in that tag's column.

    Returns:
        The concatenated DataFrame, with one extra column per entry of ``tags``
        (0 by default, 1 where the utterance is referenced with that tag).
    """
    with open(annotations_file, 'r') as file:
        annotations = json.load(file)

    # List the folder once and reuse the listing for both file kinds.
    folder_entries = os.listdir(training_data_folder)

    json_files = sorted(f for f in folder_entries if f.endswith('.json'))
    dfs = []
    for json_file in json_files:
        file_path = os.path.join(training_data_folder, json_file)
        with open(file_path, 'r') as file:
            data = pd.json_normalize(json.load(file))
        shortname = json_file.split(".")[0]
        data["file"] = shortname
        data["relevance"] = annotations[shortname]
        dfs.append(data)

    df = pd.concat(dfs, ignore_index=True)
    for tag in tags:
        df[tag] = 0

    txt_files = sorted(f for f in folder_entries if f.endswith('.txt'))
    print("extraction des données du graphe (cela va prendre un certain temps)")
    # Iterating the list directly (not enumerate) lets tqdm display a total.
    for txt_file in tqdm(txt_files):
        shortname = txt_file.split(".")[0]
        with open(os.path.join(training_data_folder, txt_file), 'r') as file:
            txt = file.read()

        # Collect the referenced utterance indices per tag first, so the
        # DataFrame is scanned once per (file, tag) instead of once per line.
        referenced_by_tag = {}
        for line in txt.split("\n"):
            if line:
                items = line.split(" ")
                referenced_by_tag.setdefault(items[1], set()).add(int(items[2]))

        if not referenced_by_tag:
            continue

        in_file = df['file'] == shortname
        for tag, referenced in referenced_by_tag.items():
            df.loc[in_file & df['index'].isin(list(referenced)), tag] = 1
    return df

### Preprocessing

In [19]:
def filter_special_characters (text):
    """Drop every character that is not an ASCII letter, a digit, whitespace or a period."""
    disallowed = re.compile(r'[^a-zA-Z0-9\s.]')
    return disallowed.sub('', text)

def keep_only_noun_and_verbs (text):
    """Return the whitespace-separated words of ``text`` whose POS tag is a kept noun or verb tag.

    The text is split on whitespace, tagged with ``nltk.pos_tag``, and the words
    tagged NN, NNP, NNS, VB, VBD, VBG, VBN, VBP or VBZ are re-joined with spaces.
    """
    kept_tags = ("NN", "NNP", "NNS", "VB", "VBD", "VBG", "VBN", "VBP", "VBZ")
    tagged_words = nltk.pos_tag(text.split())
    return " ".join(word for word, tag in tagged_words if tag in kept_tags)

_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def tokenize_and_filter_stopwords(text):
    """Clean a text and return its lower-cased, stop-word-free tokens.

    Steps, in order: strip special characters, keep only nouns and verbs,
    tokenize with ``word_tokenize``, lower-case, drop English stop words.

    Args:
        text: the raw text to process.

    Returns:
        A list of lower-cased tokens (possibly empty).
    """
    text = filter_special_characters (text)
    text = keep_only_noun_and_verbs(text)
    # The stop-word set is cached: this function runs once per utterance.
    stop_words = _english_stop_words()
    lowered = (word.lower() for word in word_tokenize(text))
    return [word for word in lowered if word not in stop_words]

def sentencize (text):
    """Split a text into sentences and pair each one with its position.

    Returns:
        Two lists of ``(item, index)`` tuples: the raw sentences from
        ``sent_tokenize``, and the same sentences after
        ``tokenize_and_filter_stopwords``.
    """
    raw_sentences = sent_tokenize(text)
    token_lists = [tokenize_and_filter_stopwords(raw) for raw in raw_sentences]
    raw_pairs = [(raw, position) for position, raw in enumerate(raw_sentences)]
    token_pairs = [(tokens, position) for position, tokens in enumerate(token_lists)]
    return raw_pairs, token_pairs

def get_files (df : pd.DataFrame):
    """Return the distinct values of the ``file`` column as a list (in set order)."""
    distinct_files = set()
    for name in df["file"].values.tolist():
        distinct_files.add(name)
    return list(distinct_files)

### TF-IDF scores

In [20]:
def frequency (token, tokens):
    """Return the share of the entries of ``tokens`` that are equal to ``token``."""
    occurrences = sum(1 for candidate in tokens if candidate == token)
    return occurrences / len(tokens)

def inverse_document_frequency (token, tokenized_sentences):
    """Return (number of sentences) / (number of sentences containing ``token``).

    ``tokenized_sentences`` holds entries whose first element is the token
    collection that is searched.
    """
    containing = 0
    for entry in tokenized_sentences:
        if token in entry[0]:
            containing += 1
    return len(tokenized_sentences) / containing

def tfidf (tokenized_sentence, tokenized_sentences):
    """Computes the TF-IDF score of each distinct word of a tokenized sentence.

    Args:
        tokenized_sentence: the tokens of the sentence to score.
        tokenized_sentences: the whole corpus, as entries whose first element
            is a token list (the form ``inverse_document_frequency`` expects).

    Returns:
        A dict mapping each distinct word to
        ``frequency(word, tokenized_sentence) * log(1 + idf(word))``.
    """
    words_scores = {}
    for word in set(tokenized_sentence):
        # Term frequency must be measured on the token list. Measuring it on
        # the set of distinct words always gives 1/len(set) and ignores repeats.
        term_frequency = frequency(word, tokenized_sentence)
        idf = np.log(1 + inverse_document_frequency(word, tokenized_sentences))
        words_scores[word] = term_frequency * idf
    return words_scores


def sentences_scores (sentences ):
    """Score each ``(tokens, index)`` entry by summing the TF-IDF of its tokens.

    Returns:
        A dict keyed by the space-joined tokens, with ``(score, index)`` values.
        Entries with identical tokens share one key, so the last one wins.
    """
    scored = {}
    for tokens, position in sentences:
        per_word = tfidf(tokens, sentences)
        total = sum(per_word[token] for token in tokens)
        scored[" ".join(tokens)] = (total, position)
    return dict(scored.items())

def tfidf_sentence_scores (sentences):
    """Tokenize raw sentences, pair them with their position and score them with ``sentences_scores``."""
    token_lists = [tokenize_and_filter_stopwords(raw) for raw in sentences]
    indexed_tokens = [(tokens, position) for position, tokens in enumerate(token_lists)]
    return sentences_scores(indexed_tokens)

def add_tfidf_scores (df : pd.DataFrame):
    """Adds a ``score`` column holding the TF-IDF score of every utterance, in place.

    Each file is treated as its own corpus: the ``text`` values of the file are
    tokenized, and an utterance's score is the sum of the TF-IDF of its tokens.

    Scores are computed per position rather than through the dict returned by
    ``tfidf_sentence_scores``. That dict is keyed by the joined tokens, so
    utterances with identical tokens would collapse into one entry and all but
    the last would keep a score of 0.

    Args:
        df: DataFrame with at least the ``file`` and ``text`` columns.
    """
    for file in tqdm(sorted(get_files(df))):
        in_file = df["file"] == file
        sentences = df.loc[in_file, "text"].values.tolist()
        tokenized_sentences = [
            (tokenize_and_filter_stopwords(sentence), index)
            for index, sentence in enumerate(sentences)
        ]

        scores = []
        for tokens, _ in tokenized_sentences:
            words_scores = tfidf(tokens, tokenized_sentences)
            scores.append(sum(words_scores[word] for word in tokens))

        df.loc[in_file, "score"] = scores

In [14]:
# Reuse the cached CSV when it exists; otherwise rebuild the DataFrame from the
# raw files, add the TF-IDF scores and write the cache for the next run.
if os.path.exists(training_csv):
    df = pd.read_csv (training_csv)
else:
    print(f"Génération de {training_csv}")
    df = load_training_data ()
    add_tfidf_scores(df)
    df.to_csv(training_csv,index=False)

# Keep a handle on the unshuffled frame; `sample` returns a new DataFrame.
original_df = df
shuffled_df = original_df.sample(frac=1,random_state=42) # shuffle all rows once with a fixed seed; split_dataset later slices this frame by position

In [21]:
### Visualization
shuffled_df.head()

Unnamed: 0.1,Unnamed: 0,speaker,text,index,file,relevance,Acknowledgement,Alternation,Background,Clarification_question,...,Contrast,Correction,Elaboration,Explanation,Narration,Parallel,Q-Elab,Question-answer_pair,Result,score
3137,3137,ID,they're still very individual tools .,920,ES2002d,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,5.71373
37268,37268,UI,So um we've got a <disfmarker> i in this in th...,79,IS1002c,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4.305849
22494,22494,PM,um I wouldn't know for in,203,ES2012d,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,4.194438
19242,19242,ME,"No , splinters would <disfmarker>",234,ES2010c,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,4.127505
14452,14452,PM,"Um okay ,",936,ES2008d,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,3.233698


## Prepare training and train

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, logging
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
import warnings

import torch.nn as nn

### Tokenize and create DataLoader

In [25]:
class CustomDataset(Dataset):
    """Torch dataset that tokenizes one text at a time for sequence classification.

    Args:
        text: indexable collection of texts.
        labels: indexable collection of labels, aligned with ``text``.
        tokenizer: object exposing ``encode_plus`` (here a ``BertTokenizer``).
        max_len: length every encoded sequence is padded or truncated to.
    """
    def __init__(self, text, labels, tokenizer, max_len):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples, i.e. the number of texts."""
        return len(self.text)

    def __getitem__(self, idx):
        """Return the raw text, its flattened ``input_ids`` / ``attention_mask`` and its label tensor."""
        text = str(self.text[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`
            # and matches what BertInference uses at prediction time.
            padding="max_length",
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            # encode_plus returns a leading batch dimension; flatten it away so
            # the DataLoader can stack the samples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


class BertTrainer:
    """Fine-tunes a two-label BERT sequence classifier and checkpoints it after each epoch.

    Args:
        device: torch device the model and batches are moved to.
        class_weights: per-class weights passed to the cross-entropy loss.
        model_name: path of a saved model directory; when the path does not
            exist, the pretrained "bert-base-uncased" weights are used instead.
        max_len: token length every sample is padded or truncated to.
        lr: learning rate of the SGD optimizer.
        loss_type: "custom" selects ``MacroF1BinaryLoss``; any other value
            selects the class-weighted cross-entropy.
    """
    def __init__(self, device, class_weights, model_name, max_len, lr, loss_type):
        self.device = device
        self.class_weights = class_weights
        # Fall back to the pretrained base model when there is no local checkpoint.
        if not os.path.exists (model_name):
            model_name = "bert-base-uncased"
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
        self.model.to(device)
        self.model.train()
        self.lr = lr

        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)
        self.loss_type = loss_type

    def _compute_loss(self, logits, labels, macro_f1_loss):
        """Return the loss selected by ``self.loss_type`` for one batch."""
        if self.loss_type == "custom":
            return macro_f1_loss(logits, labels)
        return torch.nn.functional.cross_entropy(logits, labels, weight=self.class_weights)

    def train(self, x_train, y_train, x_val, y_val, epochs, batch_size):
        """Train for ``epochs`` epochs, validating and saving checkpoints after each one.

        Checkpoints written:
          * 'best_tuned_bert_model' and 'best_fscore.txt' when the validation F1
            reaches the best value stored in 'best_fscore.txt' (0 if absent);
          * 'current_best_tuned_bert_model' when it reaches the best F1 of this call;
          * 'last_tuned_bert_model' after every epoch.

        Args:
            x_train, y_train: training texts and labels.
            x_val, y_val: validation texts and labels.
            epochs: number of passes over the training data.
            batch_size: batch size of both data loaders.
        """
        print("Training...")
        train_dataset = CustomDataset(text=x_train, labels=y_train, tokenizer=self.tokenizer, max_len=self.max_len)
        # NOTE(review): shuffle=False is kept from the original, so every epoch
        # sees the batches in the same order. Confirm this is intended.
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

        val_dataset = CustomDataset(text=x_val, labels=y_val, tokenizer=self.tokenizer, max_len=self.max_len)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        macro_f1_loss = MacroF1BinaryLoss(self.device)

        # The best F1 is persisted on disk so it survives across trainer instances.
        if os.path.exists("best_fscore.txt"):
            with open("best_fscore.txt","r") as file:
                best_f1 = float(file.read ())
        else:
            best_f1 = 0
        new_best = 0

        for epoch in range(epochs):
            # Validation below leaves the model in eval mode. Switch back, or
            # every epoch after the first would train with dropout disabled.
            self.model.train()
            train_loss_total = torch.zeros((), device=self.device)
            train_batches = 0
            for batch in tqdm(train_loader):
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)

                self.optimizer.zero_grad()

                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                loss = self._compute_loss(outputs.logits, labels, macro_f1_loss)
                loss.backward()
                self.optimizer.step()

                # Accumulate on-device; .item() here would force a sync per batch.
                train_loss_total += loss.detach()
                train_batches += 1

            train_loss = train_loss_total.item() / max(train_batches, 1)
            print("End of training.")
            print("Starting validation...")

            # Validation loop
            self.model.eval()
            val_losses = []
            all_predictions = []
            all_labels = []
            with torch.no_grad():
                for batch in tqdm(val_loader):
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['label'].to(self.device)

                    outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                    logits = outputs.logits

                    val_losses.append(self._compute_loss(logits, labels, macro_f1_loss).item())
                    # argmax of the logits equals argmax of their softmax.
                    all_predictions.extend(torch.argmax(logits, dim=1).cpu().tolist())
                    all_labels.extend(labels.cpu().tolist())

            val_loss = sum(val_losses) / len(val_losses)
            val_correct = sum(1 for predicted, expected in zip(all_predictions, all_labels) if predicted == expected)
            val_accuracy = val_correct / len(x_val)
            val_f1 = f1_score(all_labels, all_predictions, pos_label=1)

            if val_f1 >= best_f1:
                print("Best F1-score")
                print("Saving model")
                self.model.save_pretrained('best_tuned_bert_model')

                self.model.save_pretrained('current_best_tuned_bert_model')
                print("Saving F1-score ")
                with open('best_fscore.txt', 'w') as file:
                    file.write(str(val_f1))
                best_f1 = val_f1
                new_best = val_f1
            elif val_f1 >= new_best:
                print("Current best F1-score")
                print("Saving model")
                self.model.save_pretrained('current_best_tuned_bert_model')
                print("current best : " + str(val_f1))
                new_best = val_f1

            # "Loss" is the mean training loss of the epoch. The original
            # printed the loss of the last validation batch here.
            print (f'Epoch {epoch + 1}/{epochs}, F1 : {val_f1} Loss: {train_loss}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}')
            print(f'And best F1 : {best_f1}')
            self.model.save_pretrained('last_tuned_bert_model')
            
            
class BertInference:
    """Loads a saved BERT sequence classifier and predicts labels or class probabilities.

    Args:
        model_path: directory (or model id) given to ``from_pretrained``.
        max_len: token length every sentence is padded or truncated to.
        device: device string or object accepted by ``torch.device``.
    """
    def __init__ (self, model_path, max_len, device):
        self.device = torch.device(device)
        self.max_len = max_len
        self.model_path = model_path
        self.loaded_model = BertForSequenceClassification.from_pretrained(self.model_path)
        self.loaded_model.to(self.device)
        self.loaded_model.eval()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def infer(self,sentences, return_type="labels", batch_size=32):
        """Predict every sentence, ``batch_size`` sentences per forward pass.

        Args:
            sentences: iterable of texts; each one is converted with ``str``.
            return_type: "labels" returns the predicted class ids; any other
                value returns the softmax probabilities.
            batch_size: number of sentences encoded and scored together.

        Returns:
            A list with one entry per sentence: an int label, or a 1-D tensor
            of class probabilities on ``self.device``.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")

        sentences = [str(sentence) for sentence in sentences]
        labels = []
        probabilities = []
        for start in tqdm(range(0, len(sentences), batch_size)):
            batch = sentences[start:start + batch_size]
            encoded_input = self.tokenizer(
                batch,
                add_special_tokens=True,
                max_length=self.max_len,
                return_token_type_ids=False,
                # Fixed-length padding gives each sentence the same encoding
                # it would get when processed alone.
                padding="max_length",
                return_attention_mask=True,
                return_tensors='pt',
                truncation=True,
            )
            encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
            with torch.no_grad():
                logits = self.loaded_model(**encoded_input).logits
                batch_probabilities = torch.softmax(logits, dim=1)
            # One 1-D probability tensor per sentence, as before batching.
            probabilities.extend(batch_probabilities.unbind(dim=0))
            labels.extend(torch.argmax(batch_probabilities, dim=1).tolist())

        return labels if return_type=="labels" else probabilities


class TFVectorizer:
    """Thin wrapper around a ``TfidfVectorizer`` capped at ``max_embedding`` features.

    The vectorizer is fitted separately from the downstream classifier, so the
    two can be trained on different sets (the original comment gives this as a
    way to limit overfitting).
    """
    def __init__(self, max_embedding):
        self.max_embedding = max_embedding
        self.vectorizer = TfidfVectorizer(max_features=max_embedding)

    def train (self, x_train):
        """Fit the vocabulary and IDF weights on ``x_train``; returns nothing."""
        # `fit` is enough: the transformed training matrix was never used.
        self.vectorizer.fit(x_train)

    def infer(self,x_infer):
        """Return the TF-IDF matrix of ``x_infer`` computed with the fitted vectorizer."""
        return self.vectorizer.transform(x_infer)


### Custom Loss(es)

In [26]:
class MacroF1BinaryLoss(nn.Module):
    """Differentiable macro-F1 loss: ``1 - mean(soft F1 per class)``.

    Hard counts are replaced by sums of softmax probabilities, so the loss can
    be back-propagated. The number of classes is read from the logits.

    Args:
        device: kept as ``self.device`` for callers; the computation itself
            runs on the device of the logits.
    """
    def __init__(self, device):
        super(MacroF1BinaryLoss, self).__init__()
        self.device = device

    def forward(self, logits, labels):
        """Return the scalar loss for ``logits`` of shape (batch, classes) and integer ``labels`` of shape (batch,)."""
        epsilon = 1e-10  # avoids 0/0 when a class has no predicted or true mass

        # Softmax over the class dimension turns logits into probabilities.
        probabilities = torch.softmax(logits, dim=1)

        # One-hot encode the labels on the same device and dtype as the probabilities.
        one_hot_labels = torch.nn.functional.one_hot(
            labels.to(logits.device).long(), num_classes=logits.shape[1]
        ).to(probabilities.dtype)

        # Soft confusion counts, summed over the batch for each class.
        true_positive = (probabilities * one_hot_labels).sum(dim=0)
        false_positive = (probabilities * (1 - one_hot_labels)).sum(dim=0)
        false_negative = ((1 - probabilities) * one_hot_labels).sum(dim=0)

        # Per-class precision, recall and F1.
        precision = true_positive / (true_positive + false_positive + epsilon)
        recall = true_positive / (true_positive + false_negative + epsilon)
        per_class_f1 = 2 * (precision * recall) / (precision + recall + epsilon)

        # Macro-F1 is the unweighted mean over classes; minimise 1 - macro-F1.
        return 1 - per_class_f1.mean()

In [27]:
def test_model (model_name, test_df, device, max_len=128):
    """Tests the model on the test set.

    Args:
        model_name: path of the saved model given to ``BertInference``.
        test_df: DataFrame with the ``text`` and ``relevance`` columns.
        device: device the model is run on.
        max_len: token length used for encoding (default 128, the value used
            for training in this notebook).

    Returns:
        The F1-score of the positive class (label 1) on ``test_df``.
    """
    bert_inference = BertInference(model_name, max_len, device)

    sentences = test_df["text"].values.tolist()
    labels = test_df["relevance"].values.tolist()
    predicted = bert_inference.infer(sentences)
    return f1_score (labels, predicted, pos_label = 1)

### Splitting dataset

In [32]:
def split_dataset (dataset, split,testing_size):
    """Splits the dataset into a training, validation and testing set.

    The last ``testing_size`` rows of ``dataset`` (by position) form the test
    set. The remaining rows are split by ``train_test_split`` with a fixed seed.

    Args:
        dataset: DataFrame to split; shuffle it beforehand, since the test set
            is taken by position.
        split: value passed as ``test_size`` to ``train_test_split``.
        testing_size: number of trailing rows reserved for testing.

    Returns:
        ``(train_df, val_df, test_df)``.

    Raises:
        ValueError: if ``testing_size`` is negative or larger than the dataset.
    """
    total_rows = dataset.shape[0]
    if testing_size < 0 or testing_size > total_rows:
        raise ValueError(
            f"testing_size must be between 0 and {total_rows}, got {testing_size}"
        )

    training_and_val_size = total_rows - testing_size
    train_val_df = dataset.iloc[:training_and_val_size]
    # Slice the `dataset` argument. The original read the global `shuffled_df`
    # here, which silently ignored the frame passed by the caller.
    test_df = dataset.iloc[training_and_val_size:]
    train_df, val_df = train_test_split(train_val_df, test_size=split, random_state=42)
    return train_df, val_df, test_df

In [33]:
# Number of trailing rows of the shuffled frame held out as the test set.
testing_size = 10000
# Passed to split_dataset, which forwards it as `test_size` to train_test_split.
split = 0.16
# NOTE(review): not read by any cell visible here — presumably a text form of `split`; confirm or remove.
split_text = "016"

train_df, val_df, test_df = split_dataset (shuffled_df, split,testing_size)

# Sanity check of the resulting sizes.
print("training : ", train_df.shape[0])
print("validation : ", val_df.shape[0])
print("training + validation : ", train_df.shape[0]+val_df.shape[0])
print("test : ", test_df.shape[0])

training :  52603
validation :  10020
training + validation :  62623
test :  10000


### Checking if the dataset is balanced

In [31]:
def proportions (df__):
    """Return the share of rows of ``df__`` whose ``relevance`` equals 1."""
    positive_rows = df__[df__["relevance"]==1]
    return len(positive_rows) / len(df__)


print("Class 1 proportion in the dataset. Regenerate it if there is an imbalance")
print(proportions(shuffled_df))
print(proportions(train_df))
print(proportions(val_df))
print(proportions(test_df))

Class 1 proportion in the dataset. Regenerate it if there is an imbalance
0.18302741555705493
0.18324049959127806
0.17904191616766468
0.1859


### Training Bert model

In [35]:
# Training parameters
assert torch.cuda.is_available()
device = "cuda"
class_weights = torch.tensor([1, 4], dtype=torch.float32).to(device)
max_len = 128

In [36]:
# TRAINING 1/2: batch size 16, class-weighted cross-entropy loss ("classic").
# Runs 7 rounds of `epochs` epochs, multiplying the learning rate by `decay` after each round.

batch_size=16
epochs = 4

# model_name = "best_tuned_bert_model" #if continuing training
model_name = "bert-base-uncased" #if starting training from scratch
# NOTE(review): `size` is not used in this cell.
size = train_df.shape[0]

lr = 2e-5 # we tried other values but this one gave the best results
decay = 0.9
for i in range(7) :
    print(f"Learning rate : {lr}")
    # NOTE(review): a new BertTrainer is built every round and reloads its weights
    # from `model_name`, so with "bert-base-uncased" each round restarts from the
    # pretrained weights; only the checkpoints and best_fscore.txt written by
    # `train` carry over between rounds. Confirm this is intended.
    bert_trainer = BertTrainer(device, class_weights, model_name, max_len,lr,"classic")

    bert_trainer.train(train_df["text"].values.tolist(),
                       train_df["relevance"].values.tolist(),
                       val_df["text"].values.tolist(),
                       val_df["relevance"].values.tolist(),
                       epochs, batch_size)

    lr = lr*decay


Learning rate : 2e-05


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
# TRAINING 2/2: batch size 32, MacroF1BinaryLoss ("custom"), starting from the
# best checkpoint saved by the first stage.

batch_size=32
epochs = 10
max_len = 128
model_name = "best_tuned_bert_model" # continue from the model saved by the previous stage (BertTrainer falls back to "bert-base-uncased" if this path does not exist)
# NOTE(review): `size` is not used in this cell.
size = train_df.shape[0]


lr = 2e-3 # larger than in stage 1: per the authors, the two losses are not on the same scale
print(f"Learning rate : {lr}")
bert_trainer = BertTrainer(device, class_weights, model_name, max_len,lr,"custom")

bert_trainer.train(train_df["text"].values.tolist(),
                   train_df["relevance"].values.tolist(),
                   val_df["text"].values.tolist(),
                   val_df["relevance"].values.tolist(),
                   epochs, batch_size)

### Testing Bert on its own

In [37]:
# If you are using CUDA and just trained the BERT model, you may need to free
# the cached GPU memory first:
# torch.cuda.empty_cache()

# Evaluate the best saved BERT checkpoint on its own, on the held-out test set.
test_score = test_model ("best_tuned_bert_model", test_df, device)
print("Testing score of best iteration : ", test_score)

# Optional: evaluate another checkpoint. Kept as comments, not a string
# literal, so it is not echoed as the cell output.
# test_score = test_model ("pretrained_bert_model", test_df, device)
# print("Testing score of last round best iteration : ", test_score)

  0%|          | 0/10000 [00:04<?, ?it/s]


KeyboardInterrupt: 