#  Data visualisation

In [4]:
! pip install tqdm
! pip install nltk
! pip install transformers
! pip install scikit-learn



In [5]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import json
from multiprocessing import Pool
from tqdm import tqdm

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import numpy as np
import re
%matplotlib inline

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')


pd.set_option("display.max_rows", 400)

[nltk_data] Downloading package punkt to /home/remi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/remi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/remi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:

print(os.path.exists("training.csv"))

True


In [7]:
training_data_folder = "training"
annotations_file = "training_labels.json"
training_csv = "training.csv"

In [8]:



tags = set()
def get_tags_from_files (): #Run only once
    txt_files = [f for f in os.listdir(training_data_folder) if f.endswith('.txt')]

    for txt_file in txt_files:
        with open(os.path.join(training_data_folder,txt_file), 'r') as file:
            txt = file.read()
        for line in txt.split("\n"):
            if line:
                tag = line.split(" ")[1]
                if not tag in tags:
                    tags.add (tag)
#get_tags_from_files()
tags = ['Acknowledgement',
 'Alternation',
 'Background',
 'Clarification_question',
 'Comment',
 'Conditional',
 'Continuation',
 'Contrast',
 'Correction',
 'Elaboration',
 'Explanation',
 'Narration',
 'Parallel',
 'Q-Elab',
 'Question-answer_pair',
 'Result']


In [9]:
#TFIDF computing

def filter_special_characters (text):
    regex = r'[^a-zA-Z0-9\s.]'
    text = re.sub(regex,'',text)
    return text

def keep_only_noun_and_verbs (text):
    pos_tag = nltk.pos_tag(text.split())
    pos_tagged_noun_verb = []
    for word,tag in pos_tag:
        if tag == "NN" or tag == "NNP" or tag == "NNS" or tag == "VB" or tag == "VBD" or tag == "VBG" or tag == "VBN" or tag == "VBP" or tag == "VBZ":
            pos_tagged_noun_verb.append(word)
    return " ".join(pos_tagged_noun_verb)

def tokenize_and_filter_stopwords(text):
    # Tokenize the text
    text = filter_special_characters (text)
    text = keep_only_noun_and_verbs(text)
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]

    return filtered_words

def frequency (token, tokens):
    return len([t for t in tokens if t==token])/len(tokens)

def inverse_document_frequency (token, tokenized_sentences):
    d = len(tokenized_sentences)
    presence = len([sentence for sentence in tokenized_sentences if token in sentence[0]])
    return d/presence

def tfidf (tokenized_sentence, tokenized_sentences):
    #tokenized_sentence = tokenize_and_filter_stopwords(sentence)
    #tokenized_sentences = [tokenize_and_filter_stopwords(sentence) for sentence in sentences]
    words = set (tokenized_sentence)
    words_scores = {}
    for word in words:
        tfidf_ = frequency(word,words)*(np.log(1+inverse_document_frequency(word,tokenized_sentences)))
        words_scores[word] = tfidf_
    return words_scores


def sentencize (text):

    sentences = sent_tokenize(text)
    tokenized_sentences = [tokenize_and_filter_stopwords(sentence) for sentence in sentences]
    indexes = range(0,len(sentences))
    return list(zip(sentences,indexes)), list(zip(tokenized_sentences,indexes))

def sentences_scores (sentences ):
    sentences_scores = {}
    for sentence,index in sentences:
        words_scores = tfidf(sentence,sentences)
        score = sum([words_scores[word] for word in sentence])
        sentences_scores[" ".join(sentence)] = (score, index)
    return dict((sentences_scores.items()))


def extractive_summary (sentences, threshold):

    tokenized_sentences = list(zip([tokenize_and_filter_stopwords(sentence) for sentence in sentences],
                                   range(len(sentences))))

    sentences_scores_ = list(sentences_scores (tokenized_sentences).items())

    sentences_indexes = [sentence_and_score[1][1] for sentence_and_score in  sentences_scores_ if sentence_and_score[1][0]>=threshold]
    sentences_indexes.sort()
    summary = [sentences[index] for index in sentences_indexes]

    return summary
    #print(summary)


def tfidf_sentence_scores (sentences):
    tokenized_sentences = list(zip([tokenize_and_filter_stopwords(sentence) for sentence in sentences],range(len(sentences))))
    sentences_scores_ = sentences_scores (tokenized_sentences)
    return sentences_scores_



In [10]:
#Loading training data from json and txt files. 

def load_training_data ():

    with open(annotations_file, 'r') as file:
        annotations = json.load(file)

    json_files = [f for f in os.listdir(training_data_folder) if f.endswith('.json')]
    json_files.sort()
    dfs = []
    for json_file in json_files:
        file_path = os.path.join(training_data_folder, json_file)
        with open(file_path, 'r') as file:
            data = pd.json_normalize(json.load(file))
        shortname = json_file.split(".")[0]
        data["file"] = shortname
        relevance = annotations[shortname]
        data["relevance"] = relevance
        dfs.append (data)

    df = pd.concat (dfs, ignore_index=True)
    txt_files = [f for f in os.listdir(training_data_folder) if f.endswith('.txt')]
    for tag in tags:
        df[tag] = 0
    txt_files = [f for f in os.listdir(training_data_folder) if f.endswith('.txt')]
    txt_files.sort()
    print("extraction des données du graphe (cela va prendre un certain temps)")
    for i,txt_file in tqdm(enumerate(txt_files)):
        shortname = txt_file.split(".")[0]
        with open(os.path.join(training_data_folder,txt_file), 'r') as file:
            txt = file.read()
        for line in txt.split("\n"):

            if line:
                items = line.split(" ")
                tag = items[1]
                referenced = items[2]
                df.loc[(df['index'] == int(referenced)) & (df['file'] == shortname), tag] = 1
    return df

def get_files ():
    return list(set(df["file"].values.tolist()))

def add_tfidf_scores ():
    files = sorted(get_files())
    for file in tqdm(files):
        sentences = df[df["file"]==file]["text"].values.tolist()
        sentences_and_scores = tfidf_sentence_scores (sentences)
        scores = [0] * len(sentences)
        for score,index in sentences_and_scores.values():
            scores[index] = score

        df.loc[df["file"] == file, "score"] = scores

In [11]:
#You only need to run load_trainigng_data once
if os.path.exists(training_csv):
    df = pd.read_csv (training_csv)
else:
    print(f"Génération de {training_csv}")
    df = load_training_data ()
    add_tfidf_scores()
    df.to_csv(training_csv,index=False)

original_df = df
shuffled_df = original_df.sample(frac=1,random_state=42)

In [13]:
df.head()

Unnamed: 0.1,Unnamed: 0,speaker,text,index,file,relevance,Acknowledgement,Alternation,Background,Clarification_question,...,Contrast,Correction,Elaboration,Explanation,Narration,Parallel,Q-Elab,Question-answer_pair,Result,score
0,0,PM,Okay,0,ES2002a,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
1,1,PM,Right,1,ES2002a,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0.0
2,2,PM,<vocalsound> Um well this is the kick-off meet...,2,ES2002a,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3.954745
3,3,PM,Um <vocalsound> and um,3,ES2002a,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,3.531381
4,4,PM,this is just what we're gonna be doing over th...,4,ES2002a,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,4.949238


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, f1_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

import numpy as np


In [66]:
DO NOT RUN : OLD TRAINING LOOP


"""def compute_sample_weights(y, class_weights):
    sample_weights = [class_weights[label] for label in y]
    return sample_weights




original_df_size = original_df.shape[0]


additional_features = tags + ["score"]
max_embedding_features = [300,400,500]
#sizes = [500,800,1000,1500,2000,2500,3000,3500,4000]
sizes = [5000,7500,10000]
repeats = 3
#while size<=32000:
#ada_means=[]
#svc_means=[]
class_weights = [1,10]
results_df_ada = pd.DataFrame ({f"Size {s}":[] for s in sizes})
results_df_svm = pd.DataFrame ({f"Size {s}":[] for s in sizes})
print(results_df_ada)
for size in sizes:
    for max_embedding in max_embedding_features:
        print(f"Size {size}")
        print(f"Features {max_embedding_features}")
        mean_ada = 0
        mean_svc = 0
        for j in range(repeats):
            print("starting")
            #df = original_df.head(size)#for quick tests

            #df = original_df
            # Split the dataset
            df = shuffled_df
            X_train, X_test, y_train, y_test = train_test_split(df[['text']+additional_features], df['relevance'], test_size=1-size/original_df_size, random_state=42)
            X_train_sentences = X_train['text']
            X_test_sentences = X_test['text']



            print("splitting done")

            # Vectorize sentences using TfidfVectorizer
            vectorizer = TfidfVectorizer(max_features=max_embedding)
            X_train_vectorized = vectorizer.fit_transform(X_train_sentences)
            X_test_vectorized = vectorizer.transform(X_test_sentences)

            print("fit and transform done")

            # Add features from 'tags' column

            # Concatenate the TF-IDF vectorized features with the 'tags' features
            X_train_combined = np.concatenate([X_train_vectorized.toarray(), X_train[additional_features]], axis=1)
            X_test_combined = np.concatenate([X_test_vectorized.toarray(), X_test[additional_features]], axis=1)

            print("test transform done")
            print("training samples : ", len(y_train))
            print("embedding dimension : ", X_train_combined.shape[1])


            # SVM Classifier
            classifier = SVC(kernel='linear',class_weight={0:class_weights[0],1:class_weights[1]})
            classifier.fit(X_train_combined, y_train)

            print("classifier trained")

            # Predictions
            print("SVC")
            predictions = classifier.predict(X_test_combined)
            print(f"For size {size}")
            print("Classification Report:")
            print(classification_report(y_test, predictions))
            mean_svc += f1_score(y_test,predictions,pos_label=1)
            #size = size*2

            print("ADA")

            sample_weights = compute_sample_weights(y_train, class_weights)
            ada_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)
            ada_classifier.fit(X_train_combined, y_train, sample_weight=sample_weights)

            ada_predictions = ada_classifier.predict(X_test_combined)

            mean_ada += f1_score(y_test,ada_predictions, pos_label=1)
            print(f"For size {size}")
            print("Classification Report:")
            print(classification_report(y_test, ada_predictions))

        mean_svc/=repeats
        mean_ada/=repeats
        results_df_ada[f"Size {size}"][f"Embeddings {max_embedding}"] = mean_ada
        results_df_svm[f"Size {size}"][f"Embeddings {max_embedding}"] = mean_svc
        #ada_means.append(mean_ada)
        #svc_means.append(mean_svc)
"""

Empty DataFrame
Columns: [Size 5000, Size 7500, Size 10000]
Index: []
Size 5000
Features [300, 400, 500]
starting
splitting done
  (0, 238)	0.40233572424686553
  (0, 122)	0.32383689342588384
  (0, 35)	0.6000280124502345
  (0, 235)	0.22905742828369255
  (0, 261)	0.26644581350216784
  (0, 244)	0.34482677825852837
  (0, 280)	0.361740005718357
  (1, 1)	0.4511022544451834
  (1, 21)	0.36155731400078656
  (1, 135)	0.47973596607564817
  (1, 279)	0.27708537486353696
  (1, 286)	0.48109388387179636
  (1, 244)	0.35694374559355757
  (2, 163)	1.0
  (3, 265)	0.6883726901612565
  (3, 254)	0.4365562050988124
  (3, 123)	0.3834710560660379
  (3, 10)	0.4341793044232719
  (4, 168)	1.0
  (5, 294)	1.0
  (6, 286)	1.0
  (7, 169)	1.0
  (8, 133)	0.7130173379683883
  (8, 217)	0.4365387427140126
  (8, 147)	0.46214937264880757
  :	:
  (4992, 244)	0.309787678914371
  (4993, 266)	0.44327523612531045
  (4993, 149)	0.41124847831077094
  (4993, 247)	0.5688961780562365
  (4993, 117)	0.2828979619153487
  (4993, 85)	0.2920

KeyboardInterrupt: ignored

In [None]:
original_df.shape

In [18]:
#Loading the libraries we need for

from transformers import BertTokenizer, BertForSequenceClassification, AdamW, logging
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
import warnings

import torch.nn as nn

#import intel_extension_for_pytorch as ipex


In [19]:
# Tokenize and create DataLoader
class CustomDataset(Dataset):
    def __init__(self, text, labels, tokenizer, max_len):
        self.text = text
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = str(self.text[idx])
        label = int(self.labels[idx])

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True,
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


class BertTrainer():
    def __init__(self, device, class_weights, model_name, max_len, lr, loss_type):
        self.device = device
        self.class_weights = class_weights
        if not os.path.exists (model_name):
            model_name = "bert-base-uncased"
        self.max_len = max_len
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
        self.model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)
        self.model.to(device)
        self.model.train()
        self.lr = lr
        
        self.optimizer = torch.optim.SGD(self.model.parameters(), lr=lr)

        #self.optimizer = AdamW(self.model.parameters(), lr=lr)
        #self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer, step_size=1, gamma=0.5)
        self.loss_type = loss_type

    def train(self, x_train, y_train, x_val, y_val, epochs, batch_size):
        print("training")
        train_dataset = CustomDataset(text=x_train, labels=y_train, tokenizer=self.tokenizer, max_len=self.max_len)
        train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)

        val_dataset = CustomDataset(text=x_val, labels=y_val, tokenizer=self.tokenizer, max_len=self.max_len)
        val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
        macro_f1_loss = MacroF1BinaryLoss(self.device)
        if os.path.exists("best_fscore.txt"):
            with open("best_fscore.txt","r") as file:
                best_f1 = float(file.read ())

        else:

            best_f1 = 0
        new_best = 0
        for epoch in range(epochs):
            for batch in tqdm(train_loader):
                # print("batch")
                input_ids = batch['input_ids'].to(self.device)
                attention_mask = batch['attention_mask'].to(self.device)
                labels = batch['label'].to(self.device)
                

                self.optimizer.zero_grad()

                outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                # loss = outputs.loss
                
               
                if self.loss_type=="custom":
                    loss = macro_f1_loss(outputs.logits, labels)
                    loss.backward()
                else:
                    weighted_loss = torch.nn.functional.cross_entropy(outputs.logits, labels, weight=self.class_weights)

                    weighted_loss.backward() 
                self.optimizer.step()
            
            #self.scheduler.step()
            print("trained. Validation starts now")
            # Validation loop
            self.model.eval()
            val_losses = []
            val_correct = 0
            all_predictions = []
            all_labels = []
            with torch.no_grad():
                for batch in tqdm(val_loader):
                    input_ids = batch['input_ids'].to(self.device)
                    attention_mask = batch['attention_mask'].to(self.device)
                    labels = batch['label'].to(self.device)

                    outputs = self.model(input_ids, attention_mask=attention_mask, labels=labels)
                    
                    logits = outputs.logits
                    #loss = macro_f1_loss(logits, labels)
                    
                    #loss = torch.nn.functional.cross_entropy(outputs.logits, labels, weight=self.class_weights)

                    if self.loss_type=="custom":
                        loss = macro_f1_loss(outputs.logits, labels)
                    else:
                        loss = torch.nn.functional.cross_entropy(outputs.logits, labels, weight=self.class_weights)
                    probabilities = torch.softmax(logits, dim=1)
                    predictions = [torch.argmax(softmax) for softmax in probabilities]
                    all_predictions.extend(predictions)
                    all_labels.extend(labels.cpu().numpy())
                    val_losses.append(loss.item())
                    val_correct += (torch.argmax(logits, dim=1) == labels).sum().item()

            val_loss = sum(val_losses) / len(val_losses)
            val_accuracy = val_correct / len(x_val)
            all_labels_cpu = np.array(all_labels)
            all_predictions_cpu = [pred.cpu().numpy() for pred in all_predictions]
            val_f1 = f1_score(all_labels_cpu, all_predictions_cpu, pos_label=1)

            if val_f1 >= best_f1:
                print("Best F1-score")
                print("Saving model")
                self.model.save_pretrained('best_tuned_bert_model')

                self.model.save_pretrained('current_best_tuned_bert_model')
                print("Saving F1-score ")
                with open('best_fscore.txt', 'w') as file:
                    # Write the value of the variable to the file

                    file.write(str(val_f1))
                best_f1 = val_f1
                new_best = val_f1
            elif val_f1 >= new_best:
                print("Current best F1-score")
                print("Saving model")
                self.model.save_pretrained('current_best_tuned_bert_model')
                print("current best : " + str(val_f1))
                new_best = val_f1

            #print(x_val)
            """
            labels = []
            for sentence in x_val:
                encoded_input = self.tokenizer.encode_plus(
                    sentence,
                    add_special_tokens=True,
                    max_length=self.max_len,
                    return_token_type_ids=False,
                    padding="max_length",
                    return_attention_mask=True,
                    return_tensors='pt',
                    truncation=True,
                )

                with torch.no_grad():
                    outputs = self.model(**encoded_input)
                    logits = outputs.logits
                    probabilities = torch.softmax(logits, dim=1)
                    predicted_label = torch.argmax(probabilities, dim=1).tolist()
                    labels.append(predicted_label[0])
            print(all_predictions)
            print(labels)

            f1 = f1_score(y_val, labels)
            """
            print (f'Epoch {epoch + 1}/{epochs}, F1 : {val_f1} Loss: {loss.item()}, Val Loss: {val_loss}, Val Accuracy: {val_accuracy}')
            print(f'And best F1 : {best_f1}')
            self.model.save_pretrained('last_tuned_bert_model')

class MacroF1BinaryLoss(nn.Module):
    def __init__(self, device):
        self.device = device
        super(MacroF1BinaryLoss, self).__init__()
    def forward(self, logits, labels):
        # Apply sigmoid to get probabilities
        probabilities = torch.softmax(logits, dim=1)
        
        # Convert labels to one-hot encoding
        labels = labels.to(self.device)
        eye = torch.eye(2).to(self.device)
        one_hot_labels = eye[labels]

        # Calculate true positive, false positive, and false negative
        true_positive = (probabilities * one_hot_labels).sum(dim=0)
        false_positive = (probabilities * (1 - one_hot_labels)).sum(dim=0)
        false_negative = ((1 - probabilities) * one_hot_labels).sum(dim=0)

        # Calculate precision, recall, and F1 score for each class
        precision = true_positive / (true_positive + false_positive + 1e-10)
        recall = true_positive / (true_positive + false_negative + 1e-10)
        f1_score = 2 * (precision * recall) / (precision + recall + 1e-10)

        # Macro-F1 is the average F1 score across all classes
        macro_f1 = f1_score.mean()

        # Use 1 - Macro-F1 as the loss (since we want to minimize it)
        loss = 1 - macro_f1
        return loss
            
            
class BertInference ():
    def __init__ (self, model_path, max_len, device):
        self.device = torch.device(device)
        self.max_len = max_len
        self.model_path = model_path  # Update with the correct path
        self.loaded_model = BertForSequenceClassification.from_pretrained(self.model_path)
        self.loaded_model.to(self.device)
        self.loaded_model.eval()
        self.tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    def infer(self,sentences, return_type="labels"):

        labels = []
        probabilities = []
        for sentence in tqdm(sentences):
          encoded_input = self.tokenizer.encode_plus(
                  sentence,
                  add_special_tokens=True,
                  max_length=self.max_len,
                  return_token_type_ids=False,
                  #pad_to_max_length=True,
                  padding="max_length",
                  return_attention_mask=True,
                  return_tensors='pt',
                  truncation=True,
              )
          encoded_input = {key: tensor.to(self.device) for key, tensor in encoded_input.items()}
          with torch.no_grad():
              outputs = self.loaded_model(**encoded_input)
              logits = outputs.logits
              probabilities_ = torch.softmax(logits, dim=1)
              predicted_label = torch.argmax(probabilities_, dim=1).tolist()
              probabilities.append (probabilities_[0])
              labels.append(predicted_label[0])

        return labels if return_type=="labels" else probabilities
    """

    def infer(self, sentences, return_type="labels"):
        labels = []
        probabilities = []

        encoded_inputs = self.tokenizer(sentences, 
                                        add_special_tokens=True,
                                        max_length=self.max_len,
                                        padding="max_length",
                                        truncation=True,
                                        return_tensors='pt')

        encoded_inputs = {key: tensor.to(self.device) for key, tensor in encoded_inputs.items()}

        with torch.no_grad():
            outputs = self.loaded_model(**encoded_inputs)
            logits = outputs.logits
            probabilities_ = torch.softmax(logits, dim=1)
            predicted_labels = torch.argmax(probabilities_, dim=1).tolist()

            probabilities.extend(probabilities_.cpu().numpy())
            labels.extend(predicted_labels)

        return labels if return_type == "labels" else probabilities
    """


class TFVectorizer ():
  def __init__(self, max_embedding):
    self.max_embedding = max_embedding
    self.vectorizer = TfidfVectorizer(max_features=max_embedding)

  def train (self, x_train):
    x_train_vectorized = self.vectorizer.fit_transform(x_train)
  def infer(self,x_infer):
    x_test_vectorized = self.vectorizer.transform(x_infer)
    return x_test_vectorized








def test_model (model_name, test_df, device):
    #test_df = shuffled_df.sample(frac=1)[10000:15000]
    max_len  = 128
    bert_inference = BertInference(model_name, max_len, device)

    sentences = test_df["text"].values.tolist()
    labels = test_df["relevance"].values.tolist()
    predicted = bert_inference.infer(sentences)
    return f1_score (labels, predicted, pos_label = 1)
    """x=[]
    predicted = []
    scores= []
    for i,sentence in tqdm(enumerate(sentences)):
        predicted.append (bert_inference.infer ([sentence])[0])

    return f1_score (labels, predicted, pos_label = 1)"""

def split_dataset (dataset, split,testing_size):
  training_and_val_size = dataset.shape[0] - testing_size
  df = dataset[0:training_and_val_size]
  train_df, val_df = train_test_split(df, test_size=split, random_state=42)
  test_df = shuffled_df[training_and_val_size:]
  return train_df, val_df, test_df




In [20]:
#HERE WE SPLIT OUR DATASET
testing_size = 10000
split = 0.16
split_text = "016"

train_df, val_df, test_df = split_dataset (shuffled_df, split,testing_size)

print("training : ", train_df.shape[0])
print("validation : ", val_df.shape[0])
print("training + validation : ", train_df.shape[0]+val_df.shape[0])
print("test : ", test_df.shape[0])


training :  52603
validation :  10020
training + validation :  62623
test :  10000


In [21]:
#Checking the dataset is balanced

def proportions (df__):
    return df__[df__["relevance"]==1].shape[0]/df__.shape[0]


print("Class 1 proportion in the dataset. Regenerate it if there is an imbalance")
print(proportions(shuffled_df))
print(proportions(train_df))
print(proportions(val_df))
print(proportions(test_df))

Class 1 proportion in the dataset. Regenerate it if there is an imbalance
0.18302741555705493
0.18324049959127806
0.17904191616766468
0.1859


In [45]:
# Uncomment if you wanna train starting from scratch instead of using best_tuned_bert_model
#! rm best_fscore.txt

rm: cannot remove 'best_tuned_bert_model': No such file or directory
rm: cannot remove 'best_fscore.txt': No such file or directory


In [23]:
#Some hyperparameters
device = "cuda"
class_weights = torch.tensor([1, 4], dtype=torch.float32).to(device)
max_len = 128


SyntaxError: unterminated string literal (detected at line 2) (1351807629.py, line 2)

In [None]:
# TRAINING 1/2

batch_size=16
epochs = 4

model_name = "best_tuned_bert_model" #if continuing training
model_name = "bert-base-uncased" #if starting training
size = train_df.shape[0]

#lr = 3e-5
#lr = 2.6e-6
lr = 2e-5
decay = 0.9
for i in range(7) :
    print(f"Learning rate : {lr}")
    bert_trainer = BertTrainer(device, class_weights, model_name, max_len,lr,"classic")

    bert_trainer.train(train_df["text"].values.tolist(),
                       train_df["relevance"].values.tolist(),
                       val_df["text"].values.tolist(),
                       val_df["relevance"].values.tolist(),
                       epochs, batch_size)

    lr = lr*decay


Learning rate : 2e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training


100%|██████████████████████████████████████| 3288/3288 [07:55<00:00,  6.92it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.27it/s]


Current best F1-score
Saving model
current best : 0.49490482599500085
Epoch 1/4, F1 : 0.49490482599500085 Loss: 0.5528117418289185, Val Loss: 0.5934765278532554, Val Accuracy: 0.7378243512974052
And best F1 : 0.5922242314647378


100%|██████████████████████████████████████| 3288/3288 [07:52<00:00,  6.96it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.00it/s]


Current best F1-score
Saving model
current best : 0.5402470984649943
Epoch 2/4, F1 : 0.5402470984649943 Loss: 0.35192930698394775, Val Loss: 0.48555406177062926, Val Accuracy: 0.7548902195608782
And best F1 : 0.5922242314647378


100%|██████████████████████████████████████| 3288/3288 [07:52<00:00,  6.96it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.18it/s]


Current best F1-score
Saving model
current best : 0.5544591323775834
Epoch 3/4, F1 : 0.5544591323775834 Loss: 0.256929486989975, Val Loss: 0.45826278115953933, Val Accuracy: 0.7611776447105788
And best F1 : 0.5922242314647378


100%|██████████████████████████████████████| 3288/3288 [07:53<00:00,  6.95it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.26it/s]


Current best F1-score
Saving model
current best : 0.5595105672969967
Epoch 4/4, F1 : 0.5595105672969967 Loss: 0.21521960198879242, Val Loss: 0.4482497566957413, Val Accuracy: 0.7628742514970059
And best F1 : 0.5922242314647378
Learning rate : 1.8e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training


100%|██████████████████████████████████████| 3288/3288 [08:01<00:00,  6.83it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.14it/s]


Current best F1-score
Saving model
current best : 0.5155925155925156
Epoch 1/4, F1 : 0.5155925155925156 Loss: 0.5013496279716492, Val Loss: 0.5460629142357402, Val Accuracy: 0.7209580838323353
And best F1 : 0.5922242314647378


100%|██████████████████████████████████████| 3288/3288 [07:52<00:00,  6.96it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.31it/s]


Current best F1-score
Saving model
current best : 0.5504621769477457
Epoch 2/4, F1 : 0.5504621769477457 Loss: 0.2870580554008484, Val Loss: 0.470354030648487, Val Accuracy: 0.7621756487025948
And best F1 : 0.5922242314647378


100%|██████████████████████████████████████| 3288/3288 [07:52<00:00,  6.96it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.24it/s]


Current best F1-score
Saving model
current best : 0.5568862275449101
Epoch 3/4, F1 : 0.5568862275449101 Loss: 0.2323976457118988, Val Loss: 0.4554196099440257, Val Accuracy: 0.7636726546906187
And best F1 : 0.5922242314647378


100%|██████████████████████████████████████| 3288/3288 [07:52<00:00,  6.96it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.22it/s]


Current best F1-score
Saving model
current best : 0.5571164510166358
Epoch 4/4, F1 : 0.5571164510166358 Loss: 0.20376719534397125, Val Loss: 0.44842143665755574, Val Accuracy: 0.7608782435129741
And best F1 : 0.5922242314647378
Learning rate : 1.62e-05


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


training


100%|██████████████████████████████████████| 3288/3288 [08:01<00:00,  6.83it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.26it/s]


Current best F1-score
Saving model
current best : 0.33245975290153496
Epoch 1/4, F1 : 0.33245975290153496 Loss: 0.5473620295524597, Val Loss: 0.6379978385838595, Val Accuracy: 0.8220558882235529
And best F1 : 0.5922242314647378


100%|██████████████████████████████████████| 3288/3288 [07:52<00:00,  6.96it/s]


trained. Validation starts now


100%|████████████████████████████████████████| 627/627 [00:36<00:00, 17.30it/s]


Current best F1-score
Saving model
current best : 0.532675222112537
Epoch 2/4, F1 : 0.532675222112537 Loss: 0.4238584637641907, Val Loss: 0.5340730393902536, Val Accuracy: 0.7637724550898204
And best F1 : 0.5922242314647378


 64%|████████████████████████▎             | 2108/3288 [05:02<02:51,  6.87it/s]

In [22]:

#TRAINING 2/2 : batch 16==>32 loss function cross_entropy=>MacroF1

batch_size=32
epochs = 10
max_len = 128
model_name = "best_tuned_bert_model" #We continue training the model we made at last step
size = train_df.shape[0]

#lr = 2e-5
lr = 2e-3 # best learning rate
print(f"Learning rate : {lr}")
bert_trainer = BertTrainer(device, class_weights, model_name, max_len,lr,"custom")

bert_trainer.train(train_df["text"].values.tolist(),
                   train_df["relevance"].values.tolist(),
                   val_df["text"].values.tolist(),
                   val_df["relevance"].values.tolist(),
                   epochs, batch_size)



Learning rate : 0.002


NameError: name 'device' is not defined

In [144]:









#If you are using cuda and you just trained the Bert model you might need to run the following line
#torch.cuda.empty_cache()  



# HERE WE TEST THE BERT MODEL ON ITS  OWN
test_score = test_model ("best_tuned_bert_model", test_df, "cuda")
print("Testing score of best iteration : ", test_score)
"""
test_score = test_model ("pretrained_bert_model", test_df)
print("Testing score of last round best iteration : ", test_score)"""

100%|█████████████████████████████████████| 10000/10000 [02:38<00:00, 63.13it/s]

Testing score of best iteration :  0.6110197368421052





'\ntest_score = test_model ("pretrained_bert_model", test_df)\nprint("Testing score of last round best iteration : ", test_score)'

In [45]:
# HERE WE TRAIN THE TFIDF VECTORIZER


max_embeddings = 300
tfidf_vectorizer = TFVectorizer (max_embeddings)
tfidf_vectorizer.train(train_df["text"])

In [46]:
#HERE WE PREPARE THE TRAINING OF THE SCALER AND THE CLASSIFIER
model_used = "best_tuned_bert_model"
class_weights_svc = [1,3]
additional_features = tags + ["score"]
print("Bert inference")
bert_inference = BertInference(model_used, max_len, "cuda")
train_df_bert_probabilities = bert_inference.infer (train_df["text"].values.tolist(), return_type="probabilities")
train_df_bert_probabilities =  [tensor.cpu().numpy() for tensor in train_df_bert_probabilities]
print("Tfidf vectorization")
train_df_vectorized = tfidf_vectorizer.infer (train_df["text"])

train_df_combined = np.concatenate([train_df_vectorized.toarray(), train_df_bert_probabilities, train_df[additional_features]], axis=1)


Bert inference


100%|████████████████████████████████████| 52603/52603 [14:05<00:00, 62.24it/s]


Tfidf vectorization


In [47]:
print("Scaler fitting")
scaler = StandardScaler()
scaler.fit(train_df_combined)
train_df_combined_scaled = scaler.transform(train_df_combined)

Scaler fitting


In [48]:
 # Classifier training
class_weights = [1,2]


def compute_sample_weights(y, class_weights):
    
    sample_weights = [class_weights[label] for label in y]
    return sample_weights
print("Classifier fitting")

y_train_true = train_df["relevance"].values.tolist()

sample_weights = compute_sample_weights(y_train_true, class_weights)
ada_classifier = AdaBoostClassifier(n_estimators=50, random_state=42)
ada_classifier.fit(train_df_combined_scaled, y_train_true, sample_weight=sample_weights)

Classifier fitting


In [49]:
# HERE WE PREPARE THE TEST DATA
bert_inference = BertInference(model_used, max_len, "cuda")
test_df_bert_probabilities = bert_inference.infer (test_df["text"].values.tolist(), return_type="probabilities")
test_df_bert_probabilities =  [tensor.cpu().numpy() for tensor in test_df_bert_probabilities]
test_df_vectorized = tfidf_vectorizer.infer (test_df["text"])

test_df_combined = np.concatenate([test_df_vectorized.toarray(), test_df_bert_probabilities, test_df[additional_features]], axis=1)
test_df_combined_scaled = scaler.transform (test_df_combined)


100%|████████████████████████████████████| 10000/10000 [02:44<00:00, 60.77it/s]


In [24]:
#HERE WE TEST THE CLASSIFIER
#for w in np.linspace(1,2,10):
#    print(w)

ada_predictions = ada_classifier.predict(test_df_combined_scaled)
y_test_true = test_df["relevance"].values.tolist()

print(classification_report(y_test_true, ada_predictions))

NameError: name 'ada_classifier' is not defined

In [None]:
# GENERATING THE BENCHMARK DATASET PREDICTIONS

In [30]:
def load_test_data (test_folder):

    json_files = [f for f in os.listdir(test_folder) if f.endswith('.json')]
    json_files.sort()
    dfs = []
    for json_file in json_files:
        file_path = os.path.join(test_folder, json_file)
        with open(file_path, 'r') as file:
            data = pd.json_normalize(json.load(file))
        shortname = json_file.split(".")[0]
        data["file"] = shortname
        dfs.append (data)

    df_ = pd.concat (dfs, ignore_index=True)
    for tag in tags:
        df_[tag] = 0
    txt_files = [f for f in os.listdir(test_folder) if f.endswith('.txt')]
    txt_files.sort()
    print("extraction des données du graphe (cela va prendre un certain temps)")
    for i,txt_file in tqdm(list(enumerate(txt_files))):
        shortname = txt_file.split(".")[0]
        with open(os.path.join(test_folder,txt_file), 'r') as file:
            txt = file.read()
        for line in txt.split("\n"):

            if line:
                items = line.split(" ")
                tag = items[1]
                referenced = items[2]
                df_.loc[(df_['index'] == int(referenced)) & (df_['file'] == shortname), tag] = 1
    return df_

def get_files_ (df_):
    return list(set(df_["file"].values.tolist()))

def add_tfidf_scores_ (df_):
    files = sorted(get_files_(df_))
    for file in tqdm(files):
        sentences = df_[df_["file"]==file]["text"].values.tolist()
        sentences_and_scores = tfidf_sentence_scores (sentences)
        scores = [0] * len(sentences)
        for score,index in sentences_and_scores.values():
            scores[index] = score
        
        df_.loc[df_["file"] == file, "score"] = scores

In [51]:
# PLEASE INSURE THAT THE MODEL HAS BEEN TRAINED WITH BERT MODEL model_used USED FOR TRAINING !
model_used = "best_tuned_bert_model"

test_folder  = "test"
df_kaggle_test = load_test_data(test_folder)
add_tfidf_scores_ (df_kaggle_test)

extraction des données du graphe (cela va prendre un certain temps)


100%|██████████████████████████████████████████| 40/40 [01:22<00:00,  2.06s/it]
100%|██████████████████████████████████████████| 40/40 [00:27<00:00,  1.45it/s]


In [52]:
bert_inference = BertInference(model_used, max_len, "cuda")
df_kaggle_test_probabilities = bert_inference.infer (df_kaggle_test["text"].values.tolist(), return_type="probabilities")
df_kaggle_test_probabilities =  [tensor.cpu().numpy() for tensor in df_kaggle_test_probabilities]


100%|████████████████████████████████████| 31026/31026 [08:23<00:00, 61.67it/s]


In [53]:
df_kaggle_test_vectorized = tfidf_vectorizer.infer (df_kaggle_test["text"])

In [54]:

df_kaggle_test_vectorized_combined = np.concatenate([df_kaggle_test_vectorized.toarray(), df_kaggle_test_probabilities, df_kaggle_test[additional_features]], axis=1)
df_kaggle_test_vectorized_combined_scaled = scaler.transform (df_kaggle_test_vectorized_combined)

In [55]:
ada_predictions = ada_classifier.predict(df_kaggle_test_vectorized_combined_scaled)

In [42]:
bert_predictions = [np.argmax(pb) for pb in df_kaggle_test_probabilities]
for i in range (200):
    print(df_kaggle_test.iloc [i])
    print(bert_predictions[i])

speaker                            PM
text                      Okay , well
index                               0
file                          ES2003a
Acknowledgement                     0
Alternation                         0
Background                          0
Clarification_question              0
Comment                             0
Conditional                         0
Continuation                        0
Contrast                            0
Correction                          0
Elaboration                         0
Explanation                         0
Narration                           0
Parallel                            0
Q-Elab                              0
Question-answer_pair                0
Result                              0
score                             0.0
Name: 0, dtype: object
0
speaker                                               PM
text                      I think we're ready to begin .
index                                                  1
file  

In [56]:
def create_json (predictions, df_test_):
    dico = {}
    for file in get_files_(df_test_):
        indexes = df_test_.index[df_test_['file'] ==  file].tolist()
        dico[file] = [int(predictions[i]) for i in indexes]
    return dico
json_bert = create_json(bert_predictions, df_kaggle_test)
json_all = create_json(ada_predictions, df_kaggle_test)

with open("json_bert_output.json", "w") as json_file:
    json.dump(json_bert, json_file, indent=2)

with open("json_all_output.json", "w") as json_file:
    json.dump(json_all, json_file, indent=2)