<a href="https://colab.research.google.com/github/mille-s/Sentence-similarity/blob/main/Lab_Week10_fineTuning_base.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#### Mount drive (optional)

In [None]:
# Mount Google Drive under /content/drive. The fine-tuned model is later saved
# below /content/drive/MyDrive (see model_save_path), so the mount is only
# needed if that path is kept.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# **FT-1 Dataset**

### FT-1.1 Load WebNLG v3.0

In [1]:
# For clearing outputs of installs
from IPython.display import clear_output

# datasets is for loading datasets from HuggingFace
# NOTE(review): the install is unpinned; consider pinning a version so the
# notebook stays reproducible.
!pip install datasets
from datasets import load_dataset

# Wipe the install log printed by the line above.
clear_output()

# Load the 'release_v3.0_en' configuration of WebNLG; the result is indexed by
# split name further down (webnlg['train']).
webnlg = load_dataset('web_nlg', 'release_v3.0_en')

Downloading builder script:   0%|          | 0.00/11.5k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/31.7k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/22.1k [00:00<?, ?B/s]

Downloading and preparing dataset web_nlg/release_v3.0_en to /root/.cache/huggingface/datasets/web_nlg/release_v3.0_en/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda...


Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/13211 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1667 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5713 [00:00<?, ? examples/s]

Dataset web_nlg downloaded and prepared to /root/.cache/huggingface/datasets/web_nlg/release_v3.0_en/0.0.0/28ffb892f7f42450dd9558684aa43bcaf44b1b3bf0d77cb8d73534646af88dda. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Load triples of size 1 from the WebNLG training data

In [2]:
def load_webnlg(webnlg):
    """Collect the single-triple samples of the WebNLG training split.

    Args:
        webnlg: mapping with a 'train' entry that iterates over samples; each
            sample exposes the keys 'size', 'modified_triple_sets' and 'lex'.

    Returns:
        A list of ``[triple, texts]`` pairs, one per training sample whose
        'size' equals 1. ``triple`` is the first item of the first
        'mtriple_set' entry and ``texts`` is the sample's ['lex']['text'] value.
    """
    return [
        [entry['modified_triple_sets']['mtriple_set'][0][0], entry['lex']['text']]
        for entry in webnlg['train']
        if entry['size'] == 1
    ]

# Keep only the size-1 training samples, as [triple, texts] pairs.
dataset = load_webnlg(webnlg)

Merge repeated triples

In [3]:
def merge_repeated_triples(dataset):
    """Merge samples that share a triple and de-duplicate their sentences.

    Args:
        dataset: list of ``[triple, sentences]`` pairs. ``triple`` must be
            hashable (it is a string in this notebook) and ``sentences`` an
            iterable of strings.

    Returns:
        A new list of ``[triple, sentences]`` pairs with one entry per distinct
        triple, ordered by the triple's first occurrence. Each ``sentences``
        list holds the distinct sentences of every sample with that triple, in
        first-seen order. ``dataset`` and its inner lists are not modified.
    """
    # triple -> {sentence: None}. Dicts keep insertion order, so they act as
    # ordered sets: grouping and de-duplication happen in one linear pass
    # (no list.count / list.index scans) and the output order is reproducible.
    merged = {}
    for sample in dataset:
        merged.setdefault(sample[0], {}).update(dict.fromkeys(sample[1]))

    return [[triple, list(unique_sentences)] for triple, unique_sentences in merged.items()]

# Collapse samples with the same triple into one entry (the name is rebound to the result).
dataset = merge_repeated_triples(dataset)

In [4]:
# Inspect the first merged sample and the number of entries left after merging.
print(dataset[0])
print(len(dataset))

['Aarhus_Airport | cityServed | "Aarhus, Denmark"', ['The Aarhus is the airport of Aarhus, Denmark.', 'Aarhus Airport serves the city of Aarhus, Denmark.']]
3107


Text triples: Each triple is a string in this form: "Subject | Property | Object"

In [5]:
def get_text_triples(dataset):
    """Return the first element (the triple) of every sample in ``dataset``.

    Args:
        dataset: iterable of indexable samples whose item 0 is the triple.

    Returns:
        A list with one triple per sample, in the same order as ``dataset``.
    """
    return [pair[0] for pair in dataset]

# One triple string per dataset entry; index i lines up with dataset[i].
text_triples = get_text_triples(dataset)

In [6]:
# Sanity check: the expected count in the message is a hard-coded reference
# value to compare against by eye.
print('Num text triples:', len(text_triples), '(Expected: 3107)')
print()
print('text_triples[0]:')
# Bare trailing expression: shown as the cell's result.
text_triples[0]

Num text triples: 3107 (Expected: 3107)

text_triples[0]:


'Aarhus_Airport | cityServed | "Aarhus, Denmark"'

Sentences list: a list with all sentences corresponding to 1-triple inputs

In [7]:
def get_sentences(dataset):
    """Flatten the sentence lists of all samples into a single list.

    Args:
        dataset: iterable of indexable samples whose item 1 is an iterable of
            sentences.

    Returns:
        A list with every sentence of every sample, sample by sample, keeping
        the order in which the sentences are iterated.
    """
    return [text for pair in dataset for text in pair[1]]

# Flat list of all verbalisations, across every triple in the dataset.
sentences = get_sentences(dataset)

In [8]:
# Sanity check: the expected count in the message is a hard-coded reference
# value to compare against by eye.
print('Num sentences:', len(sentences), '(Expected: 7630)')
print()
print('sentences[0]:')
# Bare trailing expression: shown as the cell's result.
sentences[0]

Num sentences: 7630 (Expected: 7630)

sentences[0]:


'The Aarhus is the airport of Aarhus, Denmark.'

Triples list: triples are stored as a list of 3 elements to access Subj, Property and Obj separately

In [9]:
def get_triples(text_triples):
    """Split ``"Subject | Property | Object"`` strings into 3-element lists.

    The string is cut at the first two ``|`` characters only, so an object that
    itself contains a bar stays in one piece. Whitespace around each part is
    removed.

    Args:
        text_triples: iterable of strings in the form
            ``"Subject | Property | Object"``.

    Returns:
        A list of ``[subject, property, object]`` lists, one per input string,
        in input order.

    Raises:
        ValueError: if a string contains fewer than two ``|`` separators.
    """
    triples = []
    for text_triple in text_triples:
        # maxsplit=2 yields at most three parts: subject, property, rest (= object).
        parts = text_triple.split('|', 2)
        if len(parts) != 3:
            raise ValueError(
                "Expected 'Subject | Property | Object', got: {!r}".format(text_triple)
            )
        triples.append([part.strip() for part in parts])
    return triples

# [subject, property, object] lists; index i lines up with text_triples[i].
triples = get_triples(text_triples)

In [10]:
# Sanity check: the expected count in the message is a hard-coded reference
# value to compare against by eye.
print('Num triples:', len(triples), '(Expected: 3107)')
print()
print('triples[0]:')
# Bare trailing expression: shown as the cell's result.
triples[0]

Num triples: 3107 (Expected: 3107)

triples[0]:


['Aarhus_Airport', 'cityServed', '"Aarhus, Denmark"']

In [11]:
# Save data
# import pickle

# fineTuningDataset_path = '/content/' #Write path to save the dataset
# with open(fineTuningDataset_path + 'triples_split.txt', 'wb') as fh:
#    pickle.dump(triples, fh)

# with open(fineTuningDataset_path + 'triples_raw.txt', 'wb') as fh:
#    pickle.dump(text_triples, fh)

### FT-1.2 Extract info needed for Finetuning dataset

Construct an intermediate dataset: eventually, we want a dataset of pairs of sentence-like texts, each pair with an approximate similarity score.

In [12]:
# Two dictionaries, one per similarity level with respect to the input triple.
# Each maps every text triple to its own, initially empty, set of sentences.
intermediate_dataset = [
    {text_triple: set() for text_triple in text_triples}
    for _ in range(2)
]

In [13]:
# Before filling: one key per text triple, each mapped to an empty set.
print(len(intermediate_dataset[0]))
print(intermediate_dataset[0]['Aarhus_Airport | cityServed | "Aarhus, Denmark"'])

3107
set()


In [14]:
# For each triple, compare it to all triples (itself included) through the
# intersection of their element sets, and collect the sentences of the triples
# that share no element (category 0) or all three elements (category 1).

# Build each element set once; the comparison below runs len(triples)**2 times,
# so rebuilding the sets inside the inner loop is wasted work.
triple_element_sets = [set(triple) for triple in triples]

for i, elements_i in enumerate(triple_element_sets):
    no_overlap_sentences = intermediate_dataset[0][text_triples[i]]
    full_overlap_sentences = intermediate_dataset[1][text_triples[i]]

    for j, elements_j in enumerate(triple_element_sets):
        shared = len(elements_i & elements_j)

        # Triples sharing one or two elements are left out of both categories.
        # NOTE(review): a triple with a repeated element (e.g. subject == object)
        # has fewer than 3 distinct elements and can never reach shared == 3,
        # not even against itself. Confirm whether such triples occur.
        if shared == 0:
            no_overlap_sentences.update(dataset[j][1])
        elif shared == 3:
            full_overlap_sentences.update(dataset[j][1])


In [17]:
# After filling: show up to 10 category-0 (no shared element) and category-1
# (all elements shared) sentences for one example triple.
print(len(intermediate_dataset[0]))
print(list(intermediate_dataset[0]['Aarhus_Airport | cityServed | "Aarhus, Denmark"'])[:10])
print(list(intermediate_dataset[1]['Aarhus_Airport | cityServed | "Aarhus, Denmark"'])[:10])

3107
['The length of the A-Rosa Luna is 125.8 metres long.', 'Al Anderson, part of the NRBQ band, was born in Windsor, Connecticut.', 'Andra began her singing career in 2000.', "Alessio Romagnoli's club is the Italy national under 19 football team.", 'Max Huiberts owns AZ Alkmaar.', 'Sergio Marchionne is a key figure of Dodge.', 'The AZAL Arena is located in Azerbaijan.', 'The runway at Angola International Airport is named South Runway.', 'Elizabeth II is the queen of Jamaica.', 'Maria Lourdes Sereno is a Filipina leader.']
['The Aarhus is the airport of Aarhus, Denmark.', 'Aarhus Airport serves the city of Aarhus, Denmark.']


Save dataset

In [16]:
import pickle

fineTuningDataset_path = '/content/' # Directory in which the dataset files are written
# The file is a binary pickle despite its .txt extension.
with open(fineTuningDataset_path + 'intermediate_dataset.txt', 'wb') as fh:
   pickle.dump(intermediate_dataset, fh)

### FT-1.3 Create text/string versions of the triples to be compared to the sentences

In [17]:
# print(triples[0])

def camelCaseClean(text):
  """Split a camelCase name into space-separated words.

  A new word starts wherever an uppercase character follows a lowercase one;
  that uppercase character is lowercased. Because the check looks at the last
  character already emitted, a run of capitals after a lowercase letter is
  split letter by letter ('aBC' -> 'a b c').

  Args:
    text: the name to split, e.g. 'cityServed'.

  Returns:
    The split name, e.g. 'city served'. An empty input yields ''.
  """
  # Guard: text[0] below would raise IndexError on an empty string.
  if not text:
    return ''

  pieces = [text[0]]
  for char in text[1:]:
    # pieces[-1][-1] is the last character emitted so far (already lowercased
    # if it opened a new word).
    if pieces[-1][-1].islower() and char.isupper():
      pieces.append(' ' + char.lower())
    else:
      pieces.append(char)
  return ''.join(pieces)

# Sentence-like rendering of every triple: "<subject> <property words> <object> ."
textified_triples_noTypes_noTags = []

for entry in triples:
  # Subject (index 0) and object (index 2): underscores become spaces, double quotes are dropped.
  subject_text, object_text = (
    entry[k].replace('_',' ').replace('"','') for k in (0, 2)
  )
  # Property (index 1): camelCase name split into lowercase words.
  property_text = camelCaseClean(entry[1])

  textified_triples_noTypes_noTags.append(f'{subject_text} {property_text} {object_text} .')

# print(textified_triples_noTypes_noTags[0])
# print(len(textified_triples_noTypes_noTags))

In [18]:
import pickle

fineTuningDataset_path = '/content/' # Directory in which the dataset files are written
# The file is a binary pickle despite its .txt extension.
with open(fineTuningDataset_path + 'triples_train_1_textified.txt', 'wb') as fh:
   pickle.dump(textified_triples_noTypes_noTags, fh)

### FT-1.4 Format dataset for Sentence Transformers.

Load the files produced in the previous step

In [19]:
# from google.colab import drive
# drive.mount('/content/drive', force_remount=True)

# NOTE(review): the install is unpinned (-U takes the latest release);
# consider pinning a version for reproducibility.
!pip install -U sentence-transformers

# Wipe the install log; clear_output is imported in the first code cell.
clear_output()

import pickle

# Load created dataset, with triples and scores
fineTuningDataset_path = '/content/' # Directory the files were saved to in the previous steps

# `with` closes each file as soon as it has been read (the handles used to stay open).
# Only unpickle files produced by this notebook: pickle can run code on load.
with open(fineTuningDataset_path + 'intermediate_dataset.txt', 'rb') as fh:
    interm_dataset = pickle.load(fh)

# Load textified triples
with open(fineTuningDataset_path + 'triples_train_1_textified.txt', 'rb') as fh:
    textified_triples = pickle.load(fh)

In [20]:
# Print some examples
sample_triples = ['Aarhus_Airport | cityServed | "Aarhus, Denmark"', 'Aarhus_Airport | cityServed | Aarhus', 'Christian_Burns | genre | House_music', 'Athens_International_Airport | runwayLength | 3800.0', 'New_York_City | country | United_States']

# For each sample key: the triple, then its category-1 sentences (all elements
# shared), then its category-0 sentences (no element shared).
# NOTE: the category-0 sets are very large, so this prints a lot of text.
for triple in sample_triples:
  print(triple)
  print(interm_dataset[1][triple])
  print(interm_dataset[0][triple])
  print()

print(len(textified_triples))
print(textified_triples[0])

Aarhus_Airport | cityServed | "Aarhus, Denmark"
{'The Aarhus is the airport of Aarhus, Denmark.', 'Aarhus Airport serves the city of Aarhus, Denmark.'}
{'AmeriGas was founded on 01-01-1959.', 'Chicharrón is a dish traditional in Spain.', "Buzz Aldrin was awarded the 'Legion of Merit' ribbon.", 'Tennis is one of the sports offered at the Acharya Institute of Technology.', 'French is the national language of France.', 'Asilomar Conference Grounds are located in Pacific Grove, California.', 'Lamborghini is a subsidiary of Audi.', 'The Appleton International Airport\'s location id is "ATW".', 'Abdulsalami Abubakar served in the Nigerian Air Force.', 'Afonso Pena International Airport has an elevation above the sea level (in feet) of 2988.', 'William Anders crewed Apollo 8.', 'Abner W. Sibal was a member of the Connecticut Senate from the 26th District.', 'Digify, Inc. is a subsidiary of GMA New Media.', 'Footballer, Steve Davis, manages Crewe Alexandra F.C.', 'Alpharetta, Georgia is locate

The final dataset is a list of objects with two attributes: "texts", which is a list of two sentences (texts=[sentence1, sentence2]), and "label", which is the similarity score between these two sentences (label=score).

In [21]:
from tqdm import tqdm
from sentence_transformers import InputExample

# Keys of the intermediate dataset, in dict insertion order. Position j is
# paired with textified_triples[j] below, so both must line up.
# A distinct name is used so the earlier `triples` list (lists of
# [subject, property, object]) is not overwritten with strings.
triple_keys = list(interm_dataset[0].keys())
if len(triple_keys) != len(textified_triples):
    raise ValueError(
        'intermediate dataset has {} triples but {} textified triples were loaded'.format(
            len(triple_keys), len(textified_triples)))

# One InputExample per (textified triple, sentence) pair; the label is the
# index of the category the sentence comes from (0.0 or 1.0).
ft_sentencePair_score_list = []
for i, category in enumerate(interm_dataset):
    score = float(i)
    print('Category', i, end=': ')
    for j, triple in enumerate(tqdm(triple_keys)):
        sentence1 = textified_triples[j]
        for sentence2 in category[triple]:
            ft_sentencePair_score_list.append(
                InputExample(texts=[sentence1, sentence2], label=score))


Category 0: 

100%|██████████| 3107/3107 [02:25<00:00, 21.35it/s]


Category 1: 

100%|██████████| 3107/3107 [00:00<00:00, 141976.10it/s]


In [22]:
# Check the total against hard-coded reference counts; the same two constants
# (23252141 and 7645) are used by the balancing cell to slice the list.
print(len(ft_sentencePair_score_list) == 23252141 + 7645)
print(ft_sentencePair_score_list[0])

True
<InputExample> label: 0.0, texts: Aarhus Airport city served Aarhus, Denmark .; AmeriGas was founded on 01-01-1959.


Balance the dataset so that it contains as many negative examples (label 0) as positive examples (label 1).

In [23]:
import random

# Split by label instead of by hard-coded list offsets, so the cell keeps
# working if the data changes and is safe to run more than once.
category0 = [example for example in ft_sentencePair_score_list if example.label == 0.0]
category1 = [example for example in ft_sentencePair_score_list if example.label == 1.0]

# Draw as many negatives as there are positives. Sampling the needed number
# directly avoids shuffling the whole negative list, and a private seeded
# generator makes the selection reproducible without touching the global
# random state.
balancing_rng = random.Random(42)
category0 = balancing_rng.sample(category0, min(len(category0), len(category1)))

# Negatives first, then positives (same layout as before).
ft_sentencePair_score_list = category0 + category1

In [24]:
# Size of the balanced dataset (both classes together).
print(len(ft_sentencePair_score_list))
# Disabled: print label and texts of the first 100 examples.
# x = 0
# while x < 100:
#   print(str(ft_sentencePair_score_list[x].label)+': '+str(ft_sentencePair_score_list[x].texts))
#   x += 1

15290


In [25]:
import pickle
# Persist the balanced list of InputExample objects as a binary pickle (despite
# the .txt extension). fineTuningDataset_path is set in an earlier cell.
with open(fineTuningDataset_path + 'fine_tuning_dataset.txt', 'wb') as fh:
   pickle.dump(ft_sentencePair_score_list, fh)

#### Load created dataset if you want to have a look at it

In [26]:
import pickle

fineTuningDataset_path = '/content/'
# `with` closes the file once it has been read (the handle used to stay open).
with open(fineTuningDataset_path + 'fine_tuning_dataset.txt', 'rb') as fh:
    new_dataset = pickle.load(fh)

In [27]:
# The saved list is the label-0 examples followed by the label-1 examples.
# Assuming 7645 examples per class, index 7644 is the last negative and 7645
# the first positive.
print(new_dataset[7644].texts)
print(new_dataset[7644].label)
print(new_dataset[7645].texts)
print(new_dataset[7645].label)

['Quezon City leader party Liberal Party (Philippines) .', 'Albany is part of Linn County, Oregon.']
0.0
['Aarhus Airport city served Aarhus, Denmark .', 'The Aarhus is the airport of Aarhus, Denmark.']
1.0


# **FT-2 Fine-tuning**

### FT-2.1 Set parameters, load model (INPUT NEEDED: path to save fine-tuned model)

In [None]:
from torch.utils.data import DataLoader
import math
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
import logging
from datetime import datetime
import os
import gzip
import csv

#### Just some code to print debug information to stdout
logging.basicConfig(format='%(asctime)s - %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S',
                    level=logging.INFO,
                    handlers=[LoggingHandler()])
#### /print debug information to stdout

# Check if dataset exists. If not, download and extract it.
# (Disabled: the block is wrapped in a bare string literal, so nothing in it runs.)
'''sts_dataset_path = 'datasets/stsbenchmark.tsv.gz'

if not os.path.exists(sts_dataset_path):
    util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path)
'''

# Training configuration (no data is read in this cell)
model_name = 'nli-distilroberta-base-v2'
train_batch_size = 16
num_epochs = 4
# Output path under the mounted Drive; the timestamp makes each run's path distinct.
model_save_path = '/content/drive/MyDrive/Colab-dump/Lab_Week10/MyModel-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") #finetuned model path and name

# Load a pre-trained sentence transformer model
model = SentenceTransformer(model_name)

### FT-2.2 Load dataset and create splits

In [None]:
import pickle

fineTuningDataset_path = '/content/'
# `with` closes the file once it has been read (the handle used to stay open).
with open(fineTuningDataset_path + 'fine_tuning_dataset.txt', 'rb') as fh:
    input_examples = pickle.load(fh)

In [None]:
import random

# Shuffle a copy of the examples with a fixed seed, so the train/dev/test split
# is the same on every run. A private generator is used so the global random
# state is left alone.
split_rng = random.Random(42)
shuffled_input_examples = split_rng.sample(input_examples, len(input_examples))

# 70% train / 15% dev / 15% test
split_1 = int(0.70 * len(shuffled_input_examples))
split_2 = int(0.85 * len(shuffled_input_examples))

train_samples = shuffled_input_examples[:split_1]
dev_samples = shuffled_input_examples[split_1:split_2]
test_samples = shuffled_input_examples[split_2:]

In [None]:
# Sizes of the three splits, displayed as the cell's result.
len(train_samples), len(dev_samples), len(test_samples)

(10703, 2293, 2294)

### FT-2.3 Create model (use GPU)

In [None]:
# Batches of training examples, reshuffled by the DataLoader (shuffle=True).
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size)
train_loss = losses.CosineSimilarityLoss(model=model)

# Development set: Measure correlation between cosine score and gold labels.
# The evaluator is built from this notebook's own dev split (not STSbenchmark);
# the name 'sts-dev' is kept unchanged so existing result files keep their names.
logging.info("Build evaluator from the dev split")
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Configure the training: warm up over the first 10% of all training steps
# (batches per epoch * number of epochs).
warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1)
logging.info("Warmup-steps: {}".format(warmup_steps))

In [None]:
# Train the model (Use GPU)
# Fine-tunes `model` on train_dataloader with the cosine-similarity loss, passing
# the dev-split evaluator (evaluation_steps=1000) and saving to model_save_path.
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)