<a href="https://colab.research.google.com/github/pieter98/question_metadata/blob/main/QM_notebooks/QM03_few_shot_approach.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Few-shot approach

# **Importing different libraries**


In [None]:
import pandas as pd   # open source data analysis and manipulation tool
import numpy as np    # extension of Python; using arrays and matrices and different mathematical functions
from random import seed   # initializing of the random number generator
from random import sample   # sample elements from the string, tuple, list, ...
import re

seed(42)
np.random.seed(42)

from sklearn.model_selection import train_test_split    # split arrays or matrices in train, test subsets
import matplotlib.pyplot as plt   # Matlab-like way of plotting

import gensim.downloader as api   # used for loading different models. Here: Word2VecKeyedVectors
from gensim.models.keyedvectors import Word2VecKeyedVectors   # Word embedding
from gensim.models import KeyedVectors

from sklearn.decomposition import PCA   # Principal Component Analysis: projecting data to a lower dimension
from sklearn.metrics import accuracy_score    # computing subset accuracy in multilabel classification
from sklearn.neighbors import KNeighborsClassifier    # KNN
from scipy import spatial                                   

from nltk.corpus import stopwords   # removing stopwords

!pip install -U sentence-transformers
from sentence_transformers import SentenceTransformer   # Sentence embedding

!pip install PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 1.9 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.19.2-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 19.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 33.5 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.6.0-py3-none-any.whl (84 kB)
[K     |████████████████████████████████| 84 kB 3.6 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 55.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64

# **Create the PyDrive client**
This is done for the set-up of a link between the Google colab notebook and Google drive. 

In [None]:
# Authenticate the Colab user, then build a PyDrive client that uses the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)   # client used by the following cells to fetch files by their Drive ID

# **Pre-trained model from Gensim**
A Dutch corpus on wikipedia data is used here to obtain word embeddings. It contains 320-dimensional embeddings. It's easy to shift between different languages by uploading the appropriate corpus. 
A multilingual transformer network is used to obtain 512-dimensional sentence embeddings. 


In [None]:
# Alternative pre-trained gensim models (disabled):
#model0 = api.load('glove-twitter-25') 
#model1 = api.load('word2vec-google-news-300')

# Fetch the word-vector file from Google Drive into the working directory, then load it with gensim.
downloaded = drive.CreateFile({'id':"11CS9N5L5aQBz76jWOxNj7O1R686PTrld"})   # ID of the file on google drive
downloaded.GetContentFile('wikipedia-320.txt')    # Download the file from google drive to google colab
model2 = KeyedVectors.load_word2vec_format('wikipedia-320.txt')   # Dutch word embedding 

# Loading the sentence-transformer by name triggers the downloads shown in the cell output.
model3 = SentenceTransformer('distiluse-base-multilingual-cased-v1')   # sentence embedding

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/114 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.58M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.38k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/556 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/341 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/539M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/452 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

# **View CSV data set**
A relatively small number of data instances were manually labeled and stored in a comma-separated values (CSV) file. 

In [None]:
# Download the labeled data set from Google Drive and read it as a ';'-separated CSV.
downloaded = drive.CreateFile({'id':"1QFwORycv4VrVWVyREzzjemwUr-VJMP-z"})   
downloaded.GetContentFile('Labeled_data.csv')
df = pd.read_csv("Labeled_data.csv",sep = ';')
df.head()   # preview the first rows (columns: Instruction, Label)

Unnamed: 0,Instruction,Label
0,Hoite Pruiksma een Nederlandse musicus heeft e...,Fysica
1,Duid het enige juiste antwoord aan Sommige ste...,Aardrijkskunde
2,Kies het enige juiste antwoord Kies de juiste ...,Geschiedenis
3,Een vat van 250 mL bevat 0374 g van een gasvor...,Chemie
4,Duid het enige juiste antwoord aan Naar welke ...,Biologie


# **Pre-processing the instruction texts**
All the instruction texts are cleaned up by removing different signs, new lines, etc.
Only the words, separated by a blank space, are returned.

In [None]:
def get_only_chars(line):
    """Reduce an instruction text to lowercase a-z words separated by single spaces.

    Apostrophes (straight and typographic) are removed without leaving a gap.
    The text is lowercased, and every remaining character that is not a-z or a
    space (hyphens, tabs, newlines, digits, punctuation, accented letters, ...)
    is replaced by a space. Runs of spaces are collapsed to one, and a leading
    space is dropped. A single trailing space may remain, as before.

    Parameters
    ----------
    line : str
        Raw instruction text.

    Returns
    -------
    str
        The cleaned text; an empty string when the input contains no letters.
    """
    line = line.replace("’", "").replace("'", "")
    line = line.lower()

    # Anything outside a-z / space becomes a space; this also covers '-', '\t' and '\n'.
    clean_line = re.sub('[^a-z ]', ' ', line)
    clean_line = re.sub(' +', ' ', clean_line)    # delete extra spaces

    # startswith() is safe on an empty string, unlike indexing clean_line[0].
    if clean_line.startswith(' '):
        clean_line = clean_line[1:]
    return clean_line

#df['Instruction'] = df['Instruction'].apply(lambda x: get_only_chars(x))    # apply the function to each instance of the 'Instruction' column, only necessary when working with word embeddings

# **Defining the Query and Support set**
The Support set of the few-shot learning problem is defined here as the training set. The Support set consists of a number of examples of a specific class.
The Queries or the instances that need to be labeled are defined here as the test set.

In [None]:
num_classes = 4   # number of classes in the Support set
sample_size = 1   # number of samples per Support class

# gen_sample compares labels as integers, so map every subject name to an integer ID.
# The result is assigned back to the column: calling replace(..., inplace=True) on the
# df["Label"] selection is a chained in-place update, which does not modify df under
# pandas Copy-on-Write. Re-running this cell is harmless (integers are left unchanged).
df["Label"] = df["Label"].replace({'Fysica': 1, 'Aardrijkskunde': 2, 'Biologie': 3, 'Chemie': 4, 'Geschiedenis': 5})

def gen_sample(sample_size, num_classes):
    """Split the labeled data into a support set and a query set.

    Only rows of the module-level ``df`` whose integer label is at most
    ``num_classes`` are considered. For each of the first ``num_classes``
    distinct labels (in ascending order), ``sample_size`` rows are drawn at
    random; together they form the support set. All remaining rows form the
    query set.

    Parameters
    ----------
    sample_size : int
        Number of support examples drawn per class.
    num_classes : int
        Number of classes to include.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        ``train`` is the support set, ``test`` the query set. Both keep the
        row index of the filtered, re-indexed frame.
    """
    in_scope = df[df["Label"].astype(int) < num_classes + 1].reset_index(drop=True)
    labels = np.unique(in_scope["Label"])   # sorted distinct labels, computed once

    # Draw the support examples class by class (ascending label order).
    support_parts = [
        in_scope[in_scope["Label"] == labels[i]].sample(sample_size)
        for i in range(num_classes)
    ]
    train = pd.concat(support_parts, axis=0)    # support set

    test = in_scope[~in_scope.index.isin(train.index)]    # every in-scope example not drawn into the support set

    return train, test

# Draw one support/query split with the settings above.
train, test = gen_sample(sample_size, num_classes)

# Texts and integer labels of the support set (train) and the query set (test).
X_train = train['Instruction']
y_train = train['Label'].values
X_test = test['Instruction']
y_test = test['Label'].values

In [None]:
# Show the query instructions; the index values missing from the output are the rows drawn into the support set.
X_test

0     Hoite Pruiksma een Nederlandse musicus heeft e...
1     Duid het enige juiste antwoord aan Sommige ste...
2     Een vat van 250 mL bevat 0374 g van een gasvor...
3     Duid het enige juiste antwoord aan Naar welke ...
4                    Wat is de hoofdstad van Frankrijk?
5     Duid het enige juiste antwoord aan Welke verge...
7     Het ICSH - hormoon wordt afgescheiden door de ...
8     Duid het enige juiste antwoord aan Welke volks...
9     Bekijk het filmpje Hoe kan je de werking van e...
10    Typ het antwoord in het lege vakOp een mooie z...
11    Duid het enige juiste antwoord aanBij bananenv...
12    Noteer de uitscheidingsorganen samen met hun u...
13    Wat is de eenheid van spanning?Duid het juiste...
14    Wat zijn de buurlanden van het gekleurde land?...
15    Duid het enige juiste antwoord aanDe plaatsfun...
16    Duid het enige juiste antwoord aanWelke Vlaams...
17    Om lampen parallel te schakelen worden deze aa...
18    Typ het antwoord in het lege vak In welk d

# **Word Embedding**
The embedding of each token is searched within the corpus. 
This is done by stripping the words from the instruction text that aren't within the corpus. The embeddings of the remaining words are then looked up in the corpus. The mean of all these embeddings is calculated as a representation of the instruction text. 

In [None]:
def transform_sentence1(text, model):
    """Embed a text as the mean of the word vectors of its known tokens.

    The text is split on whitespace, and tokens that are not in the model's
    vocabulary are discarded. The vocabulary is read from ``key_to_index``
    (gensim >= 4) when present, and from ``vocab`` (gensim < 4) otherwise.

    Parameters
    ----------
    text : str
        Instruction text.
    model : gensim KeyedVectors-like object
        Must provide ``vector_size``, list indexing (``model[tokens]``) and
        either ``key_to_index`` or ``vocab``.

    Returns
    -------
    numpy.ndarray
        Mean word vector, or an all-zero vector of length ``model.vector_size``
        when no token is known to the model.
    """
    # Check key_to_index first: gensim >= 4 raises AttributeError when .vocab is accessed.
    vocabulary = getattr(model, "key_to_index", None)
    if vocabulary is None:
        vocabulary = model.vocab

    tokens = [word for word in text.split() if word in vocabulary]   # keep only words the model knows

    if not tokens:
        return np.zeros(model.vector_size)    # no words recognized by model, all values are zero

    return np.array(np.mean(model[tokens], axis=0))    # mean over the token embeddings

# Embed every support / query instruction with the word-embedding model (one vector per row).
X_train_mean = X_train.apply(lambda x : transform_sentence1(x, model2))  
X_test_mean = X_test.apply(lambda x : transform_sentence1(x, model2))

# Expand each per-row vector into one column per embedding dimension.
X_train_mean = pd.DataFrame(X_train_mean)['Instruction'].apply(pd.Series)
X_test_mean = pd.DataFrame(X_test_mean)['Instruction'].apply(pd.Series)

In [None]:
# Show the word-embedding features of the support set (one row per support example).
X_train_mean

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,310,311,312,313,314,315,316,317,318,319
31,0.04131,0.010985,1.9e-05,0.019441,0.001972,-0.020605,-0.027712,-0.009328,0.005012,0.012773,...,-0.003531,-0.018412,0.001791,0.006877,0.017384,0.006758,-0.011257,0.004588,-0.010116,0.023639
35,0.0231,0.002561,-0.015214,-0.000402,0.00963,-0.01565,-0.007573,-9.3e-05,0.007496,-0.017133,...,0.000635,-0.009625,0.032492,0.014717,-0.003899,0.00153,-0.005409,-0.012785,-0.000887,0.009284
6,0.049431,-0.021579,0.024524,-0.00199,0.006053,-0.027093,-0.008159,0.002931,0.003574,0.007896,...,0.025878,0.007817,0.012878,0.026973,-0.006855,0.038156,-0.005758,0.021406,-0.019594,0.045709
57,0.038356,-0.008195,-0.007407,-0.017084,0.014927,-0.056805,-0.021172,0.006211,-0.010974,-0.014487,...,-0.010086,-0.010535,0.021091,0.000562,-0.023924,0.004441,0.037905,0.00794,-0.006066,0.028713


# **Sentence embedding**
The embedding of each instruction is computed through a transformer model. 

In [None]:
def transform_sentence2(text, model):
  """Embed a text with a sentence-embedding model.

  Passes ``text`` to ``model.encode`` and returns the result as a numpy array.
  """
  return np.array(model.encode(text))

# Embed every support / query instruction with the sentence-embedding model.
# NOTE: this reassigns X_train_mean / X_test_mean, replacing the word-embedding features computed earlier.
X_train_mean = X_train.apply(lambda x : transform_sentence2(x, model3))  
X_test_mean = X_test.apply(lambda x : transform_sentence2(x, model3))

# Expand each per-row vector into one column per embedding dimension.
X_train_mean = pd.DataFrame(X_train_mean)['Instruction'].apply(pd.Series)
X_test_mean = pd.DataFrame(X_test_mean)['Instruction'].apply(pd.Series)

In [None]:
# Show the sentence-embedding features of the query set (one row per query instruction).
X_test_mean

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,502,503,504,505,506,507,508,509,510,511
0,-0.009783,0.057442,-0.064297,0.000151,0.02288,-0.044638,-0.008454,0.063411,0.020576,-0.034341,...,-0.036994,-0.018926,0.039391,0.013421,0.018489,0.024621,-0.033445,0.020147,-0.03981,-0.000363
1,-0.014477,0.011849,0.035354,0.015653,0.034147,-0.005856,-0.034503,0.07012,0.00617,0.005643,...,-0.070834,0.016364,-0.033718,-0.02807,0.058371,-0.050314,0.017263,-0.02885,-0.048681,0.073053
2,-0.024124,-0.050552,-0.010184,-0.006836,-0.005716,0.033874,-0.0422,-0.036405,0.023374,-0.022115,...,0.037171,-0.039379,0.043095,-0.025703,0.010413,0.002103,0.015992,0.018576,-0.05419,0.033453
3,0.021445,-0.064973,-0.020086,0.011044,-0.00121,-0.080773,0.036471,0.020725,-0.018282,-0.020597,...,-0.050963,-0.001315,-0.027268,0.016628,-0.008629,-0.020503,-0.050943,0.045887,-0.046968,0.021141
4,0.040591,-0.088585,-0.073375,0.034393,-0.080135,0.006831,-0.025301,-0.024403,0.024706,-0.136847,...,-0.039838,-0.067574,0.04185,-0.003366,-0.05757,0.038148,0.001778,0.08234,-0.057382,-0.045082
5,-0.026465,0.056341,-0.050378,0.020442,0.029858,0.007592,-0.018076,-0.020911,0.030722,-0.002986,...,-0.015189,0.118353,0.019435,-0.004698,-0.031667,-0.001835,-0.005685,0.065773,0.031302,-0.016246
7,0.036068,0.050669,-0.034846,0.046642,0.064609,-0.022523,-0.002622,0.077373,-0.041926,0.030139,...,-0.018814,0.042597,0.063464,-0.027971,0.044487,0.055835,-0.07535,-0.061041,-0.053874,-0.06515
8,-0.004402,0.041073,0.05301,-0.006798,0.024879,0.033065,-0.017164,-0.018269,0.014549,-0.018557,...,-0.05296,0.055163,-0.069677,-0.025643,-0.035415,-0.0084,0.019997,-0.067365,0.045226,0.001119
9,0.012174,-0.030193,-0.038004,-0.043598,0.008284,-0.029818,0.047363,-0.008622,-0.017615,-0.06128,...,0.009351,0.036479,0.092595,0.011758,0.01566,-0.049488,-0.011471,-0.012379,0.009488,-0.079829
10,0.030513,0.024666,0.011347,0.020077,0.041245,-0.011795,0.000471,-0.030323,0.014908,0.064953,...,-0.023313,0.021557,0.047688,-0.007372,0.059244,-0.013548,0.042878,-0.021596,0.027036,-0.03111


# **Similarity measures: Cosine similarity**
There are a lot of measures that can be used to calculate the similarity between an instance and each class within the support set. Here Cosine similarity is used.  

In [None]:
def classify_cosine_txt(txt, mean_embedding, model, embedding):
    """Assign a text to the class whose mean embedding is nearest in cosine distance.

    Parameters
    ----------
    txt : str
        Instruction text to classify.
    mean_embedding : mapping or sequence
        Indexed by 0 .. n-1; entry ``cl`` is the mean embedding of class label ``cl + 1``.
    model : object
        Embedding model, passed through to ``embedding``.
    embedding : callable
        ``embedding(txt, model)`` returns the 1-D embedding of ``txt``.

    Returns
    -------
    int
        1-based label of the nearest class. On ties the lowest label wins.
        Returns -1 only if no distance is comparable (e.g. all are NaN) or
        ``mean_embedding`` is empty.
    """
    # The embedding does not depend on the class, so compute it once instead of once per class.
    query_vector = embedding(txt, model)

    # Start from infinity: cosine distance ranges up to 2, so a start value of 1
    # would act as a hidden threshold and leave far-away texts unlabeled (-1).
    best_dist = float("inf")
    best_label = -1

    for cl in range(len(mean_embedding)):
        dist = spatial.distance.cosine(query_vector, mean_embedding[cl])  # cosine distance between two 1-D arrays

        if dist < best_dist:
            best_dist = dist
            best_label = cl + 1   # +1 because cl starts with index 0

    return best_label

# **Similarity measures: Euclidean distance**
There are a lot of measures that can be used to calculate the similarity between an instance and each class within the support set. Here Euclidean distance is used.

In [None]:
def classify_euclidean_txt(txt, mean_embedding, model, embedding):
    """Assign a text to the class whose mean embedding is nearest in Euclidean distance.

    Parameters
    ----------
    txt : str
        Instruction text to classify.
    mean_embedding : mapping or sequence
        Indexed by 0 .. n-1; entry ``cl`` is the mean embedding of class label ``cl + 1``.
    model : object
        Embedding model, passed through to ``embedding``.
    embedding : callable
        ``embedding(txt, model)`` returns the 1-D embedding of ``txt``.

    Returns
    -------
    int
        1-based label of the nearest class. On ties the lowest label wins.
        Returns -1 only if no distance is comparable (e.g. all are NaN) or
        ``mean_embedding`` is empty.
    """
    # The embedding does not depend on the class, so compute it once instead of once per class.
    query_vector = embedding(txt, model)

    # Start from infinity: Euclidean distance is unbounded, so a fixed start value
    # (previously 5) would act as a hidden threshold and leave far-away texts unlabeled (-1).
    best_dist = float("inf")
    best_label = -1

    for cl in range(len(mean_embedding)):
        dist = spatial.distance.euclidean(query_vector, mean_embedding[cl])  # Euclidean distance between two 1-D arrays

        if dist < best_dist:
            best_dist = dist
            best_label = cl + 1   # +1 because cl starts with index 0

    return best_label

# **Similarity measures: Chebyshev distance**
There are a lot of measures that can be used to calculate the similarity between an instance and each class within the support set. Here Chebyshev distance is used.

In [None]:
def classify_chebyshev_txt(txt, mean_embedding, model, embedding):
    """Assign a text to the class whose mean embedding is nearest in Chebyshev distance.

    Parameters
    ----------
    txt : str
        Instruction text to classify.
    mean_embedding : mapping or sequence
        Indexed by 0 .. n-1; entry ``cl`` is the mean embedding of class label ``cl + 1``.
    model : object
        Embedding model, passed through to ``embedding``.
    embedding : callable
        ``embedding(txt, model)`` returns the 1-D embedding of ``txt``.

    Returns
    -------
    int
        1-based label of the nearest class. On ties the lowest label wins.
        Returns -1 only if no distance is comparable (e.g. all are NaN) or
        ``mean_embedding`` is empty.
    """
    # The embedding does not depend on the class, so compute it once instead of once per class.
    query_vector = embedding(txt, model)

    # Start from infinity: Chebyshev distance is unbounded, so a fixed start value
    # (previously 1) would act as a hidden threshold and leave far-away texts unlabeled (-1).
    best_dist = float("inf")
    best_label = -1

    for cl in range(len(mean_embedding)):
        dist = spatial.distance.chebyshev(query_vector, mean_embedding[cl])  # Chebyshev distance between two 1-D arrays

        if dist < best_dist:
            best_dist = dist
            best_label = cl + 1   # +1 because cl starts with index 0

    return best_label

# **Accuracy**
A lot of the functions above are combined here. 
First the Query and Support set are calculated.
Secondly, the mean embeddings of the instruction texts are obtained. Based on the similarity measure outcome a prediction is made.
`return_score1` classifies each query by comparing it with the average embedding of every support class; `return_score2` classifies each query with the KNN method fitted on the support set.



In [None]:
def return_score1(sample_size, num_classes, model, embedding, sim_func):
    """Accuracy of nearest-class-mean classification on a fresh support/query split.

    A new split is drawn with ``gen_sample``. The support instructions are
    embedded and averaged per class, and every query instruction is labeled by
    ``sim_func`` against those class means.

    Parameters
    ----------
    sample_size : int
        Number of support examples per class.
    num_classes : int
        Number of classes; labels 1 .. num_classes are used.
    model : object
        Embedding model, passed through to ``embedding`` and ``sim_func``.
    embedding : callable
        ``embedding(text, model)`` returns the embedding vector of ``text``.
    sim_func : callable
        ``sim_func(text, mean_embedding, model, embedding)`` returns the predicted label.

    Returns
    -------
    float
        The ``accuracy_score`` of the predictions on the query set.
    """
    train, test = gen_sample(sample_size, num_classes)

    X_train = train['Instruction']
    y_train = train['Label'].values
    X_test = test['Instruction']
    y_test = test['Label'].values

    # Only the support set is embedded here: sim_func embeds each query text itself,
    # so embedding the query set up front would be wasted work.
    X_train_mean = X_train.apply(lambda x : embedding(x, model))
    X_train_mean = pd.DataFrame(X_train_mean)['Instruction'].apply(pd.Series)

    # Entry cl holds the mean embedding of the support examples with label cl + 1.
    mean_embedding = {
        cl: np.mean(X_train_mean[y_train == cl + 1], axis=0)
        for cl in range(num_classes)
    }

    y_pred = [sim_func(t, mean_embedding, model, embedding) for t in X_test.values]

    return accuracy_score(y_test.tolist(), y_pred)    # tolist because y_test is a numpy array

In [None]:
def return_score2(sample_size, num_classes, model, embedding, sim_func):
    """Accuracy of a KNN classifier on a fresh support/query split.

    A new split is drawn with ``gen_sample``, and both parts are embedded with
    ``embedding(text, model)``. A ``KNeighborsClassifier`` with
    ``n_neighbors=sample_size`` and ``p=2`` is fitted on the support set and
    scored on the query set. ``sim_func`` is accepted so the signature matches
    ``return_score1``, but it is not used.

    Returns the ``accuracy_score`` of the predictions on the query set.
    """
    support, query = gen_sample(sample_size, num_classes)

    def to_feature_rows(frame):
        # One embedding per instruction, expanded to one column per dimension,
        # then converted to plain nested lists for scikit-learn.
        vectors = frame['Instruction'].apply(lambda x : embedding(x, model))
        return pd.DataFrame(vectors)['Instruction'].apply(pd.Series).values.tolist()

    support_rows = to_feature_rows(support)
    query_rows = to_feature_rows(query)

    # p=2 selects the Euclidean metric.
    knn = KNeighborsClassifier(n_neighbors=sample_size, p=2)
    knn.fit(support_rows, support['Label'].values.tolist())

    predicted = knn.predict(query_rows)

    return accuracy_score(query['Label'].values.tolist(), predicted)

In [None]:
# Example: KNN accuracy with 2 support samples for each of 2 classes, using sentence embeddings.
# The last argument (sim_func) is not used by return_score2.
return_score2(2,2,model3,transform_sentence2,classify_chebyshev_txt)

0.7692307692307693

# **Comparison and plot**

In [None]:
samples_min = 1   # smallest number of support samples per class (inclusive)
samples_max = 6   # upper bound on the number of support samples per class (exclusive)
cl_min = 2        # smallest number of classes (inclusive)
cl_max = 5        # upper bound on the number of classes (exclusive)
model = model3    # model2 = word embedding model, model3 = sentence embedding model
embedding = transform_sentence2   # transform_sentence1 = word embedding transformation, transform_sentence2 = sentence embedding transformation
sim_func = classify_chebyshev_txt   # similarity measure (cosine, euclidean and chebyshev); only useful for return_score1
pred_acc = return_score1    # return_score1 = prediction accuracy for cosine, euclidean and chebyshev, return_score2 = prediction accuracy for KNN

sample_counts = list(range(samples_min, samples_max))
class_counts = list(range(cl_min, cl_max))

# One accuracy list per number of classes, derived from the configured range so it
# stays valid when cl_min / cl_max change.
all_accuracy = {num_cl: [] for num_cl in class_counts}

for num_samples in sample_counts:
  for num_cl in class_counts:
    all_accuracy[num_cl].append(pred_acc(num_samples,num_cl,model,embedding,sim_func))


plt.figure(figsize=(12,8))
for num_cl in class_counts:
  # Plot against the real sample counts; without explicit x values the axis would start at 0.
  plt.plot(sample_counts, all_accuracy[num_cl], label=str(num_cl) + " classes")
plt.title("Accuracy depending on the number of samples and classes")
plt.xlabel("Amount of class examples in the support set")
plt.ylabel("Accuracy")   # pred_acc returns accuracy_score, a fraction in [0, 1], not a percentage
plt.xticks(sample_counts)
plt.legend()
plt.show()

for i in class_counts:
  print("Accuracy of class " + str(i) + str(all_accuracy[i]) )

# **References**


1.   **Dutch corpus**
      author = {Stephan Tulkens and Chris Emmery and Walter Daelemans},
      title = {Evaluating Unsupervised Dutch Word Embeddings as a Linguistic Resource},
      booktitle = {Proceedings of the Tenth International Conference on Language Resources and Evaluation (LREC 2016)},
      year = {2016},
      month = {may},
      date = {23-28},
      location = {Portorož, Slovenia},
      editor = {Nicoletta Calzolari (Conference Chair) and Khalid Choukri and Thierry Declerck and Marko Grobelnik and Bente Maegaard and Joseph Mariani and Asuncion Moreno and Jan Odijk and Stelios Piperidis},
      publisher = {European Language Resources Association (ELRA)},
      address = {Paris, France},
      isbn = {978-2-9517408-9-1},
      language = {english}
 }
2.   **Few-shot learning code**
https://maelfabien.github.io/machinelearning/NLP_5/#implementation


