## ED5005 MSC AI PROJECT

### Title: Analysis of how semantic similarity methods can be used to automatically test call routing in Interactive Voice Response Systems

### Objective: The objective of this script is to use variations of BERT pretrained models (together with GloVe and word2vec word embeddings) on the clean IVR data and calculate the semantic similarity between the intent and the child_transcription using cosine similarity, Pearson's correlation and Spearman's correlation

Student Name: Brian Mullins

Student Number: 19225741

In [None]:
#load the libraries and clean ivr data
import json 
import numpy as np
import pandas as pd
from scipy import spatial
from scipy.stats import pearsonr, spearmanr
from sklearn.metrics.pairwise import cosine_similarity
import gensim.downloader as api
from sentence_transformers import SentenceTransformer

# Load the cleaned IVR data; the scoring loop further down reads its
# 'child_transcription_without_stopwords' and 'intent_without_stopwords' columns.
df = pd.read_csv('clean_ivr_data.csv')
# Bare expression so the notebook renders the dataframe as the cell output.
df

In [None]:
# Create the embedding models compared in this notebook:
#   * GloVe and word2vec word vectors, fetched through gensim's downloader (api.load)
#   * four SentenceTransformer encoders (BERT, RoBERTa, TinyBERT, ALBERT variants)
# NOTE(review): each call presumably downloads the model on first use -- confirm cache/network setup.
glove_model = api.load("glove-wiki-gigaword-50") # other available models are listed at https://github.com/RaRe-Technologies/gensim-data
word2vec_model = api.load('word2vec-google-news-300')
bert_model = SentenceTransformer('sentence-transformers/bert-base-nli-mean-tokens')
roberta_model = SentenceTransformer('sentence-transformers/nli-roberta-base')
tinybert_model = SentenceTransformer('sentence-transformers/paraphrase-TinyBERT-L6-v2')
albert_model = SentenceTransformer('sentence-transformers/paraphrase-albert-base-v2')

In [None]:
# Fetch the gensim-data metadata record for the GloVe model and pretty-print it as JSON.
glove_info = api.info('glove-wiki-gigaword-50')
print(json.dumps(glove_info, indent=4))

In [None]:
# Fetch the gensim-data metadata record for the word2vec model and pretty-print it as JSON.
word2vec_info = api.info('word2vec-google-news-300')
print(json.dumps(word2vec_info, indent=4))

In [None]:
###########################################################################################
# Score every row of df with each embedding model.
#
# For each row, the child transcription and the intent (both with stopwords removed) are
# embedded by every model, and three measures are computed between the two embeddings:
# cosine similarity, Pearson's correlation and Spearman's correlation.
# Results are written back into df as '<model>_cosine_score', '<model>_pearson_score' and
# '<model>_spearman_score'; 'glove_error' / 'word2vec_error' flag rows where the word-vector
# models could not produce a score.
###########################################################################################

# Order in which the result columns are written. This mirrors the original notebook so the
# column layout of the saved results does not change.
_COSINE_COLUMN_ORDER = ('bert', 'tinybert', 'roberta', 'albert', 'glove', 'word2vec')
_PEARSON_COLUMN_ORDER = ('bert', 'tinybert', 'roberta', 'albert', 'glove', 'word2vec')
_SPEARMAN_COLUMN_ORDER = ('bert', 'roberta', 'tinybert', 'albert', 'glove', 'word2vec')

# Placeholder scores for a word-vector model that could not score a row.
_NO_SCORES = (None, None, None)


def _similarity_scores(vec_a, vec_b):
    """Return (cosine similarity, Pearson r, Spearman rho) for two 1-D vectors.

    ``spatial.distance.cosine`` returns a cosine *distance*, so it is converted
    to a similarity with ``1 - distance``.
    """
    cosine = 1 - spatial.distance.cosine(vec_a, vec_b)
    pearson, _ = pearsonr(vec_a, vec_b)
    spearman, _ = spearmanr(vec_a, vec_b)
    return cosine, pearson, spearman


def _sentence_model_scores(model, child_text, intent_text):
    """Encode both texts with a SentenceTransformer model and score the embeddings."""
    child_embedding = model.encode(child_text)
    intent_embedding = model.encode(intent_text)
    return _similarity_scores(child_embedding, intent_embedding)


def _mean_word_vector(model, text):
    """Average the vectors of the space-separated tokens of ``text`` known to ``model``.

    Returns None when no token is in the model's vocabulary, instead of letting
    ``np.mean`` of an empty list produce NaN.
    """
    vectors = [model[word] for word in text.split(" ") if word in model]
    if not vectors:
        return None
    return np.mean(vectors, axis=0)


def _word_model_scores(model, child_text, intent_text):
    """Score two texts with averaged word vectors (GloVe / word2vec).

    Returns ``(scores, error_flag)`` where ``scores`` is
    ``(cosine similarity, Pearson r, Spearman rho)``. When either text has no
    in-vocabulary token, or scipy rejects the vectors, ``scores`` is
    ``(None, None, None)`` and ``error_flag`` is 1; otherwise ``error_flag`` is 0.
    """
    child_vector = _mean_word_vector(model, child_text)
    intent_vector = _mean_word_vector(model, intent_text)
    if child_vector is None or intent_vector is None:
        return _NO_SCORES, 1
    try:
        return _similarity_scores(child_vector, intent_vector), 0
    except ValueError:
        # scipy raises ValueError for inputs it cannot compare; record the failure
        # in the error flag instead of aborting the whole run.
        return _NO_SCORES, 1


for index, row in df.iterrows():
    # str() so that missing values (NaN) are still passed to the models as text.
    child_text = str(row['child_transcription_without_stopwords'])
    intent_text = str(row['intent_without_stopwords'])

    # model name -> (cosine similarity, Pearson r, Spearman rho)
    scores = {
        'bert': _sentence_model_scores(bert_model, child_text, intent_text),
        'roberta': _sentence_model_scores(roberta_model, child_text, intent_text),
        'tinybert': _sentence_model_scores(tinybert_model, child_text, intent_text),
        'albert': _sentence_model_scores(albert_model, child_text, intent_text),
    }
    scores['glove'], glove_error = _word_model_scores(glove_model, child_text, intent_text)
    scores['word2vec'], word2vec_error = _word_model_scores(word2vec_model, child_text, intent_text)

    #####################################################################################
    # Write results back into dataframe
    #####################################################################################
    # cosine similarity: a failed row is skipped so its cell is left empty
    for name in _COSINE_COLUMN_ORDER:
        cosine_score = scores[name][0]
        if cosine_score is not None:
            df.loc[index, name + '_cosine_score'] = cosine_score

    # Pearson's correlation
    for name in _PEARSON_COLUMN_ORDER:
        df.loc[index, name + '_pearson_score'] = scores[name][1]

    # Spearman's correlation
    for name in _SPEARMAN_COLUMN_ORDER:
        df.loc[index, name + '_spearman_score'] = scores[name][2]

    # log whether the GloVe / word2vec models failed to score this row
    df.loc[index, 'glove_error'] = glove_error
    df.loc[index, 'word2vec_error'] = word2vec_error
    

In [None]:
# Bare expression so the notebook renders df, now including the score and error columns added by the loop.
df

In [None]:
# Save the dataframe, including the similarity/correlation scores and error flags, to results.csv
df.to_csv('results.csv')