In [2]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import csv
import time
from os.path import isfile
import pandas as pd
import re
import spacy
import matplotlib.pyplot as plt
import Tools.processing as proc

# Instantiate the 'all-mpnet-base-v2' SentenceTransformer once; later cells call
# model.encode on this shared instance to turn text into embeddings.
model = SentenceTransformer('all-mpnet-base-v2')

In [3]:
# Read the pooled Amazon embeddings; the path is relative to the working directory.
df_amazon_pooled = pd.read_json("./Data_Storage/Processed_Data/Amazon_Pooled.json")

# Preview the frame (the displayed columns are Five_Stars and One_Star).
df_amazon_pooled.head()

Unnamed: 0,Five_Stars,One_Star
0,"[0.0196684411, 0.0089247576, 0.0033987069, -0....","[0.0130684374, 0.0054455661, -0.0202282401, -0..."


In [4]:
# Get positive and negative semantic embeddings
# (row label 0 of each pooled column holds one embedding vector).
positive_amazon_enc = df_amazon_pooled["Five_Stars"][0]
negative_amazon_enc = df_amazon_pooled["One_Star"][0]

# Load test and train sets.
# The data directory is spelled out exactly once so relocating the files means
# editing a single line instead of three pasted copies.
# NOTE(review): this is an absolute, machine-specific path; presumably it could be
# made relative like the Amazon_Pooled.json path - confirm the working directory.
SENTIMENT_DIR = "/home/marcuswrrn/Projects/Semantic_Quantification/Semantic_Comparison/Lab_Tests/Data_Storage/Positive_Scores/sentiment labelled sentences"
titles_df = ["Text", "Score"]


def _load_labelled_sentences(filename):
    """Read one tab-separated file from SENTIMENT_DIR into a Text/Score DataFrame."""
    return pd.read_csv(f"{SENTIMENT_DIR}/{filename}", names=titles_df, sep='\t')


df_amazon = _load_labelled_sentences("amazon_cells_labelled.txt")
df_imbd = _load_labelled_sentences("imdb_labelled.txt")
df_yelp = _load_labelled_sentences("yelp_labelled.txt")

In [5]:
# Embed every sentence of each dataset, one model.encode call per row.
# Passing the bound method directly is equivalent to wrapping it in a lambda.
# NOTE(review): model.encode presumably also accepts a whole list of sentences,
# which may be faster - confirm the resulting vectors match before switching.
encodings_amazon = df_amazon["Text"].apply(model.encode)
encodings_imbd = df_imbd["Text"].apply(model.encode)
encodings_yelp = df_yelp["Text"].apply(model.encode)

In [6]:
def compare_embeddings(embed, positive=positive_amazon_enc, negative=negative_amazon_enc, hyper=0.2, visible=False):
    """
    Decide whether an embedding is at least as close to the positive reference
    as to the (penalised) negative reference, using cosine similarity.

    Args:
        embed: Iterable of numbers forming the embedding to classify.
        positive: Iterable of numbers forming the positive reference embedding.
            The default is the value positive_amazon_enc held when this
            function was defined; later rebinding of that name does not change it.
        negative: Iterable of numbers forming the negative reference embedding
            (same definition-time default behaviour as `positive`).
        hyper: Fraction of the positive similarity added to the negative side
            before comparing; 0 compares the two similarities directly.
        visible: When True, print the computed difference.

    Returns:
        bool: True when sim_pos - (sim_neg + sim_pos * hyper) >= 0.
    """
    def as_float32(values):
        # Coerce element by element so lists, arrays and Series are all accepted.
        return np.array([float(v) for v in values], dtype=np.float32)

    query = as_float32(embed)
    sim_pos = util.cos_sim(query, as_float32(positive))[0][0]
    sim_neg = util.cos_sim(query, as_float32(negative))[0][0]

    penalised_neg = sim_neg + (sim_pos * hyper)
    diff = sim_pos - penalised_neg
    if visible:
        print(f"Difference in embedding {diff}")
    # .item() unwraps the single-element comparison result into a plain bool.
    return (diff >= 0).item()


# Classify every sentence using compare_embeddings' default references with no
# penalty (hyper=0) and store the boolean result beside the Score column.
for frame, frame_encodings in (
    (df_amazon, encodings_amazon),
    (df_imbd, encodings_imbd),
    (df_yelp, encodings_yelp),
):
    frame["Train_Score"] = frame_encodings.apply(lambda vec: compare_embeddings(vec, hyper=0))

In [7]:
def build_confusion_matrix(train_scores, test_scores):
    """
    Tally a 2x2 confusion matrix from paired binary outcomes.

    Args:
        train_scores: Iterable of predicted outcomes; only truthiness is used.
        test_scores: Iterable of reference labels; only truthiness is used.

    Returns:
        numpy.ndarray: 2x2 float array indexed as [test][train], i.e. rows follow
        test_scores and columns follow train_scores. Pairs beyond the end of the
        shorter input are ignored.
    """
    counts = np.zeros((2, 2))
    for predicted, actual in zip(train_scores, test_scores):
        row = int(bool(actual))
        col = int(bool(predicted))
        counts[row, col] += 1
    return counts

def find_test_score(train_score, test_score):
    """
    Compute the fraction of positions at which two label sequences agree.

    Args:
        train_score: Sequence of predicted labels.
        test_score: Sequence of reference labels; its length is the denominator.

    Returns:
        float: Number of matching pairs divided by len(test_score), or 0.0 when
        test_score is empty.
    """
    total = len(test_score)
    if total == 0:
        # Nothing to score: return 0.0 instead of raising ZeroDivisionError,
        # mirroring the zero-guards on precision/recall elsewhere in this notebook.
        return 0.0
    matches = sum(1 for test, train in zip(test_score, train_score) if test == train)
    return matches / total

def binary_confusion_matrix_scores(confusion_matrix):
    """
    Compute, print and return precision, accuracy, and recall from a 2x2 confusion matrix.

    Args:
        confusion_matrix (list of lists): A 2x2 list (or array) containing the confusion matrix,
                                          with rows representing true labels and columns representing predicted labels.

    Returns:
        tuple: (precision, accuracy, recall) where
            precision is the precision score for the positive class,
            accuracy is the overall accuracy of the classifier,
            recall is the recall score for the positive class.
        Any score whose denominator is zero is reported as 0.

    Raises:
        ValueError: If the input is not 2x2.
    """

    # Make sure the input is a 2x2 matrix
    if len(confusion_matrix) != 2 or any(len(row) != 2 for row in confusion_matrix):
        raise ValueError("The confusion matrix must be 2x2")

    # Row 0 holds the true-negative class, row 1 the true-positive class.
    true_negatives, false_positives = confusion_matrix[0]
    false_negatives, true_positives = confusion_matrix[1]

    predicted_positives = true_positives + false_positives
    actual_positives = true_positives + false_negatives
    total = true_positives + true_negatives + false_positives + false_negatives

    # Guard every division so an empty or one-sided matrix yields 0, not an error/NaN.
    precision = true_positives / predicted_positives if predicted_positives != 0 else 0
    accuracy = (true_positives + true_negatives) / total if total != 0 else 0
    recall = true_positives / actual_positives if actual_positives != 0 else 0

    print(f"Precision: {precision}")
    print(f"Accuracy: {accuracy}")
    print(f"Recall: {recall}")

    # Return the scores as the docstring promises; callers that only want the
    # printed report can ignore the result.
    return precision, accuracy, recall

def print_test_log(df : pd.DataFrame, train_label="Train_Score", test_label="Score"):
    """
    Print the confusion matrix and the derived scores for one DataFrame.

    Args:
        df: Frame containing both the predicted and the reference column.
        train_label: Name of the column holding the predictions.
        test_label: Name of the column holding the reference labels.
    """
    predicted_column = df[train_label]
    reference_column = df[test_label]
    matrix = build_confusion_matrix(predicted_column, reference_column)
    print(matrix)
    binary_confusion_matrix_scores(matrix)

def print_all_logs(df_amazon, df_imbd, df_yelp):
    """
    Print a headed test log for each of the three datasets, separated by blank lines.

    Args:
        df_amazon: Frame reported under the "Amazon" heading.
        df_imbd: Frame reported under the "IMBD" heading.
        df_yelp: Frame reported under the "Yelp" heading.
    """
    sections = (("Amazon", df_amazon), ("IMBD", df_imbd), ("Yelp", df_yelp))
    for position, (heading, frame) in enumerate(sections):
        if position:
            # Blank separator between sections, but not before the first one.
            print()
        print(heading)
        print_test_log(frame)



# Report confusion matrices and scores for the Train_Score columns currently stored
# on the three frames.
print_all_logs(df_amazon, df_imbd, df_yelp)

Amazon
[[453.  47.]
 [366. 134.]]
Precision: 0.7403314917127072
Accuracy: 0.587
Recall: 0.268

IMBD
[[285.  77.]
 [177. 209.]]
Precision: 0.7307692307692307
Accuracy: 0.660427807486631
Recall: 0.5414507772020726

Yelp
[[286. 214.]
 [142. 358.]]
Precision: 0.6258741258741258
Accuracy: 0.644
Recall: 0.716


In [8]:
# Inspect a few IMDb rows to compare the Score labels with the Train_Score predictions.
df_imbd.head()

Unnamed: 0,Text,Score,Train_Score
0,"A very, very, very slow-moving, aimless movie ...",0,False
1,Not sure who was more lost - the flat characte...,0,False
2,Attempting artiness with black & white and cle...,0,False
3,Very little music or anything to speak of.,0,False
4,The best scene in the movie was when Gerardo i...,1,False


In [9]:
def column_mean(list_of_lists):
    """
    Average a collection of rows position by position.

    Args:
        list_of_lists: Sequence of rows (presumably of equal length) that numpy
            can stack into an array.

    Returns:
        list: The mean taken along axis 0, converted to a plain Python list.
    """
    stacked = np.array(list_of_lists)
    return np.mean(stacked, axis=0).tolist()

def get_avg_embeds(df: pd.DataFrame, encodings: list):
    """
    Compute the mean embedding of the rows scored 1 and of the rows scored 0.

    Side effect: stores `encodings` on `df` as the column 'Encodings'.

    Args:
        df: Frame with a 'Score' column holding 1 / 0 labels.
        encodings: One embedding per row of `df`.

    Returns:
        tuple: (mean of Score == 1 embeddings, mean of Score == 0 embeddings),
        each as produced by column_mean.
    """
    df['Encodings'] = encodings

    def mean_for(label):
        # Select the rows carrying this label and average their embeddings.
        rows = df[df['Score'] == label].reset_index()
        return column_mean(rows['Encodings'])

    return mean_for(1), mean_for(0)


# Derive one positive and one negative mean embedding per dataset from its own
# labelled sentences.
# NOTE: this rebinds positive_amazon_enc / negative_amazon_enc, which were first
# set from df_amazon_pooled. compare_embeddings' default arguments were captured
# when that function was defined, so its defaults still hold the earlier values.
positive_imbd_enc, negative_imbd_enc = get_avg_embeds(df_imbd, encodings_imbd)
positive_amazon_enc, negative_amazon_enc = get_avg_embeds(df_amazon, encodings_amazon)
positive_yelp_enc, negative_yelp_enc = get_avg_embeds(df_yelp, encodings_yelp)

In [10]:
# Score all three datasets against the mean-embedding pair derived from each
# dataset in turn, printing one full report per reference pair.
# NOTE: each pair was averaged over its entire labelled dataset, so the report
# for the dataset a pair came from is an in-sample score, not a held-out one.
reference_sets = (
    ("Amazon", positive_amazon_enc, negative_amazon_enc),
    ("IMDb", positive_imbd_enc, negative_imbd_enc),
    ("Yelp", positive_yelp_enc, negative_yelp_enc),
)
scored_frames = (
    (df_amazon, encodings_amazon),
    (df_imbd, encodings_imbd),
    (df_yelp, encodings_yelp),
)

for name, positive, negative in reference_sets:
    for frame, frame_encodings in scored_frames:
        # The lambda is evaluated immediately by apply, so it sees this
        # iteration's positive/negative pair.
        frame["Train_Score"] = frame_encodings.apply(
            lambda x: compare_embeddings(x, positive=positive, negative=negative, hyper=0)
        )
    print(f"\n================= Dataset: {name} =================")
    print_all_logs(df_amazon, df_imbd, df_yelp)


Amazon
[[443.  57.]
 [ 46. 454.]]
Precision: 0.8884540117416829
Accuracy: 0.897
Recall: 0.908

IMBD
[[358.   4.]
 [ 50. 336.]]
Precision: 0.9882352941176471
Accuracy: 0.9278074866310161
Recall: 0.8704663212435233

Yelp
[[473.  27.]
 [ 27. 473.]]
Precision: 0.946
Accuracy: 0.946
Recall: 0.946

Amazon
[[465.  35.]
 [ 63. 437.]]
Precision: 0.9258474576271186
Accuracy: 0.902
Recall: 0.874

IMBD
[[343.  19.]
 [ 22. 364.]]
Precision: 0.9503916449086162
Accuracy: 0.9451871657754011
Recall: 0.9430051813471503

Yelp
[[478.  22.]
 [ 23. 477.]]
Precision: 0.9559118236472945
Accuracy: 0.955
Recall: 0.954

Amazon
[[462.  38.]
 [ 65. 435.]]
Precision: 0.919661733615222
Accuracy: 0.897
Recall: 0.87

IMBD
[[358.   4.]
 [ 58. 328.]]
Precision: 0.9879518072289156
Accuracy: 0.9171122994652406
Recall: 0.8497409326424871

Yelp
[[468.  32.]
 [ 23. 477.]]
Precision: 0.93713163064833
Accuracy: 0.945
Recall: 0.954


In [11]:
# Re-print the logs for the Train_Score columns as they currently stand on the frames.
print_all_logs(df_amazon, df_imbd, df_yelp)

Amazon
[[462.  38.]
 [ 65. 435.]]
Precision: 0.919661733615222
Accuracy: 0.897
Recall: 0.87

IMBD
[[358.   4.]
 [ 58. 328.]]
Precision: 0.9879518072289156
Accuracy: 0.9171122994652406
Recall: 0.8497409326424871

Yelp
[[468.  32.]
 [ 23. 477.]]
Precision: 0.93713163064833
Accuracy: 0.945
Recall: 0.954


In [12]:
# Load the pooled embeddings whose "positive", "negative" and "neutral" columns
# are read by later cells.
# NOTE(review): absolute, machine-specific path - it will not resolve elsewhere.
df_gpt_pooled = pd.read_json("/home/marcuswrrn/Projects/Semantic_Quantification/Semantic_Comparison/Data_Processing/Data/job_pooled_embeddings.json")

In [13]:
# Blend the IMDb and Amazon mean embeddings into a single positive and a single
# negative reference.
positive = column_mean([positive_imbd_enc, positive_amazon_enc])
negative = column_mean([negative_imbd_enc, negative_amazon_enc])

# Re-score every dataset against the blended references (no penalty term).
for frame, frame_encodings in (
    (df_amazon, encodings_amazon),
    (df_imbd, encodings_imbd),
    (df_yelp, encodings_yelp),
):
    frame["Train_Score"] = frame_encodings.apply(
        lambda vec: compare_embeddings(vec, positive=positive, negative=negative, hyper=0)
    )

print_all_logs(df_amazon, df_imbd, df_yelp)

Amazon
[[451.  49.]
 [ 49. 451.]]
Precision: 0.902
Accuracy: 0.902
Recall: 0.902

IMBD
[[351.  11.]
 [ 25. 361.]]
Precision: 0.9704301075268817
Accuracy: 0.9518716577540107
Recall: 0.9352331606217616

Yelp
[[471.  29.]
 [ 21. 479.]]
Precision: 0.9429133858267716
Accuracy: 0.95
Recall: 0.958


In [14]:
# Materialise the three pooled columns as plain lists of Python floats.
pos, neg, neut = (
    [float(value) for value in df_gpt_pooled[column]]
    for column in ("positive", "negative", "neutral")
)

# Cosine similarity between each blended reference and its same-polarity pooled vector.
print("Positive/Negative Tests:")
for reference, pooled in ((positive, pos), (negative, neg)):
    print(util.cos_sim(reference, pooled)[0][0])

# Cosine similarity between each blended reference and the neutral pooled vector.
print("\nNeutral Testing:")
for reference in (positive, negative):
    print(util.cos_sim(reference, neut)[0][0])

Positive/Negative Tests:
tensor(0.1606)
tensor(0.1677)

Neutral Testing:
tensor(0.0594)
tensor(0.0435)


In [15]:
# Probe one sentence against the positive, negative and neutral references, in that order.
sent = "I'm trapped in a wonderful dream"
enc = model.encode(sent)

for reference in (positive, negative, neut):
    print(util.cos_sim(reference, enc)[0][0])

tensor(0.1814)
tensor(0.1421)
tensor(0.1236)


In [16]:
# Average the pooled positive/negative vectors with the Yelp mean embeddings to
# form the reference pair for this run.
encodings_pos = column_mean([df_gpt_pooled["positive"], positive_yelp_enc])
encodings_neg = column_mean([df_gpt_pooled["negative"], negative_yelp_enc])

# Re-score every dataset against that pair (no penalty term).
for frame, frame_encodings in (
    (df_amazon, encodings_amazon),
    (df_imbd, encodings_imbd),
    (df_yelp, encodings_yelp),
):
    frame["Train_Score"] = frame_encodings.apply(
        lambda vec: compare_embeddings(vec, positive=encodings_pos, negative=encodings_neg, hyper=0)
    )