In [1]:
from sentence_transformers import SentenceTransformer, util
import numpy as np
import csv
import time
from os.path import isfile
import pandas as pd
import re
import spacy
import matplotlib.pyplot as plt
import Tools.processing as proc
import html

nlp = spacy.load("en_core_web_sm")
model = SentenceTransformer('all-mpnet-base-v2')

In [2]:
chunk_size = 1000000

df_reader = pd.read_json("./Data_Storage/All_Amazon_Review.json", lines=True, chunksize=chunk_size)

In [3]:
def split_dataframe(df):
    # Initialize an empty dictionary to store the new DataFrames
    split_dfs = {}

    # Loop through the values 1 to 5
    for i in range(1, 6):
        # Filter the original DataFrame based on the "overall" column value and store it in the dictionary
        split_dfs[i] = df[df["overall"] == i]

    return split_dfs

df_one   = pd.DataFrame()
df_two   = pd.DataFrame()
df_three = pd.DataFrame()
df_four  = pd.DataFrame()
df_five  = pd.DataFrame()

for i, chunk in enumerate(df_reader):
    print(f"Iteration: {i}")
    dfs = split_dataframe(chunk)
    df_one = df_one.append(dfs[1])
    df_two = df_two.append(dfs[2])
    df_three = df_three.append(dfs[3])
    df_four = df_four.append(dfs[4])
    df_five = df_five.append(dfs[5])
    print(len(df_two))
    break

Iteration: 0
55872


In [4]:
print(f"Five Stars: {len(df_five)}\nFour Stars: {len(df_four)}\nThree Stars: {len(df_three)}\nTwo Stars: {len(df_two)}\nOne Star: {len(df_one)}\n")

Five Stars: 600955
Four Stars: 150027
Three Stars: 79187
Two Stars: 55872
One Star: 113959



In [5]:
# Make sure that all dataframes have the same 
cutoff = len(df_two)

df5 = df_five.head(cutoff).reset_index()
df4 = df_four.head(cutoff).reset_index()
df3 = df_three.head(cutoff).reset_index()
df2 = df_two.head(cutoff).reset_index()
df1 = df_one.head(cutoff).reset_index()

In [6]:
df5.head()

Unnamed: 0,index,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,vote,image,style
0,5,5,False,"01 15, 2018",A3TXR8GLKS19RE,B017O9P72A,Nello,Great skill,Great,1515974400,,,
1,11,5,False,"12 25, 2017",A3KWZMO1A9TO3Z,B017O9P72A,Amazon Customer,So easy to connect and I&rsquo;m having way to...,This is awesome,1514160000,2.0,,
2,12,5,False,"12 18, 2017",A1QBG1TTQZGJNM,B017O9P72A,Artyhow,I use &ldquo;Echo&rdquo; as the wake word. I ...,Alexa and LIFX,1513555200,2.0,,
3,14,5,False,"11 24, 2017",A2LNJJWW2TLL00,B017O9P72A,jarhead,I haven't had any issues with this skill like...,Love it!,1511481600,,,
4,22,5,False,"06 29, 2017",ALBD3EW51P0PJ,B017O9P72A,Mageswari Babu,Good,Hooked it up today with ease,1498694400,,,


In [8]:
def decode_html(text):
    return html.unescape(str(text))

def preprocess_text(df: pd.DataFrame):
    df["summary"] = df["summary"].apply(lambda x: decode_html(x))

preprocess_text(df5)
preprocess_text(df4)
preprocess_text(df3)
preprocess_text(df2)
preprocess_text(df1)

In [9]:
def estimate_completion(iteration, total_iterations, start_time):
    elapsed_time = time.time() - start_time
    time_per_iteration = elapsed_time / iteration
    remaining_iterations = total_iterations - iteration
    estimated_completion_time = remaining_iterations * time_per_iteration
    return estimated_completion_time

def split_sentences(text: str):
    if type(text) != str:
        return None
    doc = nlp(text)
    return [sent.text.strip() for sent in doc.sents][0]

def column_mean(list_of_lists):
    # Convert the input list of lists to a numpy array
    data = np.array(list_of_lists)
    # Compute the mean average along the columns (axis=0)
    mean_average = np.mean(data, axis=0)
    # Convert the result back to a Python list
    mean_average_list = mean_average.tolist()
    return [mean_average_list]

def get_encoding(df: pd.DataFrame):
    encs = []
    # Initialize the start time
    start_time = time.time()
    for i, text in enumerate(df["summary"]):
        split_text = split_sentences(text)
        if split_text:
            encs.append(model.encode(split_text))
            encs = column_mean(encs)
            remaining_time = estimate_completion(i + 1, cutoff, start_time)
            print(f"Iteration {i}/{cutoff}: Estimated time remaining: {remaining_time:.2f} seconds")
    return encs

enc5 = get_encoding(df5)
enc1 = get_encoding(df1)


Iteration 0/55872: Estimated time remaining: 159330.13 seconds
Iteration 1/55872: Estimated time remaining: 80130.75 seconds
Iteration 2/55872: Estimated time remaining: 53700.84 seconds
Iteration 3/55872: Estimated time remaining: 40488.84 seconds
Iteration 4/55872: Estimated time remaining: 32564.07 seconds
Iteration 5/55872: Estimated time remaining: 27277.62 seconds
Iteration 6/55872: Estimated time remaining: 23501.34 seconds
Iteration 7/55872: Estimated time remaining: 20664.85 seconds
Iteration 8/55872: Estimated time remaining: 18474.60 seconds
Iteration 9/55872: Estimated time remaining: 16709.90 seconds
Iteration 10/55872: Estimated time remaining: 15268.35 seconds
Iteration 11/55872: Estimated time remaining: 14067.55 seconds
Iteration 12/55872: Estimated time remaining: 13049.93 seconds
Iteration 13/55872: Estimated time remaining: 12186.78 seconds
Iteration 14/55872: Estimated time remaining: 11430.69 seconds
Iteration 15/55872: Estimated time remaining: 10773.70 seconds
I

In [10]:
import json

data = {
    "Five_Stars": enc5,
    "One_Star":   enc1
}
df_pooled = pd.DataFrame(data)
#df_pooled = pd.read_json("./Data_Storage/Processed_Data/Amazon_Pooled_Single_Sentence.json")

df_pooled.to_json("./Data_Storage/Processed_Data/Amazon_Pooled_Single_Sentence_Summaries.json")


In [11]:
df_pooled["Five_Stars"][0]

[-0.00014820747813461526,
 -0.0029095830355915543,
 -0.01145250501881518,
 0.0375325204819147,
 -0.018044533591786094,
 0.02935987759802785,
 0.02750725259430576,
 0.03666118250581377,
 -0.022320177149487825,
 -0.009278822942819762,
 0.04952049603466155,
 0.043088561574886795,
 -0.03246729675026658,
 0.044056156929639295,
 0.005153539114789107,
 -0.028434937721355802,
 0.0028082714026704475,
 0.008204194066979345,
 -0.0006924866555403052,
 -0.05373165392623292,
 -0.008780292725815361,
 0.0009745961479963577,
 0.016438881899245916,
 -0.00364873175617322,
 -0.011416107247346362,
 0.02053637459552423,
 0.03920895734722407,
 0.02315766496002513,
 0.0028073241712843386,
 -0.01542327588835953,
 -0.03244978711316178,
 -0.020145439568218036,
 -0.019876449300376662,
 0.008776594583607552,
 1.7611608872708628e-06,
 -0.0033948293041655194,
 0.02516895851802549,
 -0.022870947670271723,
 -0.016072121414856463,
 0.0078027777028964965,
 0.008666297486364489,
 -0.022778924581129706,
 0.007530118447850

In [16]:
sent = "The world is a wonderful place"
encoding = model.encode(sent)
df_pooled2 = pd.read_json("/home/marcuswrrn/Projects/Semantic_Quantification/Semantic_Comparison/Data_Processing/Data/job_pooled_embeddings.json")
negative_enc = df_pooled["One_Star"]
positive_enc = df_pooled["Five_Stars"]

positive = [float(enc) for enc in df_pooled2["positive"]]
negative = [float(enc) for enc in df_pooled2["negative"]]


print(util.cos_sim(positive_enc, encoding)[0][0])
print(util.cos_sim(negative_enc, encoding)[0][0])
print(util.cos_sim(positive, encoding)[0][0])
print(util.cos_sim(negative, encoding)[0][0])

tensor(0.2154)
tensor(0.1244)
tensor(0.0744)
tensor(-0.0813)
