### UPDATE REVIEW FEATURE

Plan:
* Import review dataset [done]
* Groupby reviews in new way (prefix each review with its number of upvotes) [done]
* Generate dictionary with ID as key and concatenated REVIEW as value 

* Get list of all PINECONE IDs (parent_asin)
* For each Pinecone ID, match with parent_asin in dictionary, and update VALUE

In [1]:
import pandas as pd
import json

In [2]:
filepath = '/Users/ez/Downloads/Amazon_Fashion.jsonl'

# # Read some
# with open(filepath, 'r') as f:
#     reviews = []
#     # Read and parse the first 100,000 lines as JSON data
#     for _ in range(100000):
#         line = f.readline()
#         if line:
#             try:
#                 reviews.append(json.loads(line))
#             except json.JSONDecodeError:
#                 # Handle malformed JSON lines, if any
#                 pass

# Read all: the file is parsed one line at a time, each line as one JSON value.
reviews = []
skipped_lines = 0
# Explicit UTF-8 so decoding does not depend on the platform's default locale.
with open(filepath, 'r', encoding='utf-8') as f:
    for line in f:
        try:
            reviews.append(json.loads(line))
        except ValueError:
            # Unparseable line: keep going, but count it so data loss is visible.
            skipped_lines += 1

if skipped_lines:
    print('skipped unparseable lines:', skipped_lines)

len(reviews)

2500939

In [3]:
reviews = pd.DataFrame(reviews)

In [4]:
# REVIEWS: keep only the rows whose VERIFIED_PURCHASE value equals True
n_before = len(reviews)
print('length reviews before', n_before)
is_verified = reviews['verified_purchase'].eq(True)
reviews = reviews[is_verified]
print('length reviews after', len(reviews))

length reviews before 2500939
length reviews after 2337702


In [5]:
# REVIEWS: Drop verified_purchase col
reviews = reviews.drop(columns=['verified_purchase'])

In [6]:
reviews.head()

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote
0,5.0,Pretty locket,I think this locket is really pretty. The insi...,[],B00LOPVX74,B00LOPVX74,AGBFYI2DDIKXC5Y4FARTYDTQBMFQ,1578528394489,3
1,5.0,A,Great,[],B07B4JXK8D,B07B4JXK8D,AFQLNQNQYFWQZPJQZS6V3NZU4QBQ,1608426246701,0
2,2.0,Two Stars,One of the stones fell out within the first 2 ...,[],B007ZSEQ4Q,B007ZSEQ4Q,AHITBJSS7KYUBVZPX7M2WJCOIVKQ,1432344828000,3
3,1.0,Won’t buy again,Crappy socks. Money wasted. Bought to wear wit...,[],B07F2BTFS9,B07F2BTFS9,AFVNEEPDEIH5SPUN5BWC6NKL3WNQ,1546289847095,2
4,5.0,I LOVE these glasses,I LOVE these glasses! They fit perfectly over...,[],B00PKRFU4O,B00XESJTDE,AHSPLDNW5OOUK2PLH7GXLACFBZNQ,1439476166000,0


In [7]:
# Reformat function
def reformat_review(text, upvotes):
    """
    Prefix a review's text with its upvote count.

    Inputs:
        text: The review body.
        upvotes: The number of upvotes the review received.
    Returns:
        A string of the form "<upvotes> Upvoted: '<text>'".
    """
    template = "{} Upvoted: '{}'"
    return template.format(upvotes, text)

In [8]:
# Apply: build the "<votes> Upvoted: '<text>'" string for every review.
# Iterating the two needed columns directly avoids DataFrame.apply(axis=1),
# which builds a Series object per row and is slow on millions of rows.
reviews['text_w_vote'] = [
    reformat_review(text, votes)
    for text, votes in zip(reviews['text'], reviews['helpful_vote'])
]

In [9]:
# DROP UNNEEDED COLUMNS - everything but parent_asin, helpful_vote (used for sorting below), text_w_vote
reviews = reviews.drop(columns=['rating', 'title', 'text', 'images', 'asin', 'user_id', 'timestamp'])

In [10]:
reviews[:2]

Unnamed: 0,parent_asin,helpful_vote,text_w_vote
0,B00LOPVX74,3,3 Upvoted: 'I think this locket is really pret...
1,B07B4JXK8D,0,0 Upvoted: 'Great'


#### GROUPING

NEW APPROACH (PINECONE BYTE LIMIT 40960)

write code to 

1. group by ID column
2. concatenate review text into a new column, but starting with the reviews that have the most upvotes
3. run lambda
4. apply function: if the byte length of this new review column is > limit, truncate to the first 40500 bytes

In [11]:
# Sort the DataFrame by 'ID' and 'upvotes' in descending order
reviews = reviews.sort_values(by=['parent_asin', 'helpful_vote'], ascending=[True, False])

In [12]:
# Group the DataFrame by 'ID' and concatenate 'review_text' starting with the highest upvotes
reviews_grouped = reviews.groupby('parent_asin')['text_w_vote'].apply(lambda x: ' '.join(x))

In [48]:
# Create a new DataFrame with the grouped data
reviews_concat = pd.DataFrame({'parent_asin': reviews_grouped.index, 'concat_reviews': reviews_grouped.values})

In [49]:
reviews_concat.iloc[2536]

parent_asin                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

In [50]:
reviews_concat.shape # As expected!

(776976, 2)

In [51]:
reviews_concat.columns

Index(['parent_asin', 'concat_reviews'], dtype='object')

#### CROP REVIEW TEXT ACCORDING TO BYTE LIMIT

In [52]:
def calculate_byte_count(text, encoding='utf-8'):
    """
    Return how many bytes a string occupies once encoded.

    Inputs:
        text: The string to measure.
        encoding: Name of the codec used for encoding (UTF-8 by default).
    Returns:
        The length, in bytes, of the encoded form of the string.
    """
    return len(text.encode(encoding))

def truncate_string_to_bytes(text, max_bytes, encoding='utf-8'):
    """
    Cut a string down so that its encoded form fits in a byte budget.

    Inputs:
        text: The string to shorten.
        max_bytes: Upper bound on the encoded size of the result, in bytes.
        encoding: Name of the codec used to encode/decode (UTF-8 by default).
    Returns:
        The decoded prefix of the string whose encoding is at most
        max_bytes bytes long.
    """
    encoded_prefix = text.encode(encoding)[:max_bytes]

    # errors='ignore' drops bytes that cannot be decoded, e.g. the leading
    # bytes of a multi-byte character that the slice above cut in half.
    return encoded_prefix.decode(encoding, errors='ignore')

def inspect_text_length(text, max_bytes=40500):
    """
    Trim a string so that its UTF-8 encoding is at most max_bytes bytes.

    The Pinecone metadata byte limit is 40960 bytes; the default budget of
    40500 stays below it (presumably to leave headroom for the rest of the
    metadata payload - confirm).

    Inputs:
        text: The input string.
        max_bytes: Maximum UTF-8 size of the returned string, in bytes.
    Returns:
        The input unchanged when it already fits, otherwise the longest
        decodable prefix whose UTF-8 encoding is <= max_bytes bytes.
    """
    # Encode once and reuse the bytes for both the size check and the cut.
    encoded = text.encode('utf-8')

    if len(encoded) <= max_bytes:
        return text

    # errors='ignore' discards a multi-byte character split by the slice.
    return encoded[:max_bytes].decode('utf-8', errors='ignore')

In [53]:
# Do text length inspection
reviews_concat['reviews_truncated'] = reviews_concat['concat_reviews'].apply(lambda x: inspect_text_length(x))

In [70]:
# 'len_reviews' is not created by any earlier cell shown here, so compute it
# before use; otherwise a top-to-bottom run fails with a KeyError.
# NOTE(review): assumed to be the UTF-8 byte length, since the truncation
# limit is expressed in bytes - confirm against the original (missing) cell.
reviews_concat['len_reviews'] = reviews_concat['reviews_truncated'].apply(calculate_byte_count)
reviews_concat['len_reviews'].max()

40500

In [69]:
# Drop unneeded col
reviews_concat = reviews_concat.drop(columns=['concat_reviews'])

In [71]:
reviews_concat[:2]

Unnamed: 0,parent_asin,reviews_truncated,len_reviews
0,99813,"0 Upvoted: 'I'm 6'1"" and usually get xl but at...",370
1,310807115,0 Upvoted: 'I carry my bible and I le accesso...,119


In [55]:
# Adjust display of text
# pd.set_option('display.max_colwidth', None)
# pd.set_option('display.max_colwidth', 50)

In [72]:
# Make df into dictionary for easy retrieval.
# Keys are parent_asin values; each value is a LIST (built by apply(list)) of
# the reviews_truncated strings for that key, which is why the update cells
# below read the text with reviews_dict[<id>][0].
reviews_dict = reviews_concat.groupby('parent_asin')['reviews_truncated'].apply(list).to_dict()

In [None]:
# Save
# reviews_concat.to_parquet('reviews_truncated_052324.parquet')

### Send to Pinecone

In [74]:
from dotenv import load_dotenv
load_dotenv()

True

In [75]:
# initialize Pinecone vector store
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os

In [76]:
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
pc = Pinecone(api_key=PINECONE_API_KEY)

In [77]:
# Load vector store (the index is created only if it does not exist yet)
index_name = 'description-embed'

# Vector dimension used if the index has to be created. The original code read
# emb_shape[0], but emb_shape is not defined anywhere in this notebook, so the
# creation branch would raise NameError.
# NOTE(review): 3072 is the dimension describe_index_stats() reports for this
# index below - confirm it matches the embedding model in use.
embedding_dim = 3072

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embedding_dim,
        metric='cosine',
        spec=ServerlessSpec(
            cloud='aws', 
            region='us-east-1'
        )
    )

In [78]:
description_index = pc.Index(index_name)
description_index.describe_index_stats()

{'dimension': 3072,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 776976}},
 'total_vector_count': 776976}

UPDATE METADATA

In [83]:
# ids = set(reviews_dict.keys()) #776976
# len(ids)

ids = list(reviews_dict.keys())  # Convert to a list for indexing
print(len(ids))
num_ids = len(ids)

776976


In [None]:
# DO UPDATE!
from tqdm import tqdm

# for id in ids:
# 	description_index.update(
# 		id = id, 
# 		set_metadata = {"review": reviews_dict[id][0]}
# 	)

ids = list(reviews_dict.keys())  # Convert to a list for indexing
num_ids = len(ids)

# Define the batch size (here it only controls how often the progress bar is
# advanced; the requests themselves are sent one ID at a time)
batch_size = 1000

# Index.update() modifies ONE vector per call: it takes a single `id` and a
# `set_metadata` dict. The earlier version passed `ids=[...]` and a list of
# dicts, which is not a supported call signature.
# The context manager closes the progress bar even if an update raises.
with tqdm(total=num_ids, unit='id') as pbar:
    # Split the IDs into batches
    for i in range(0, num_ids, batch_size):
        id_batch = ids[i:i+batch_size]

        for _id in id_batch:
            description_index.update(
                id=_id,
                set_metadata={"review": reviews_dict[_id][0]},
            )

        # Update the progress bar by the number of IDs just processed
        pbar.update(len(id_batch))

PARALLELIZE

In [102]:
import itertools
import concurrent.futures

In [114]:
def chunks(iterable, batch_size=100):
    """Yield consecutive lists of up to batch_size items taken from iterable.

    The final list may be shorter; nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling slices until an empty list comes
    # back, which signals that the source is exhausted.
    for batch in iter(lambda: list(itertools.islice(source, batch_size)), []):
        yield batch

# ids and reviews_dict are assumed to be defined
ids = list(reviews_dict.keys())

def update_review(index_name, id, review_text):
    """Set the "review" metadata field of a single vector.

    Inputs:
        index_name: Either the name of a Pinecone index (str) or an index
            object exposing update(). parallel_update() in this cell passes
            the already-opened description_index object, so both are accepted.
        id: ID of the vector to update.
        review_text: Value stored under the "review" metadata key.
    """
    # Reuse the notebook-level client `pc` instead of building a new client
    # for every single update. The previous version also handed whatever it
    # received to pc.Index(...), which expects an index *name*.
    # NOTE(review): the previous `with Pinecone(...) as pc:` form assumed the
    # client is a context manager - the Pinecone docs show the index, not the
    # client, used that way; confirm before reintroducing it.
    if isinstance(index_name, str):
        index = pc.Index(index_name)
    else:
        index = index_name

    index.update(id=id, set_metadata={"review": review_text})

# Define the maximum number of workers for parallel execution
max_workers = 12

# Create a list of (id, review_text) tuples for each ID in ids.
# reviews_dict maps every ID to a list (it was built with apply(list)), so
# take element [0] to send the review string itself - the same access the
# other update cells in this notebook use - rather than the wrapping list.
id_review_pairs = [(_id, reviews_dict[_id][0]) for _id in ids]

# Define a function to update review metadata for each id in parallel
def parallel_update(id_review_pairs):
    """Update the review metadata for each (id, review_text) pair in a thread pool.

    Inputs:
        id_review_pairs: Iterable of (id, review_text) tuples.
    Raises:
        RuntimeError: After every task has finished, if at least one update
            raised. The first underlying exception is attached as the cause.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks for parallel execution, remembering which ID each
        # future belongs to so failures can be reported.
        future_to_id = {
            executor.submit(update_review, description_index, vector_id, review_text): vector_id
            for vector_id, review_text in id_review_pairs
        }
        # Wait for all tasks to complete
        concurrent.futures.wait(future_to_id)

    # wait() alone never surfaces exceptions raised inside the workers, so a
    # failing update would go unnoticed; inspect every future explicitly.
    failures = [
        (vector_id, future.exception())
        for future, vector_id in future_to_id.items()
        if future.exception() is not None
    ]
    if failures:
        first_id, first_error = failures[0]
        raise RuntimeError(
            "{} of {} updates failed (first failed id: {!r})".format(
                len(failures), len(future_to_id), first_id
            )
        ) from first_error

# Split id_review_pairs into chunks for parallel processing
id_review_chunks = list(chunks(id_review_pairs, batch_size=100)) #0000099813, 0310807115

In [115]:
# Perform parallel updates for each chunk
for chunk in id_review_chunks:
    parallel_update(chunk)

PARALLELIZE 2

In [117]:
import concurrent.futures

# Define the maximum number of workers for parallel execution
max_workers = 10

# Define a function to update review metadata for a single ID
def update_review(id):
    """Write the first entry of reviews_dict[id] into that vector's "review" metadata."""
    review_text = reviews_dict[id][0]
    description_index.update(id=id, set_metadata={"review": review_text})

# Create a list of IDs for batch updates
ids = list(reviews_dict.keys())

# Define the function for parallel execution
def parallel_update(ids):
    """Run update_review for every ID in ids using a thread pool.

    Inputs:
        ids: Iterable of vector IDs to update.
    Raises:
        RuntimeError: After every task has finished, if at least one update
            raised. The first underlying exception is attached as the cause.
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Submit tasks for parallel execution, remembering which ID each
        # future belongs to so failures can be reported.
        future_to_id = {
            executor.submit(update_review, vector_id): vector_id
            for vector_id in ids
        }
        # Wait for all tasks to complete
        concurrent.futures.wait(future_to_id)

    # wait() alone never surfaces exceptions raised inside the workers, so a
    # failing update would go unnoticed; inspect every future explicitly.
    failures = [
        (vector_id, future.exception())
        for future, vector_id in future_to_id.items()
        if future.exception() is not None
    ]
    if failures:
        first_id, first_error = failures[0]
        raise RuntimeError(
            "{} of {} updates failed (first failed id: {!r})".format(
                len(failures), len(future_to_id), first_id
            )
        ) from first_error

# Split the list of IDs into chunks for parallel processing
batch_size = 1000  # Adjust as needed
id_chunks = [ids[i:i+batch_size] for i in range(0, len(ids), batch_size)]

In [120]:
# Perform parallel updates for each chunk
for chunk in id_chunks:
    parallel_update(chunk)