In [1]:
# Mount Google Drive inside the Colab filesystem at /content/drive so the
# project folder used by later cells is reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
! pip install weaviate-client
!pip install tiktoken

Collecting weaviate-client
  Downloading weaviate_client-4.5.4-py3-none-any.whl (306 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m306.8/306.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx==0.27.0 (from weaviate-client)
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting validators==0.22.0 (from weaviate-client)
  Downloading validators-0.22.0-py3-none-any.whl (26 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client)
  Downloading Authlib-1.3.0-py2.py3-none-any.whl (223 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m223.7/223.7 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
Collecting grpcio-tools<2.0.0,>=1.57.0 (from weaviate-client)
  Downloading grpcio_tools-1.62.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
import re
import tiktoken
import random
import weaviate
from weaviate import EmbeddedOptions


In [4]:
# Work from the project folder on the mounted Drive so that relative paths
# such as "data/Reviews.csv" resolve against it.
directory_path ='/content/drive/MyDrive/summarization_amazon_reviews'
os.chdir(directory_path)

In [None]:
# import zipfile

# with zipfile.ZipFile('archive.zip', 'r') as zip_ref:
#     zip_ref.extractall('data')

## Reading dataset

In [5]:
# Load just the product id and the review text, then discard every row whose
# review text is missing.
df = (
    pd.read_csv("data/Reviews.csv", usecols=['ProductId', 'Text'])
    .dropna(subset=['Text'])
)

In [6]:
# Preview the first rows of the loaded reviews frame.
df.head()

Unnamed: 0,ProductId,Text
0,B001E4KFG0,I have bought several of the Vitality canned d...
1,B00813GRG4,Product arrived labeled as Jumbo Salted Peanut...
2,B000LQOCH0,This is a confection that has been around a fe...
3,B000UA0QIQ,If you are looking for the secret ingredient i...
4,B006K2ZZ7K,Great taffy at a great price. There was a wid...


In [7]:
# Number of reviews left after dropping rows without text.
len(df)

568454

In [8]:
# Column dtypes, non-null counts and memory footprint of the frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 568454 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ProductId  568454 non-null  object
 1   Text       568454 non-null  object
dtypes: object(2)
memory usage: 13.0+ MB


In [9]:
# Count the distinct products represented in the reviews.
unique_product_count = df['ProductId'].nunique()
print('Total unique product ids in dataset=', unique_product_count)

Total unique product ids in dataset= 74258


## Set OpenAI API key

In [41]:
# Never store the key in the notebook itself: reuse OPENAI_API_KEY when the
# environment already provides it, otherwise ask for it interactively without
# echoing the value into the cell output.
from getpass import getpass

if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")


In [42]:
# Confirm a usable key is present before any cell needs it. An empty or
# whitespace-only value is treated as missing, since it cannot authenticate.
if (os.getenv("OPENAI_API_KEY") or "").strip():
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

OPENAI_API_KEY is ready


## Connect to Weaviate instance

In [45]:
# Connect to the hosted Weaviate instance. The OpenAI key is passed along as a
# request header (presumably consumed by the OpenAI modules configured in the
# schema - confirm against the instance's module setup).
openai_headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_API_KEY")}
client = weaviate.Client(
    url="https://myurlforweaviate.weaviate.network",
    additional_headers=openai_headers,
)

# Last expression of the cell: displays whether the instance is live and ready.
client.is_ready()

#client.schema.delete_all()

True

## Schema


In [46]:
# Recreate the "Reviews" class from scratch: remove it if it already exists,
# then define it again below.
if client.schema.exists("Reviews"):
    client.schema.delete_class("Reviews")

# Review body; uses the class-level vectorizer settings.
review_text_property = {
    "name": "review_text",
    "dataType": ["text"],
}

# Product identifier; kept as a property for filtering but excluded from the
# text that gets vectorized.
product_id_property = {
    "name": "product_id",
    "dataType": ["text"],
    "moduleConfig": {
        "text2vec-openai": {
            "skip": True,  # skipping vectorization for this property
            "vectorizePropertyName": False,
        }
    },
}

# Settings for the embedding module and the generative module of the class.
vectorizer_settings = {
    "vectorizeClassName": False,
    "model": "ada",
    "modelVersion": "002",
    "type": "text",
}
generative_settings = {
    "model": "gpt-3.5-turbo",
}

class_obj = {
    "class": "Reviews",
    "properties": [review_text_property, product_id_property],
    "vectorizer": "text2vec-openai",
    "moduleConfig": {
        "text2vec-openai": vectorizer_settings,
        "generative-openai": generative_settings,
    },
}

client.schema.create_class(class_obj)

In [47]:
# Show the column labels available on the reviews frame.
df.columns

Index(['ProductId', 'Text'], dtype='object')

## Populating database in batches


In [48]:
from weaviate.util import generate_uuid5

# Upload objects in groups of 100 instead of issuing one request per review.
client.batch.configure(batch_size=100)

with client.batch as batch:
    # itertuples yields lightweight namedtuples; iterrows would build a full
    # Series for each of the several hundred thousand rows, which is far slower.
    for row in df.itertuples(index=False):
        review_item = {
            "review_text": row.Text,
            "product_id": row.ProductId,
        }

        # The UUID is derived from the object's content, so an identical
        # (review_text, product_id) pair always maps to the same id.
        batch.add_data_object(
            class_name="Reviews",
            data_object=review_item,
            uuid=generate_uuid5(review_item),
        )

## Prompt

In [None]:
# Task text handed to the generative module when summarising a product's reviews.
# NOTE(review): "{review_text}" looks like a property placeholder; confirm that
# grouped tasks actually interpolate it rather than sending it literally.
generate_prompt = (
    "\n"
    "Summarize these customer reviews into one paragraph long review:\n"
    "{review_text}\n"
)

## Generate summary of products

In [50]:
# For every product: fetch its reviews, ask the generative module for one
# grouped summary, and store that summary as an object of class "Products".
# A product whose query or generation fails is reported and skipped, so a single
# bad response does not abort the whole (long-running) loop or store an empty
# summary.
for product_id in df.ProductId.unique():
    response = (
        client.query
        .get('Reviews', ['review_text', "product_id"])
        .with_where({
            "path": ["product_id"],
            "operator": "Equal",
            "valueText": product_id
        })
        .with_generate(grouped_task=generate_prompt)
        .do()
    )

    # A failed query may carry an "errors" entry and/or no usable "data" payload.
    reviews = ((response.get("data") or {}).get("Get") or {}).get("Reviews") or []
    if "errors" in response or not reviews:
        print(f"Skipping {product_id}: no reviews returned ({response.get('errors')})")
        continue

    # The grouped result is read from the first returned object; the generative
    # module can report its own failure next to it.
    generated = (reviews[0].get("_additional") or {}).get("generate") or {}
    grouped_summary = generated.get("groupedResult")
    if generated.get("error") or not grouped_summary:
        print(f"Skipping {product_id}: summary generation failed ({generated.get('error')})")
        continue

    new_review_summary = {
        "product_id" : product_id,
        "summary": grouped_summary
    }

    # Create new object
    client.data_object.create(
      data_object = new_review_summary,
      class_name = "Products",
      uuid = generate_uuid5(new_review_summary)
    )

## Viewing the summaries generated for different product_ids

### For product_id='B000LKVRQA'

In [51]:
# Fetch the stored reviews whose product_id equals "B000LKVRQA".
product_filter_1 = {
    "path": ["product_id"],
    "operator": "Equal",
    "valueText": "B000LKVRQA",
}
reviews_1 = (
    client.query
    .get('Reviews', ['review_text', "product_id"])
    .with_where(product_filter_1)
    .do()
)

In [52]:
# Reviews present in the dataset for this product (raw query response).
reviews_1

{'data': {'Get': {'Reviews': [{'product_id': 'B000LKVRQA',
    {'product_id': 'B000LKVRQA',
     'review_text': "St. Dalfour's Earl Grey is one of my favorite teas. This tea has such a distinctively-delicious and refreshing flavor. The French are true connoisseurs of brilliance and this tea falls under such auspices.  Have a cup of this with a slice of homemade bread, smothered in St. Dalfour's marmalade and you'll know what I'm writing about."},
    {'product_id': 'B000LKVRQA',
     'review_text': "I wanted to branch out and try this Earl Grey blend (I've tried and enjoyed many as it's my favorite tea).  This tastes too much like licorice for me.  I had hoped it would grow on me with time, but I'm still not feeling this one.  Won't be purchasing again."},
    {'product_id': 'B000LKVRQA',
     'review_text': '*****<br />St. Dalfour\'s wonderful Certified Organic Earl Grey Tea is made from hand-picked, organic Ceylon tea. The flavor is enhanced by "the all natural flavor of bergamot ora

In [53]:
# Fetch the generated summary stored in "Products" for product_id "B000LKVRQA".
summary_filter_1 = {
    "path": ["product_id"],
    "operator": "Equal",
    "valueText": "B000LKVRQA",
}
result_1 = (
    client.query
    .get('Products', ['product_id', 'summary'])
    .with_where(summary_filter_1)
    .do()
)

In [54]:
# Generated summary for this product (raw query response).
result_1

{'data': {'Get': {'Products': [{'product_id': 'B000LKVRQA',
     'summary': "Overall, the reviews for St. Dalfour's Earl Grey tea are mixed. Some customers enjoy the unique licorice flavor and find it refreshing, while others are disappointed by the lack of bergamot aroma and flavor typically associated with Earl Grey tea. Those who appreciate the organic and natural ingredients highly recommend this tea, praising its distinct and floral taste. Some customers even claim it is the best Earl Grey they have ever tried, with a smooth and heavy texture that leaves a lasting impression. However, those seeking a traditional Earl Grey experience may be let down by the deviation from the classic flavor profile."}]}}}

### For product_id='B000Z4Y50M'

In [57]:
# Fetch the stored reviews whose product_id equals "B000Z4Y50M".
product_filter_2 = {
    "path": ["product_id"],
    "operator": "Equal",
    "valueText": "B000Z4Y50M",
}
reviews_2 = (
    client.query
    .get('Reviews', ['review_text', "product_id"])
    .with_where(product_filter_2)
    .do()
)

In [58]:
# Reviews present in the dataset for this product (raw query response).
reviews_2

{'data': {'Get': {'Reviews': [{'product_id': 'B000Z4Y50M',
     'review_text': "This hot sauce is without a doubt one of the best jalapeno hot sauces' on the market.  This is my second order and it is a great value for the price. It is a required staple in my pantry!  If you love hot sauce, you won't regret this purchase!"},
    {'product_id': 'B000Z4Y50M',
     'review_text': "Probably the best tasting hot sauce I've tried.  Has a nice kick to it too, but not so hot it's unbearable."}]}}}

In [61]:
# Fetch the generated summary stored in "Products" for product_id "B000Z4Y50M".
summary_filter_2 = {
    "path": ["product_id"],
    "operator": "Equal",
    "valueText": "B000Z4Y50M",
}
result_2 = (
    client.query
    .get('Products', ['product_id', 'summary'])
    .with_where(summary_filter_2)
    .do()
)

In [62]:
# Generated summary for this product (raw query response).
result_2

{'data': {'Get': {'Products': [{'product_id': 'B000Z4Y50M',
     'summary': 'Customers rave about this jalapeno hot sauce, calling it one of the best on the market. They praise its great value for the price and note that it has become a staple in their pantry. With a nice kick but not too unbearable, this hot sauce is highly recommended for hot sauce lovers.'}]}}}

### For product_id='B003ULDEU4'

In [86]:
# Fetch the stored reviews whose product_id equals "B003ULDEU4".
product_filter_3 = {
    "path": ["product_id"],
    "operator": "Equal",
    "valueText": "B003ULDEU4",
}
reviews_3 = (
    client.query
    .get('Reviews', ['review_text', "product_id"])
    .with_where(product_filter_3)
    .do()
)

In [87]:
# Reviews present in the dataset for this product (raw query response).
reviews_3

{'data': {'Get': {'Reviews': [{'product_id': 'B003ULDEU4',
     'review_text': '*****<br />Dry Desert Lime is an unusual premium Numi tea. It is herbal tea (caffeine-free) containing solely dried lime---that\'s all---not lime and green/black tea, not artificial lime flavor, but real lime---alone---a truly remarkable feat. It is certified organic, and is kosher. It has the most amazing tart and lovely taste. If you like lime, you\'ll love this tea. If you don\'t like lime, definitely pass, as it is a very intense "lime experience". I love it!<br /><br />On the box, the manufacturer lyrically writes: "From the far reaches of the Arabian Desert, this rare lime has been enjoyed for thousands of years, both as a spice in Middle Eastern cuisine and as a teasan. Traditionally harvested and dried in the hot desert sun, this oasis of Vitamin C has been drunk to soothe just about everything. The first taste brings a distant yet familiar impression, leaving the trace of a fleeting memory on the t

In [84]:
# Fetch the generated summary stored in "Products" for product_id "B003ULDEU4".
summary_filter_3 = {
    "path": ["product_id"],
    "operator": "Equal",
    "valueText": "B003ULDEU4",
}
result_3 = (
    client.query
    .get('Products', ['product_id', 'summary'])
    .with_where(summary_filter_3)
    .do()
)

In [85]:
# Generated summary for this product (raw query response).
result_3

{'data': {'Get': {'Products': [{'product_id': 'B003ULDEU4',
     'summary': 'Numi Dry Desert Lime herbal tea is a unique and premium tea made solely from dried lime, offering a tart and lovely taste that is perfect for lime lovers. The tea is certified organic, kosher, and described as a clean, bracing, and reviving citrus pleasure. Customers highly recommend this tea for its refreshing flavor, whether enjoyed hot or cold. Some even use it in recipes, like a Dried Lime Lentil soup, and mix it with other teas for a unique flavor experience. While some may find the lime flavor too intense, overall, Numi Dry Desert Lime tea is praised for its exotic and refreshing taste that stands out from other herbal teas on the market.'}]}}}