In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext, SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from delta import *
from chunkipy import TextChunker, TokenEstimator
from numpy import exp
import boto3
import builtins
import json
import math
import os
import pandas as pd
import numpy as np
import logging
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build the "amzn-reviews" session: Delta Lake integration, driver/executor
# sizing, Arrow-based pandas conversion, Parquet options and Hive support.
builder = (
    SparkSession.builder.appName("amzn-reviews")
    # Delta Lake SQL extension and catalog
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Driver and executor resources
    .config("spark.driver.cores", "2")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory","28g")
    .config("spark.executor.cores","5")
    .config("spark.executor.instances","2")
    # Arrow transfer between Spark and pandas, with fallback enabled
    .config("spark.sql.execution.arrow.pyspark.enabled",True)
    .config("spark.sql.execution.arrow.pyspark.fallback.enabled", True)
    # Parquet: no schema merging, no summary-metadata files
    .config("spark.sql.parquet.mergeSchema", False)
    .config("spark.hadoop.parquet.enable.summary-metadata", False)
    .enableHiveSupport()
)

# Let the delta helper finish configuring the builder, then start (or reuse)
# the session and keep a handle on its SparkContext.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/spark-3.5.0-bin-hadoop3/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
io.delta#delta-spark_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-921b495f-9259-4f63-b7fb-5f6bfae3c452;1.0
	confs: [default]
	found io.delta#delta-spark_2.12;3.1.0 in local-m2-cache
	found io.delta#delta-storage;3.1.0 in local-m2-cache
	found org.antlr#antlr4-runtime;4.9.3 in local-m2-cache
:: resolution report :: resolve 168ms :: artifacts dl 8ms
	:: modules in use:
	io.delta#delta-spark_2.12;3.1.0 from local-m2-cache in [default]
	io.delta#delta-storage;3.1.0 from local-m2-cache in [default]
	org.antlr#antlr4-runtime;4.9.3 from local-m2-cache in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	--------------------------------------------------------------

In [3]:
# Read the S3 credentials stored as JSON under the 's3all' secret.
REGION = "us-east-1"
client = boto3.client('secretsmanager',region_name=REGION)
response = client.get_secret_value(SecretId='s3all')
accessJson = json.loads(response['SecretString'])
accessKeyId = accessJson['accessKey']
secretAccessKey = accessJson['secretAccess']

# Apply the S3A settings to the Hadoop configuration of the running context.
hadoop_conf = sc._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", accessKeyId)
hadoop_conf.set("fs.s3a.secret.key", secretAccessKey)
hadoop_conf.set("fs.s3a.path.style.access", "true")
hadoop_conf.set("fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
hadoop_conf.set("fs.s3a.endpoint", "s3.us-east-1.amazonaws.com")

In [4]:
# S3 location of the table loaded by this notebook.
REGION = "us-east-1"
BUCKET = "amzn-customer-reviews-228924278364"
# Alternative prefix (a single review_day partition), kept for reference:
#PREFIX = "amzn-customer-reviews-partitioned/category=Books/review_year=2014/review_month=10/review_day=10/"
PREFIX = "delta/amznreviews/books/"
PATH = "s3a://" + BUCKET + "/" + PREFIX

print(PATH)

s3a://amzn-customer-reviews-228924278364/delta/amznreviews/books/


In [5]:
def getBodyJson(system_prompt, message_json, max_tokens=2048, temperature=0.5):
    """Serialize a single-message request body for the model invocation.

    Args:
        system_prompt: Text placed in the request's "system" field.
        message_json: One message dict (for example the result of
            getMessageJson); it becomes the only entry of "messages".
        max_tokens: Value of the "max_tokens" field. Defaults to 2048.
        temperature: Value of the "temperature" field. Defaults to 0.5.

    Returns:
        The request body as a JSON string. With the default arguments the
        output is identical to the previous hard-coded version.
    """
    request = {
        "max_tokens": max_tokens,
        "temperature": temperature,
        "system": system_prompt,
        "messages": [ message_json ],
        "anthropic_version": "bedrock-2023-05-31"
    }
    return json.dumps(request)

def getMessageJson(user_prompt):
    """Wrap a prompt string in a single user-role message dict.

    Args:
        user_prompt: Prompt text; it is embedded between a "Human:" prefix
            and an "Assistant:" suffix before being stored.

    Returns:
        A dict with "role" set to "user" and "content" holding one text
        block that carries the wrapped prompt.
    """
    # NOTE(review): the Human:/Assistant: markers look like the legacy
    # text-completion prompt format; confirm they are wanted inside a
    # messages-style request.
    text_block = {
        "type": "text",
        "text": f"""\n\n\nHuman: {user_prompt}\n\nAssistant:""",
    }
    return {"role": "user", "content": [text_block]}

def getPrompt(type,txt):
    """Build the user prompt for one review.

    Args:
        type: Prompt kind: "summary", "sentiment" or "sentiment_one_label".
            (The parameter name shadows the builtin inside this function.)
        txt: Review text interpolated into the prompt.

    Returns:
        The prompt string, or the literal "error" for any other ``type``.
    """
    if type == "summary":
        return f"""Write a summary of the following review; all your output has to be grammatically correct in a single paragraph:\n{txt}"""
    if type == "sentiment":
        return f"""The text below is a book review. Tell me the sentiment of the text here:\n{txt}\n The sentiment categories are: \n(1) Positive\n(2) Negative\n(3) Neutral"""
    if type == "sentiment_one_label":
        return f"""The text below is a book review.\nTell me the sentiment of the text here by returning only one label, which can be (1) Positive, (2) Negative, or (3) Neutral.\nThe text is here:\n{txt}\n"""
    # Unknown prompt kind: callers compare against this sentinel value.
    return "error"



In [19]:
def getPdfAnthropic(df:pd.DataFrame)-> pd.DataFrame:
    """Add a model-generated summary to every review in a pandas batch.

    Written to be passed to ``applyInPandas``: it receives one group of rows
    as a pandas DataFrame and returns the same rows with an extra
    ``anthropicSummary`` column. The model is invoked once per row, in order.

    Args:
        df: Batch that contains at least the columns selected at the end of
            this function, including ``reviewText``.

    Returns:
        A new DataFrame with the selected input columns plus
        ``anthropicSummary``. Rows whose ``reviewText`` is null get a null
        summary and no model call is made for them.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
        Any exception raised by the Bedrock client propagates unchanged.
    """
    # Define model ID
    model_id = 'anthropic.claude-3-haiku-20240307-v1:0'
    # Set system prompt
    system_prompt = "The texts here are all book reviews."

    # One client per batch, shared by every row of this call
    client_bedrock = boto3.client("bedrock-runtime", REGION)

    def getAnthropic(prompt_type, txt):
        """Return the model's text answer for one review, or "error"."""
        user_prompt = getPrompt(prompt_type, txt)
        if user_prompt == "error":
            return "error"

        body = getBodyJson(system_prompt, getMessageJson(user_prompt))
        response = client_bedrock.invoke_model(modelId=model_id, body=body)
        response_body = json.loads(response["body"].read())
        return response_body["content"][0]["text"]

    # Read the column directly instead of iterating whole rows, and keep the
    # results as plain Python strings: the previous numpy round trip coerced
    # them into a fixed-width unicode array for no benefit.
    # A null review is skipped; otherwise the literal text "None" would be
    # sent to the model and summarised.
    summaries = [
        None if pd.isna(review) else getAnthropic("summary", review)
        for review in df["reviewText"]
    ]

    output_columns = [
        "asin", "overall", "reviewText",
        "reviewTimeTS", "reviewerID",
        "reviewerName", "summary",
        "verified", "row_id",
        "asin_key", "row_number",
    ]

    return df[output_columns].assign(anthropicSummary=summaries)
    

In [7]:
# Load the Delta table stored at PATH into a Spark DataFrame.
df = spark.read.format("delta").load(PATH)

24/03/19 02:23:00 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [None]:
# Show the Python type of the loaded object.
print(type(df))

In [None]:
# Display the column names and types of df.
df.printSchema()

In [8]:
# Count the rows of df.
df.count()

24/03/19 02:23:06 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

23184

In [9]:
 # Create a sequential row_id column
# Assign a sequential, 1-based row_id to every review.
# The window is partitioned by a constant so all rows are numbered together
# (one global sequence, evaluated in a single partition).
# Ordering by real columns instead of a constant literal makes the numbering
# reproducible between runs; rows that tie on all four columns, if any, are
# still numbered in arbitrary order among themselves.
w = Window.partitionBy(lit("A")).orderBy(
    "asin_key", "row_number", "reviewerID", "reviewTimeTS"
)
df = df.withColumn("row_id", row_number().over(w))

In [10]:
# Display the schema of df, which now includes row_id.
df.printSchema()

root
 |-- asin: string (nullable = true)
 |-- overall: double (nullable = true)
 |-- reviewText: string (nullable = true)
 |-- reviewTimeTS: timestamp (nullable = true)
 |-- reviewerID: string (nullable = true)
 |-- reviewerName: string (nullable = true)
 |-- summary: string (nullable = true)
 |-- verified: boolean (nullable = true)
 |-- row_number: integer (nullable = true)
 |-- asin_key: integer (nullable = true)
 |-- row_id: integer (nullable = false)



In [27]:
# Keep at most 30 rows as a small test sample.
df_test = df.limit(30)

In [28]:
# Confirm the size of the test sample.
df_test.count()

30

In [29]:
# Preview the test sample (show() prints the first 20 rows by default).
df_test.show()

+----------+-------+--------------------+-------------------+--------------+-------------------+--------------------+--------+----------+--------+------+
|      asin|overall|          reviewText|       reviewTimeTS|    reviewerID|       reviewerName|             summary|verified|row_number|asin_key|row_id|
+----------+-------+--------------------+-------------------+--------------+-------------------+--------------------+--------+----------+--------+------+
|0002005107|    5.0|Thumps DreadfulWa...|2008-07-13 00:00:00| AACMLWNILA6CL|Diana Tixier Herald|         Love Thumps|   false|         1| 2005107|     1|
|0006280544|    3.0|Mere Chistianity ...|2008-07-13 00:00:00|A1W98Q4MZK80VA|Joseph P. Tevington|Nothing "Mere" Ab...|   false|         1| 6280544|     2|
|0007127898|    5.0|Mabinogion myth m...|2008-07-13 00:00:00|A1VCUE373CMQBY|    Amazon Customer|Let this story bl...|   false|         1| 7127898|     3|
|0007256760|    4.0|[...]\nIt was a g...|2008-07-13 00:00:00| APPWA0SMSBT44|

In [30]:
# Output schema handed to applyInPandas: the review columns plus the
# generated anthropicSummary. Every field is declared nullable.
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ('asin', StringType()),
        ('overall', DoubleType()),
        ('reviewText', StringType()),
        ('reviewTimeTS', TimestampType()),
        ('reviewerID', StringType()),
        ('reviewerName', StringType()),
        ('summary', StringType()),
        ('verified', BooleanType()),
        ('row_id', IntegerType()),
        ('row_number', IntegerType()),
        ('asin_key', IntegerType()),
        ("anthropicSummary", StringType()),
    ]
])

In [31]:
# Group the test rows by the id of the Spark partition that holds them.
partition_id = spark_partition_id().alias("_pid")
df_summary_test = df_test.groupBy(partition_id)

In [32]:
# Show how many test rows fall in each partition group.
df_summary_test.count().show()

+----+-----+
|_pid|count|
+----+-----+
|   0|   30|
+----+-----+



In [33]:
# Run getPdfAnthropic once per partition group; each group arrives as a
# pandas DataFrame and the result must match `schema`.
# This only defines the transformation; nothing runs until an action.
df_summary = (
    df_test
    .groupBy(spark_partition_id().alias("_pid"))
    .applyInPandas(getPdfAnthropic, schema)
)

In [34]:
# Record the wall-clock start. t0 is kept as a struct_time because a later
# cell passes it to time.mktime to compute the elapsed seconds.
t0 = time.localtime()
start_time = time.strftime("%H:%M:%S", t0)
print(start_time)

# Mark df_summary for caching, then display it. show() is the action that
# actually executes the applyInPandas step defined for df_summary.
df_summary.cache()
df_summary.show()

# Record the wall-clock end (same struct_time form as t0).
t1 = time.localtime()
end_time = time.strftime("%H:%M:%S", t1)
print(end_time)



02:31:35


[Stage 73:>                                                         (0 + 1) / 1]

+----------+-------+--------------------+-------------------+--------------+-------------------+--------------------+--------+------+----------+--------+--------------------+
|      asin|overall|          reviewText|       reviewTimeTS|    reviewerID|       reviewerName|             summary|verified|row_id|row_number|asin_key|    anthropicSummary|
+----------+-------+--------------------+-------------------+--------------+-------------------+--------------------+--------+------+----------+--------+--------------------+
|0002005107|    5.0|Thumps DreadfulWa...|2008-07-13 00:00:00| AACMLWNILA6CL|Diana Tixier Herald|         Love Thumps|   false|     1|         1| 2005107|The review is abo...|
|0006280544|    3.0|Mere Chistianity ...|2008-07-13 00:00:00|A1W98Q4MZK80VA|Joseph P. Tevington|Nothing "Mere" Ab...|   false|     2|         1| 6280544|The review provid...|
|0007127898|    5.0|Mabinogion myth m...|2008-07-13 00:00:00|A1VCUE373CMQBY|    Amazon Customer|Let this story bl...|   false

                                                                                

In [36]:
# Elapsed wall-clock time between the two recorded struct_time marks.
# localtime() has one-second resolution, so the result is a whole number.
diff_time = time.mktime(t1) - time.mktime(t0)
print(f"Difference (seconds): {diff_time}")

Difference (seconds): 63.0


In [35]:
# Bring the original review text and its generated summary to the driver
# as a list of Row objects.
df_summary_text = df_summary.select("reviewText", "anthropicSummary").collect()

In [38]:
# Print the generated summary of the second collected row (index 1).
print(df_summary_text[1]["anthropicSummary"])

