# Create review dataset

In [None]:
from time import sleep
import random
import json
from vertexai.preview.language_models import TextGenerationModel

In [None]:
# Each entry is (sentiment label, lowest star rating, highest star rating).
# "positive" appears twice, so random.choice() picks it more often than the others.
sentiments = [
    ("positive", 4, 5),
    ("negative", 1, 3),
    ("neutral", 2, 4),
    ("positive", 4, 5),
]

# Pretrained text generation model whose predict() output becomes the review text.
model = TextGenerationModel.from_pretrained("text-bison")

# Review prompt templates.
# Every template has exactly three positional "{}" placeholders, filled in this
# order: sentiment label, product title, product description.

review_prompt_1 = """<Instructions>
The text below is a product title and description.
You are a customer of Cymbal and you need to write a {} review talking about your experience with the product. Be polite, informal and succinct.
</Instructions>
<ProductDescription>
Product title: {}
Product description: {}
</ProductDescription>
<Output>"""

review_prompt_2 = """The text below is a product title and description for a product you just bought.
Write a {} review for the website talking about your experience with the product. Use one sentence, be polite and formal.
Product title: {}
Product description: {}
output:"""

review_prompt_3 = """Create a {} review for Cymbal retailer website talking about your experience with the product below. Be polite, informal and succinct.
Write the review in one sentence.
Product title: {}
Product description: {}
output:"""

review_prompt_4 = """Write a {} review for a website talking about the product below.
Write the review in one sentence and use a formal tone.
Product title: {}
Product description: {}
output:"""

review_prompt_5 = """Create a short {} review for a website talking about the product below, in one sentence.
You are a long time customer and you already had many interactions with this website.
Product title: {}
Product description: {}
output:"""

review_prompt_6 = """Create a {} review for the product below in one sentence. Be succinct. Use an informal tone.
Product title: {}
Product description: {}
output:"""

review_prompt_7 = """Create a {} review in one sentence for a website talking about the product below. Be honest and use nice words.
Product title: {}
Product description: {}
output:"""

review_prompt_8 = """The text below is related to a product in a retailer website called Cymbal.
Create a {} review for a retail website talking about your experience with the product. Be succinct and use only once sentence.
Remember this is your first buy in this website and you are new to the platform.
Product title: {}
Product description: {}
output:"""

review_prompt_9 = """You are a long time customer of Cymbal retail online store.
Create a {} review of the product below talking about the experience with it, in one sentence.
Product title: {}
Product description: {}
output:"""

review_prompt_10 = """You are a customer who just bought a product. Create a review talking about your {} interaction with the product in one sentence.
Be nice, but honest.
Product title: {}
Product description: {}
output:"""

# This template used to be assigned to `review_prompt_9` as well, so the later
# assignment silently replaced it and it never reached `prompts`. It now has its
# own name; `review_prompt_9` and `review_prompt_10` keep their effective values.
review_prompt_11 = """The text below is related to a product sold by Cymbal retailer website.
In one sentence, create a review in one sentence talking about your {} experience with the product. Use a formal tone.
Title: {}
Description: {}
output:"""

# One review is generated per template for every product.
prompts = [
    review_prompt_1,
    review_prompt_2,
    review_prompt_3,
    review_prompt_4,
    review_prompt_5,
    review_prompt_6,
    review_prompt_7,
    review_prompt_8,
    review_prompt_9,
    review_prompt_10,
    review_prompt_11,
]

In [None]:
# Not populated in this cell; kept because other cells may reference it.
reviews = []

# Product catalog: one JSON document per line.
with open("./recommendation_products.jsonl", "r") as f:
    products = f.readlines()


# Append mode: reviews written by earlier runs of this cell are kept.
with open("./product_reviews.jsonl", "a") as f:
    for i, raw_product in enumerate(products):
        if not raw_product.strip():
            # A blank line (e.g. a trailing newline) would make json.loads raise.
            continue
        print(i)  # progress indicator: index of the product being processed
        product = json.loads(raw_product)
        title = product["title"]
        description = product["description"]
        category = product["categories"]
        product_id = product["id"]  # avoids shadowing the builtin `id`

        # One review per template. The loop variable is deliberately not called
        # `prompt`, so it cannot clobber the dialog prompt defined in a later cell.
        for review_template in prompts:
            # Pick a sentiment, then a star rating inside that sentiment's range.
            sentiment = random.choice(sentiments)
            stars = random.randint(sentiment[1], sentiment[2])
            review = model.predict(
                prompt=review_template.format(
                    sentiment[0],
                    title,
                    description,
                )
            ).text
            payload = {
                "id": product_id,
                "title": title,
                "description": description,
                "category": category,
                "review": review,
                "sentiment": sentiment[0],
                "stars": stars,
            }
            f.write(json.dumps(payload) + "\n")
        # Pause between products (presumably to stay within the API request quota).
        sleep(3)

# Create Dialog dataset

In [None]:
#     - User interactions with support (product, category, dialog, resolved, sentiment score)
#         . For each product, generate 10 support cases

In [None]:
from time import sleep
import random
import json
from vertexai.preview.language_models import TextGenerationModel

# Pretrained "text-bison" text generation model; its predict() output is used as the dialog text.
model = TextGenerationModel.from_pretrained("text-bison")

In [None]:
# Dialog prompt template. Its five positional "{}" placeholders are filled, in
# order, with: what the customer wants, the case outcome, the sentiment label,
# the product title and the product description.
prompt = """Create a simulated conversation (dialog) between a support agent and a customer of Cymbal Furniture online store. Use the product title and description below to help create the conversation.
The conversation must be natural and use informal tone.
The customer wants to {} and the agent must help the customer. The dialog have at least 5 interactions.
At the end of conversation the case {} and the overall sentiment of the customer is {}.
Product title: {}
Product description: {}

Output:"""

# Reasons a customer contacts support; each one is inserted into the prompt
# after "The customer wants to".
customer_wants = [
    "ask simple questions about products or services",
    "get help with troubleshooting",
    "get updates on their order status",
    "track returns",
    "submit complaints or feedback",
    "cancel subscriptions",
    "attach documents or screenshots to help explain their issue",
    "get updates on orders and promotions",
    "download manuals and other documents about the product",
    "request a return of the product",
    "request an exchange of the product",
]

# Possible case outcomes, phrased so they read naturally after "the case".
resolved = [
    "was resolved",
    "was not resolved",
]

# Each entry is (sentiment label, lowest star rating, highest star rating).
# "positive" appears twice, so random.choice() picks it more often than the others.
sentiments = [
    ("positive", 4, 5),
    ("negative", 1, 3),
    ("neutral", 2, 4),
    ("positive", 4, 5),
]

In [None]:
# Product catalog: one JSON document per line.
with open("./recommendation_products.jsonl", "r") as f:
    products = f.readlines()


# Append mode: conversations written by earlier runs of this cell are kept.
with open("./service_conversations.jsonl", "a") as f:
    # NOTE(review): only the first 10 products are processed — confirm whether
    # this slice is intentional or a leftover from a trial run.
    for count, raw_product in enumerate(products[:10]):
        if not raw_product.strip():
            # A blank line (e.g. a trailing newline) would make json.loads raise.
            continue
        print(count)  # progress indicator: index of the product being processed
        product = json.loads(raw_product)
        title = product["title"]
        description = product["description"]
        category = product["categories"]
        product_id = product["id"]  # avoids shadowing the builtin `id`

        # One dialog per (customer intent, outcome) pair. The loop variables are
        # deliberately not called `resolved`, so the module-level `resolved` list
        # is not overwritten by a string.
        for customer_want in customer_wants:
            for outcome_phrase, status in (
                ("was resolved", "resolved"),
                ("was not resolved", "not resolved"),
            ):
                # Pick a sentiment, then a star rating inside that sentiment's range.
                sentiment = random.choice(sentiments)
                stars = random.randint(sentiment[1], sentiment[2])
                dialog = model.predict(
                    prompt=prompt.format(
                        customer_want,
                        outcome_phrase,
                        sentiment[0],
                        title,
                        description,
                    ),
                    max_output_tokens=1024,
                ).text
                payload = {
                    "id": product_id,
                    "title": title,
                    "description": description,
                    "category": category,
                    "dialog": dialog,
                    "sentiment": sentiment[0],
                    "stars": stars,
                    # Store the requested outcome so consumers do not have to
                    # infer it from the position of the line in the file.
                    "status": status,
                }
                f.write(json.dumps(payload) + "\n")
        # Pause between products (presumably to stay within the API request quota).
        sleep(6)

# Final Format Conversation Dataset

In [None]:
# male_names = [
# "James",
# "Robert",
# "John",
# "Michael",
# "David",
# "William",
# "Richard",
# "Joseph",
# "Thomas",
# "Christopher",
# "Charles",
# "Daniel",
# "Matthew",
# "Anthony",
# "Mark",
# "Donald",
# "Steven",
# "Andrew",
# "Paul",
# "Joshua"]

# # Default female names to be included in the email copy.
# # We didn't include these in the BigQuery table so as not to expose any PII.
# # You can change these names if you want.
# female_names = [
# "Mary",
# "Patricia",
# "Jennifer",
# "Linda",
# "Elizabeth",
# "Barbara",
# "Susan",
# "Jessica",
# "Sarah",
# "Karen",
# "Lisa",
# "Nancy",
# "Betty",
# "Sandra",
# "Margaret",
# "Ashley",
# "Kimberly",
# "Emily",
# "Carol",
# "Michelle"]

# import json

# with open("agent_list.json", "a") as f:
#     for i, name in enumerate(female_names+male_names):
#         agent = json.dumps({
#             "agent_id": i,
#             "agent_name": name,
#             "agent_email": name + "@customerservices.agents"
#         })
#         f.write(agent + "\n")

In [1]:
import random
import json

# Size of the index space and the number of agent buckets that share it.
total_numbers = 19250
num_buckets = 40

# Every integer in [0, total_numbers) exactly once.
numbers = list(range(total_numbers))

# One initially empty list per bucket.
agent_buckets = [[] for _ in range(num_buckets)]

# Walk the indices in random order and drop each one into a randomly chosen
# bucket, so every index lands in exactly one bucket and bucket sizes vary.
shuffled_numbers = random.sample(numbers, len(numbers))
for number in shuffled_numbers:
    random.choice(agent_buckets).append(number)

In [2]:
# Size of the index space and the number of customer buckets that share it.
total_numbers = 19250
num_buckets = 2000

# Every integer in [0, total_numbers) exactly once.
numbers = list(range(total_numbers))

# One initially empty list per bucket.
customers_buckets = [[] for _ in range(num_buckets)]

# Walk the indices in random order and drop each one into a randomly chosen
# bucket, so every index lands in exactly one bucket and bucket sizes vary.
shuffled_numbers = random.sample(numbers, len(numbers))
for number in shuffled_numbers:
    random.choice(customers_buckets).append(number)

In [3]:
import json 

def _read_jsonl(path):
    """Return the JSON documents stored one per line in *path*, skipping blank lines."""
    with open(path, "r") as f:
        return [json.loads(line) for line in f if line.strip()]


# Customers: the id is simply the position of the record in the file.
customer_list = _read_jsonl("customer_list.json")
for i, customer in enumerate(customer_list):
    customer["customer_id"] = i

# Conversations: records without a stored "status" fall back to the position in
# the file — even index = resolved, odd index = not resolved. This assumes the
# file alternates the two outcomes starting with a resolved one. A status that
# is already present in the record is kept as-is.
conversations = _read_jsonl("service_conversations.jsonl")
for i, conversation in enumerate(conversations):
    conversation.setdefault("status", "resolved" if i % 2 == 0 else "not resolved")

# Support agents.
agents = _read_jsonl("agent_list.json")

In [4]:
# Assign customers to conversations
for customer_id, bucket in enumerate(customers_buckets):
    for element in bucket:
        conversations[element]["customer_id"] = customer_list[customer_id]["customer_id"]
        conversations[element]["customer_email"] = customer_list[customer_id]["email"]

In [5]:
# Assign agents to conversation
for agent_id, bucket in enumerate(agent_buckets):
    for element in bucket:
        conversations[element]["agent_id"] = agents[agent_id]["agent_id"]
        conversations[element]["agent_email"] = agents[agent_id]["agent_email"]

In [6]:
# Display the first conversation record to inspect its fields.
conversations[0]

{'id': '1000',
 'title': "Women's White Cotton Robe - Soft, Absorbent, and Comfortable",
 'description': "Our women's white cotton robe is the perfect way to relax and unwind after a long day. Made from 100% cotton, this robe is soft, absorbent, and comfortable. It features a shawl collar, two patch pockets, and a self-tie belt. This robe is available in sizes S-XXL.\n\nHere are some of the benefits of our women's white cotton robe:\n\n* Made from 100% cotton for softness and absorbency\n* Shawl collar for added warmth\n* Two patch pockets for storing essentials\n* Self-tie belt for a secure fit\n* Available in sizes S-XXL\n\nOrder your women's white cotton robe today and experience the ultimate in comfort and relaxation!",
 'category': 'Bath Robe',
 'dialog': " **Customer**: Hello! I'm interested in purchasing a white cotton robe from your store. Can you tell me more about it?\n\n**Agent**: Absolutely! Our women's white cotton robe is made from 100% cotton, making it incredibly soft a

In [7]:
# Not populated in this cell; kept because other cells may reference it.
final_dataset = []

# Rename the raw keys in place: id -> product_id, dialog -> conversation,
# stars -> rating (stored as a string).
for conversation in conversations:
    # Remove the three old keys first so the new keys are appended in the
    # order product_id, conversation, rating.
    product_id = conversation.pop("id")
    dialog_text = conversation.pop("dialog")
    rating = conversation.pop("stars")

    # Keep only the text before the first "**Overall sentiment" marker.
    # partition() returns the whole text when the marker is absent; slicing with
    # the -1 that str.find() returns in that case would drop the last character.
    dialog_text = dialog_text.partition("**Overall sentiment")[0]

    conversation["product_id"] = product_id
    conversation["conversation"] = dialog_text
    conversation["rating"] = str(rating)

In [14]:
# Display the record at index 9 to inspect the renamed fields.
conversations[9]

{'title': "Women's White Cotton Robe - Soft, Absorbent, and Comfortable",
 'description': "Our women's white cotton robe is the perfect way to relax and unwind after a long day. Made from 100% cotton, this robe is soft, absorbent, and comfortable. It features a shawl collar, two patch pockets, and a self-tie belt. This robe is available in sizes S-XXL.\n\nHere are some of the benefits of our women's white cotton robe:\n\n* Made from 100% cotton for softness and absorbency\n* Shawl collar for added warmth\n* Two patch pockets for storing essentials\n* Self-tie belt for a secure fit\n* Available in sizes S-XXL\n\nOrder your women's white cotton robe today and experience the ultimate in comfort and relaxation!",
 'category': 'Bath Robe',
 'sentiment': 'positive',
 'status': 'not resolved',
 'customer_id': 1418,
 'customer_email': 'user22561@sample_user22561.sample',
 'agent_id': 37,
 'agent_email': 'Andrew@customerservices.agents',
 'product_id': '1000',
 'conversation': " **Customer**: H

Fields:
 - customer_id
 - customer_email
 - agent_id
 - agent_email
 - product_id
 - product_title
 - product_description
 - product_category
 - conversation
 - resolved
 - sentiment
 - rating

In [15]:
# Append one line per conversation. Each line carries the record index as a
# string id, the whole conversation record serialized as a JSON string, and a
# reference to the HTML file named after that index.
with open("full_conversations.jsonl", "a") as f:
    for index, record in enumerate(conversations):
        html_reference = {
            "mimeType": "text/html",
            "uri": f"gs://csm-dataset/p5-dataset/html_conversations/{index}.html",
        }
        entry = {
            "id": str(index),
            "jsonData": json.dumps(record),
            "content": html_reference,
        }
        f.write(json.dumps(entry) + "\n")

In [19]:
# HTML page template for a single conversation. It has six positional "{}"
# placeholders: breadcrumb entry, card title, conversation text, sentiment,
# status and rating (in that order).
html_template = """<html><head><meta http-equiv="Content-Type" content="text/html; charset=windows-1252"></head><body><div class="row mt-5">
	<nav aria-label="breadcrumb">
		<ol class="breadcrumb">
			<li class="breadcrumb-item">Product Category</li>
			<li class="breadcrumb-item active" aria-current="page">{}</li>
		</ol>
	</nav>
	<div class="card mb-12">
		<div class="row g-0">
			<div class="col-md-8">
				<div class="card-body">
					<h3 class="card-title">{}</h3>
                    <p class="card-text"><small>Conversation</small></p>
                    <p class="card-text"><small>{}</small></p>
                    <p class="card-text"><small>Sentiment: {}</small></p>
                    <p class="card-text"><small>Status: {}</small></p>
					<p class="card-text"><small>Rating: {}</small></p>
				</div>
			</div>
		</div>
	</div>
</div> 
</body></html>"""

In [20]:
import os

# open(..., "w") does not create missing parent directories, so make sure the
# output directory exists before writing the first file.
os.makedirs("./dataset/html", exist_ok=True)

# Render one HTML file per conversation, named after the conversation's index.
for i, conversation in enumerate(conversations):
    # NOTE(review): the values are inserted without HTML escaping — confirm the
    # generated text cannot contain markup characters such as "<" or "&".
    content = html_template.format(
        conversation["category"],
        conversation["title"],
        conversation["conversation"],
        conversation["sentiment"],
        conversation["status"],
        conversation["rating"],
    )
    with open(f"./dataset/html/{i}.html", "w") as f:
        f.write(content)