
# Synthetic Data Generation using NVIDIA's APIs
<table>
  <td style="text-align: center">
    <a href="https://colab.research.google.com/drive/1Qc9ehY8ykmZPmxP0JH4HHmGkUTG6oMax?usp=sharing">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
</table>


In [2]:
!pip install openai langchain
!pip install --upgrade --quiet langchain-nvidia-ai-endpoints
!pip install langchain_experimental
!pip install langchain_nvidia_ai_endpoints

Collecting openai
  Downloading openai-1.51.0-py3-none-any.whl.metadata (24 kB)
Collecting langchain
  Downloading langchain-0.3.2-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.6 kB)
Collecting langchain-core<0.4.0,>=0.3.8 (from langchain)
  Downloading langchain_core-0.3.9-py3-none-any.whl.metadata (6.3 kB)
Collecting langchain-text-splitters<0.4.0,>=0.3.0 (from langchain)
  Downloading langchain_text_splitters-0.3.0-py3-none-any.whl.metadata (2.3 kB)
Collecting langsmith<0.2.0,>=0.1.17 (from langchain)
  Downloading langsmith-0.1.131-py3-none-any.whl.metadata (13 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain)
  Downloading tenacity-8.5.0-py3-none-any.whl.metadata (1.2 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading

In [3]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

In [2]:
# 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
# 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

**Use Nvidia API Keys provided you in mail or you can create by logging into** [Build Nvidia](https://build.nvidia.com/)

In [1]:
import getpass
import os

if not os.getenv("NVIDIA_API_KEY"):
    # Note: the API key should start with "nvapi-"
    os.environ["NVIDIA_API_KEY"] = getpass.getpass("Enter your NVIDIA API key: ")

Enter your NVIDIA API key: ··········


you can download csv files by this [link](https://drive.google.com/file/d/19eTFRj2ctWYOmdYHuC7h7qlBBDYqSVVM/view) , upload into your drive and connect with colab.

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# **Data Cleaning and Preprocessing**

In [3]:
import pandas as pd
dataset = pd.read_csv('/content/drive/MyDrive/product_asin.csv',encoding="ISO-8859-1",on_bad_lines='skip')
dataset.head()

Unnamed: 0,X,title,parent_asin,categories,cat1,cat2,cat3,cat4,cat5,cat6
0,1,Allegra Allergy 45ct + 15 Free,B00JENH5OI,['Health & Household','Health Care','Over-the-Counter Medication','Allergy,Sinus & Asthma','Allergy Medicine'],
1,2,InvoSpa Shiatsu Back Shoulder and Neck Massage...,B0C4L5Y711,['Health & Household','Wellness & Relaxation','Massage Tools & Equipment','Electric Massagers','Back Massagers'],,
2,4,"Kal 100 Mcg Selenium Yeast Free Tablets, 100 C...",B00020HX5S,['Health & Household','Vitamins,Minerals & Supplements','Vitamins','Multivitamins'],,
3,5,Rocky Mountain Oils Cinnamon Bark Essential Oi...,B07K363N3S,['Health & Household','Health Care','Alternative Medicine','Aromatherapy'],,,
4,12,"Prevail Super Absorbent Underpads, Prevail Sup...",B00ACMDOOA,['Health & Household','Health Care','Incontinence & Ostomy','Protective Briefs & Underwear'],,,


In [4]:
dataset.dtypes

Unnamed: 0,0
X,int64
title,object
parent_asin,object
categories,object
cat1,object
cat2,object
cat3,object
cat4,object
cat5,object
cat6,object


In [5]:
dataset['categories'] = dataset['categories'].str.replace(r"[\[\]']", "", regex=True)
dataset['categories']

Unnamed: 0,categories
0,Health & Household
1,Health & Household
2,Health & Household
3,Health & Household
4,Health & Household
...,...
482980,Health & Household
482981,Health & Household
482982,Health & Household
482983,Health & Household


In [6]:
dataset['cat1'] = dataset['cat1'].str.replace(r"[\[\]']", "", regex=True)
dataset['cat1']

Unnamed: 0,cat1
0,Health Care
1,Wellness & Relaxation
2,Vitamins
3,Health Care
4,Health Care
...,...
482980,Wellness & Relaxation
482981,Diet & Sports Nutrition
482982,Medical Supplies & Equipment
482983,Medical Supplies & Equipment


In [7]:
dataset.tail(30)

Unnamed: 0,X,title,parent_asin,categories,cat1,cat2,cat3,cat4,cat5,cat6
482955,797513,"Dynarex Gauze Sponge 2""x 2"" 12 Ply. Pack of 200",B00QH9WGWS,Health & Household,Health Care,'First Aid','Bandages & Bandaging Supplies','Gauze & Pads','Gauze'],
482956,797514,Best Naturals MSM Powder 4 OZ,B01CO8SLLA,Health & Household,Vitamins,Minerals & Supplements','MSM'],,,
482957,797515,"Max Load - 2 caps,(M.D. Science Lab)",B001QZ94S2,Health & Household,Vitamins,Minerals & Supplements','Herbal Supplements'],,,
482958,797516,ULCERx 10-15 mmHg Underliner Size: Large Long ...,B004LNHJBK,Health & Household,Medical Supplies & Equipment,'Braces,Splints & Supports'],,,
482959,797517,"KuToo Replacement for Gear S3 Band, 22mm Gear ...",B079868YZW,Health & Household,Heart Health Event,,,,,
482960,797518,"Hopgo Ankle Brace for Women Men, Foot Sleeve A...",B08JQ36YZM,Health & Household,Medical Supplies & Equipment,'Braces,Splints & Supports','Leg & Foot Supports','Foot Supports'],
482961,797521,TPS Professional Deep Tissue Massage Gun Relax...,B0881YY8YW,Health & Household,Wellness & Relaxation,'Massage Tools & Equipment','Electric Massagers','Back Massagers'],,
482962,797522,Inbody Dial H20b Body Fat Composition Analyzer...,B017B1TDOO,Health & Household,Wellness & Relaxation,'Fitness & Activity Monitors'],,,,
482963,797526,Awkward Styles funny cat face masks washable r...,B08FJ2P55L,Health & Household,Medical Supplies & Equipment,'Cloth Face Masks & Accessories','Cloth Face Masks'],,,
482964,797527,"LG G5 Case with Free Screen Protector,Funyye S...",B01M304JPQ,Health & Household,Medical Supplies & Equipment,'Braces,Splints & Supports','Back,Neck & Shoulder Supports','Back Braces']


In [8]:
dataset = dataset.drop('cat6', axis=1)


In [9]:
dataset['cat2'] = dataset['cat2'].str.replace(r"[\[\]']", "", regex=True)
dataset['cat3'] = dataset['cat3'].str.replace(r"[\[\]']", "", regex=True)
dataset['cat4'] = dataset['cat4'].str.replace(r"[\[\]']", "", regex=True)
dataset['cat5'] = dataset['cat5'].str.replace(r"[\[\]']", "", regex=True)
dataset.head(10)


Unnamed: 0,X,title,parent_asin,categories,cat1,cat2,cat3,cat4,cat5
0,1,Allegra Allergy 45ct + 15 Free,B00JENH5OI,Health & Household,Health Care,Over-the-Counter Medication,Allergy,Sinus & Asthma,Allergy Medicine
1,2,InvoSpa Shiatsu Back Shoulder and Neck Massage...,B0C4L5Y711,Health & Household,Wellness & Relaxation,Massage Tools & Equipment,Electric Massagers,Back Massagers,
2,4,"Kal 100 Mcg Selenium Yeast Free Tablets, 100 C...",B00020HX5S,Health & Household,Vitamins,Minerals & Supplements,Vitamins,Multivitamins,
3,5,Rocky Mountain Oils Cinnamon Bark Essential Oi...,B07K363N3S,Health & Household,Health Care,Alternative Medicine,Aromatherapy,,
4,12,"Prevail Super Absorbent Underpads, Prevail Sup...",B00ACMDOOA,Health & Household,Health Care,Incontinence & Ostomy,Protective Briefs & Underwear,,
5,15,Life Extension Bio-Curcumin Elite 400 mg 60 Ve...,B01LYS06EF,Health & Household,Vitamins,Minerals & Supplements,Herbal Supplements,Curcumin,
6,18,Contour Flip Pillow - 10-in-1 Rest Positions W...,B0C5JWD7FR,Health & Household,Medical Supplies & Equipment,Mobility & Daily Living Aids,Bedroom Aids & Accessories,Contoured Support Pillows,
7,20,DAWGS Women's Loudmouth Patterns Z Sandals | L...,B00VMGV2DK,Health & Household,Health Care,Foot Health,Inserts & Insoles,Insoles,
8,21,Optimum Nutrition Amino Energy Naturally Flavo...,B074K5KR9K,Health & Household,Diet & Sports Nutrition,Endurance & Energy,Powders,,
9,23,Analgesic Balm Counterpain Methyl Salicylate M...,B009NBVZGK,Health & Household,Health Care,Over-the-Counter Medication,Pain Relievers,,


In [10]:
dataset = dataset.drop('X', axis=1)


In [11]:
reviews = pd.read_csv('/content/drive/MyDrive/reviews_supplements.csv',encoding="ISO-8859-1",on_bad_lines='skip')
reviews.head()

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase,date,time
0,4,B Complex in gel cap form,I bought this along with Vit C in gel cap form...,B00012ND5G,B00012ND5G,AGDVFFLJWAQ3ULNNKF4LXID2RVSQ,12/11/2009 0:37,1,True,12/11/2009,0:37
1,5,Five Stars,great product,B00013Z0ZQ,B00013Z0ZQ,AG3BSKXHDGP6E3EGQD2SXCK6KFQQ,1/4/2015 3:11,0,True,1/4/2015,3:11
2,5,Five Stars,Came as expectedly,B00013Z0ZQ,B00013Z0ZQ,AHG2WKFD4LXPC46WWC6JMQGX52JA,9/27/2015 19:15,0,True,9/27/2015,19:15
3,5,Vitamin Shoppe Dry Vitamin A,Excellent Product ..... Fast Delivery ....... ...,B00013Z1KA,B00013Z1KA,AEOF7RT3AC4ACRX5HGIP2V3BNIHA,33:16.9,0,True,2/9/2019,19:33
4,5,Un producto que compro regularmente,Es muy buena vitamina,B00013Z1KA,B00013Z1KA,AGW2WETWQRL2PKUGTL2LU7IJ2BPQ,11:10.9,0,True,7/25/2022,14:11


In [12]:
reviews.dtypes

Unnamed: 0,0
rating,int64
title,object
text,object
asin,object
parent_asin,object
user_id,object
timestamp,object
helpful_vote,int64
verified_purchase,bool
date,object


In [13]:
reviews = reviews.drop('timestamp', axis=1)


In [14]:
reviews['date'] = pd.to_datetime(reviews['date'],format= '%m/%d/%Y')
reviews['time'] = pd.to_datetime(reviews['time'],format= '%H:%M').dt.time

In [15]:
reviews.tail(20)

Unnamed: 0,rating,title,text,asin,parent_asin,user_id,helpful_vote,verified_purchase,date,time
16651,4,GOOD BUT NOT PURE VITAMIN E GEL,"This was all on me, I didnâ??t read the descri...",B0009VO8EO,B0C7RQ35K7,AHFCZCKJBBX2ML64VPKEQSGVWCXQ,0,True,2021-12-10,16:38:00
16652,5,Good product,I use this on my hairless dogs to moisturize t...,B0009VO8EO,B0C7RQ35K7,AHS4RPSZFVJRUTZJCLAD5VH6EODA,0,True,2021-12-29,19:17:00
16653,5,Vitamin E gel,This is used by my husband as a shaving gel. ...,B0009VO8EO,B0C7RQ35K7,AH3Y77XAJ3LK4KMQGNKY6YVSMVYA,0,True,2022-01-05,01:08:00
16654,5,Husband loves it,He uses it as moisturizer for face. Could buy ...,B0009VO8EO,B0C7RQ35K7,AGCQT2ESKOMG3DQUL6BRFSWCTVCA,0,True,2022-02-17,18:52:00
16655,4,"Safe, highly recommended product....",My wife just had a lumpectomy to remove 2 smal...,B0009VO8EO,B0C7RQ35K7,AHT6IDM2XPK35WDNX3H2IHR5TAFA,1,True,2022-02-20,13:00:00
16656,5,The best natural moisturizer,I have been using fruit of the earth vitamin E...,B0009VO8EO,B0C7RQ35K7,AF4VYMCW3WKJEOOOKZLOROB3MQWA,2,True,2022-03-03,06:13:00
16657,3,Sticky,Just ok would not buy again stick no moisture ...,B0009VO8EO,B0C7RQ35K7,AGA3PQKDGXZQB7D3XXO74OO3GM7A,0,True,2022-03-14,06:00:00
16658,5,Fruit of the Earth Vitamin-E gel has great soo...,I love the value for the money.,B01BLPGBRI,B0C7RQ35K7,AHMH6OFLVQPTRP6ZXK6E2CWZ55UQ,0,True,2022-03-15,20:56:00
16659,1,Broke my face out.,Didnâ??t care for this product. Beware if you ...,B0009VO8EO,B0C7RQ35K7,AEOFUDYBYBAVP5PYFXWDVBYT3AKA,0,True,2022-03-31,18:09:00
16660,5,Great service,Nothing,B0009VO8EO,B0C7RQ35K7,AFHNGSSWZ5YF4VKGYTD6AHCWOJSA,0,True,2022-05-30,06:08:00


In [16]:
df = pd.merge(dataset, reviews, on='parent_asin', how='inner')
df.head(5)

Unnamed: 0,title_x,parent_asin,categories,cat1,cat2,cat3,cat4,cat5,rating,title_y,text,asin,user_id,helpful_vote,verified_purchase,date,time
0,Bariatric Fusion Bariatric Multivitamin Soft C...,B0BKC2WYWB,Health & Household,Heart Health Event,,,,,5,Good vitamins,"Is a bariatric patient, my vitamins are really...",B07JH63HWS,AF2HY3SCRK45T2ATV7FKRYUJCCTA,0,True,2019-12-12,12:43:00
1,Bariatric Fusion Bariatric Multivitamin Soft C...,B0BKC2WYWB,Health & Household,Heart Health Event,,,,,1,Taste terrible!,Not only did it taste terrible with a horrible...,B07JGTMK2L,AEW6J6HI2GG7WWJSS77H6LXW3MYA,0,True,2020-04-02,02:18:00
2,Bariatric Fusion Bariatric Multivitamin Soft C...,B0BKC2WYWB,Health & Household,Heart Health Event,,,,,5,Good alternative to pills,"Good flavor and no artificial sweeteners, whic...",B07JGFHQ5P,AEOSYG7MLFUOIQXBSJ7UBZZBOTWQ,0,True,2020-08-22,03:47:00
3,Bariatric Fusion Bariatric Multivitamin Soft C...,B0BKC2WYWB,Health & Household,Heart Health Event,,,,,5,Certainly a good purchase,so Bariatric Fusion has multiple flavors and q...,B08HG44W98,AGPAXVLELK72U4USTT4LJ2JSG47Q,4,True,2020-10-17,21:21:00
4,Bariatric Fusion Bariatric Multivitamin Soft C...,B0BKC2WYWB,Health & Household,Heart Health Event,,,,,3,Not tasty at all,This looked like one of the best bariatric vit...,B07JH63HWS,AH73KJX7VIFLXKRLC6TLB7F3Z2VQ,0,True,2020-12-31,23:52:00


In [17]:
len(df)

16663

In [18]:
df.to_csv('amazon_product_reviews.csv', index=False)

**Converting my dataframe to list of dictionaries for Few Shot Prompt template**

In [19]:
df = df.head(20)

In [20]:
# Converting the DataFrame to a list of dictionaries
examples = df.to_dict(orient='records')

In [21]:
examples[0]

{'title_x': 'Bariatric Fusion Bariatric Multivitamin Soft Chew | Tropical Fruit Flavor | Chewy for Post Bariatric Surgery Patients Including Gastric Bypass and Sleeve Gastrectomy | 60 Count | 1 Month Supply',
 'parent_asin': 'B0BKC2WYWB',
 'categories': 'Health & Household',
 'cat1': ' Heart Health Event',
 'cat2': nan,
 'cat3': nan,
 'cat4': nan,
 'cat5': nan,
 'rating': 5,
 'title_y': 'Good vitamins',
 'text': 'Is a bariatric patient, my vitamins are really important!! These Taste pretty good and seems to work.',
 'asin': 'B07JH63HWS',
 'user_id': 'AF2HY3SCRK45T2ATV7FKRYUJCCTA',
 'helpful_vote': 0,
 'verified_purchase': True,
 'date': Timestamp('2019-12-12 00:00:00'),
 'time': datetime.time(12, 43)}

# **Creating Prompt Template**

In [22]:
template_str = """
Product Review:
- Product Title: {title_x}
- Parent ASIN: {parent_asin}
- Categories: {categories}
- Sub-Category 1: {cat1}
- Sub-Category 2: {cat2}
- Sub-Category 3: {cat3}
- Sub-Category 4: {cat4}
- Sub-Category 5: {cat5}
- Rating: {rating}/5
- Review Title: {title_y}
- Review Text: "{text}"
- Product ASIN: {asin}
- User ID: {user_id}
- Helpful Votes: {helpful_vote}
- Verified Purchase: {verified_purchase}
- Review Date: {date}
- Review Time: {time}
"""


In [25]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.prompts import PromptTemplate, FewShotPromptTemplate
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator
from langchain.output_parsers import PydanticOutputParser
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX
from pydantic import BaseModel

In [26]:
# Template for the synthetic data example
SYNTHETIC_DATA_EXAMPLE = PromptTemplate(
    input_variables=["title_x", "parent_asin", "categories", "cat1", "cat2", "cat3", "cat4", "cat5", "rating", "title_y", "text", "asin", "user_id", "helpful_vote", "verified_purchase", "date", "time"],
    template=template_str
)

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject","extra"],
    example_prompt=SYNTHETIC_DATA_EXAMPLE,
)

In [27]:

from os import strerror
class AmazonProductReviews(BaseModel):
    patient_id: int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: str
    insurance_claim_amount: str
    title_x: str
    parent_asin: str
    categories: str
    cat1: str
    cat2: str
    cat3: str
    cat4: str
    cat5: str
    rating: int
    title_y: str
    text: str
    asin: str
    user_id: str
    helpful_vote: int
    verified_purchase: bool
    date: str
    time: str

# **Creating Data Generator**

In [28]:
synthetic_data_generator = create_openai_data_generator(
    output_schema=AmazonProductReviews,
    llm=ChatNVIDIA(model="nvidia/nemotron-4-340b-instruct",temperature=0.2),
    prompt=prompt_template
)

In [29]:
synthetic_results = synthetic_data_generator.generate(
    subject="amazon_product_reviews",
    extra = "the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=10,
)

Exception: [500] Internal Server Error
Inference error
RequestID: c86aaca0-6264-4ed8-9df4-614a34f5ae05