In [1]:
import pandas as pd
import numpy as np
from dotenv import load_dotenv
import os
import tiktoken
import openai

# Copy variables from a local .env file (if present) into the process environment.
load_dotenv()
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
HUGGINGFACEHUB_API_TOKEN = os.getenv('HUGGINGFACEHUB_API_TOKEN')

if OPENAI_API_KEY is not None:
    # `openai` is imported before load_dotenv() runs, so a key that only lives
    # in .env was not in the environment at import time. Hand it to the client
    # explicitly; chat_completion_request reads `openai.api_key` directly.
    openai.api_key = OPENAI_API_KEY
    print ("OPENAI_API_KEY is ready")
else:
    print ("OPENAI_API_KEY environment variable not found")

OPENAI_API_KEY is ready


In [2]:
def extract_brand_name(string):
    """Reduce a brand byline to the bare brand name.

    Two layouts are recognised: ``"Brand: <name>"`` and
    ``"Visit the <name> Store"``. When both markers are present the
    ``"Brand: "`` form wins.

    Args:
        string: The raw cell value. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is passed through untouched.

    Returns:
        The text following the marker, stripped of surrounding whitespace and
        of a trailing ``Store`` word. Values without a marker are returned
        unchanged.
    """
    if not isinstance(string, str):
        return string

    if "Brand: " in string:
        marker = "Brand: "
    elif "Visit the " in string:
        marker = "Visit the "
    else:
        return string

    # The marker is known to be present, so index 1 always exists.
    brand_name = string.split(marker)[1].strip()

    # Only the trailing boilerplate word is removed. A blanket
    # replace("Store", "") would also mangle names that merely contain
    # "Store" (e.g. "Storey Gardens").
    if brand_name == "Store" or brand_name.endswith(" Store"):
        brand_name = brand_name[: -len("Store")].rstrip()
    return brand_name


In [46]:
def read_data(folder_path):
    """Load every ``asin*`` CSV file in ``folder_path`` into one DataFrame.

    Files are read in sorted name order so the row order is reproducible, and
    the result gets a fresh 0..n-1 index. Without that, every file would
    contribute its own 0-based labels, and per-row access such as
    ``df[col][i]`` or ``df.loc[i, col]`` would hit several rows at once.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The concatenated rows of all matching files, or an empty DataFrame
        when no file name starts with ``"asin"``.
    """
    file_names = sorted(
        name for name in os.listdir(folder_path) if name.startswith("asin")
    )
    # Read everything first and concatenate once; growing a frame inside the
    # loop re-copies all previously loaded rows on every iteration.
    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in file_names]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [53]:
# Combine the raw "asin*" CSV files from this folder into one frame.
products = read_data(
    "/Users/vladbordei/Documents/Development/ProductExplorer/data/raw/RaisedGardenBed"
)
# Reduce the brand column to the bare brand name.
products['product_information.brand'] = (
    products['product_information.brand'].apply(extract_brand_name)
)

In [54]:
# Destination of the cleaned product table.
products_path = "/Users/vladbordei/Documents/Development/ProductExplorer/data/interim/products.csv"
# products_path = "./data/interim/products.csv"
# The variable used to be assigned as `product_path` while `products_path`
# was the name actually passed to to_csv (NameError on a fresh kernel).
# The old name is kept as an alias in case something else refers to it.
product_path = products_path
products.to_csv(products_path, index=False)



In [60]:
# Work on an independent copy so later cells that add or rename columns on
# `product` leave `products` untouched.
product = products.copy(deep=True)

In [43]:
# asin_list_path = './data/external/asin_list.csv'
asin_list_path = '/Users/vladbordei/Documents/Development/ProductExplorer/data/external/asin_list.csv'
# Read the CSV and keep only its 'asin' column as a plain Python list.
asin_list = (
    pd.read_csv(asin_list_path)
    .loc[:, 'asin']
    .tolist()
)

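Reference for the function-calling helpers below (OpenAI Cookbook, "How to call functions with chat models"): https://github.com/openai/openai-cookbook/blob/main/examples/How_to_call_functions_with_chat_models.ipynb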

In [None]:
import json
import openai
import requests
from tenacity import retry, wait_random_exponential, stop_after_attempt
from termcolor import colored

# Default model name for chat_completion_request when the caller passes none.
GPT_MODEL = "gpt-3.5-turbo-0613"

In [None]:
@retry(
    wait=wait_random_exponential(min=1, max=40),
    stop=stop_after_attempt(3),
    reraise=True,
)
def _post_chat_completion(headers, json_data, timeout):
    """POST one chat-completion payload, letting transport errors propagate so they are retried."""
    return requests.post(
        "https://api.openai.com/v1/chat/completions",
        headers=headers,
        json=json_data,
        timeout=timeout,
    )


def chat_completion_request(messages, functions=None, function_call=None, model=GPT_MODEL, timeout=60):
    """Send a chat-completion request to the OpenAI REST endpoint.

    The retry policy now wraps only the HTTP call. Previously the decorator
    sat on this function, which catches every exception itself, so a failed
    request was never retried.

    Args:
        messages: List of chat messages (dicts with ``role`` / ``content``).
        functions: Optional list of function schemas to expose to the model.
        function_call: Optional function-call directive forwarded as-is.
        model: Model name; defaults to ``GPT_MODEL``.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled connection cannot hang the notebook. Pass ``None`` to
            wait indefinitely (the previous behaviour).

    Returns:
        The ``requests.Response`` on success (the HTTP status is not checked).
        If the request still fails after three attempts, the last exception
        is printed and returned rather than raised.

    Raises:
        TypeError: If ``openai.api_key`` is not set, because the header is
            built outside the ``try`` block.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + openai.api_key,
    }
    json_data = {"model": model, "messages": messages}
    if functions is not None:
        json_data["functions"] = functions
    if function_call is not None:
        json_data["function_call"] = function_call
    try:
        return _post_chat_completion(headers, json_data, timeout)
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        return e

In [None]:
def pretty_print_conversation(messages):
    """Print a chat transcript, colouring each message by its role.

    Args:
        messages: Iterable of message dicts. Each needs a ``role``;
            ``system``/``user``/``function`` messages need ``content``
            (``function`` also ``name``). ``assistant`` messages are rendered
            from ``function_call`` when it is present and truthy, otherwise
            from ``content``.

    Messages with any other role are skipped. The colour is taken from the
    message being printed. The earlier version looked the colour up by the
    position of the formatted text, which went out of step with ``messages``
    as soon as one message was skipped.
    """
    role_to_color = {
        "system": "red",
        "user": "green",
        "assistant": "blue",
        "function": "magenta",
    }
    for message in messages:
        role = message["role"]
        if role == "system":
            formatted_message = f"system: {message['content']}\n"
        elif role == "user":
            formatted_message = f"user: {message['content']}\n"
        elif role == "assistant":
            if message.get("function_call"):
                formatted_message = f"assistant: {message['function_call']}\n"
            else:
                formatted_message = f"assistant: {message['content']}\n"
        elif role == "function":
            formatted_message = f"function ({message['name']}): {message['content']}\n"
        else:
            # Unknown roles have no colour and were never printed.
            continue
        print(colored(formatted_message, role_to_color[role]))

In [None]:
# (parameter name, description) pairs for `describe_product`, in declaration
# order. Every parameter is a required string.
_DESCRIBE_PRODUCT_FIELDS = [
    ("Product Summary", "A brief summary of the product in 100 words"),
    ("What is in the box", "Contents of the product package"),
    ("Technical Facts", "Technical details about the product"),
    ("Features", "Features of the product"),
    ("How the product is used", "Instructions on how to use the product"),
    ("Where the product is used", "Suggested locations or situations where the product can be used"),
    ("User Description", "Description of the ideal user for the product"),
    ("Packaging", "Description of the product's packaging"),
    ("Season", "Season or time of year when the product is typically used"),
]

# Function schema list handed to the chat API via its `functions` argument.
functions = [
    {
        "name": "describe_product",
        "description": "Provide a detailed description of a product",
        "parameters": {
            "type": "object",
            "properties": {
                field_name: {"type": "string", "description": field_text}
                for field_name, field_text in _DESCRIBE_PRODUCT_FIELDS
            },
            "required": [field_name for field_name, _field_text in _DESCRIBE_PRODUCT_FIELDS],
        },
    }
]


In [None]:
# Variant of the extraction loop that forces the model to answer through the
# `describe_product` function schema.
# NOTE: uses `User_Prompt_1` / `AI_Prompt_1`, which are assigned in a later
# cell of this notebook; that cell has to be run first.
chatbot_responses = dict()

for i in product.index:
    asin = product['asin'][i]
    bullets = product['feature_bullets'][i]
    description = product['description'][i]

    # Show the raw listing text that is about to be sent.
    print(bullets)
    print(description)

    # A chat message's content has to be a string (the earlier draft passed a
    # placeholder dict). Send the listing text in the same ```-delimited
    # layout as the few-shot example, leaving out fields that are missing
    # (NaN) so the literal text "nan" is not sent.
    listing_parts = [
        str(value)
        for value in (bullets, description)
        if not (isinstance(value, float) and np.isnan(value))
    ]
    listing_text = ",".join(listing_parts)

    messages = [
        {"role": "system", "content": "You are a highly analytic product researcher in a product development team"},
        {"role": "user", "content": User_Prompt_1},
        {"role": "assistant", "content": AI_Prompt_1},
        {"role": "user", "content": f"PRODUCT DESCRIPTIONS: ```{listing_text}```"},
    ]

    # Send the prompt to the chatbot and get the response
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages,
        functions=functions,
        function_call={"name": "describe_product"},
        temperature=0
    )

    # When the model answers with a function call, the payload is the JSON
    # string in function_call.arguments and `content` is empty. Fall back to
    # `content` only if no function call came back.
    message = response["choices"][0]["message"]
    function_call = message.get("function_call")
    if function_call:
        product_description = function_call["arguments"]
    else:
        product_description = message.get("content")

    # Store the result both by ASIN and on the product row.
    chatbot_responses[asin] = product_description
    product.loc[i, 'product_description_data'] = product_description
    print(product_description)


In [112]:
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_format_inputs_to_ChatGPT_models.ipynb


# Few-shot "user" turn: states the required output keys and gives one worked
# example input. The trailing backslashes join the physical lines, so the key
# list is sent as a single line. The commas in that list were misplaced
# (`"What is in the box,"`, `"User Description,"`) or missing (after
# `"Technical Facts"`), which ran adjacent keys together; they now sit
# outside the quotes.
User_Prompt_1 = """
Format your response as a JSON object with: \
{\
"Product Summary [100 words]",\
"What is in the box",\
"Technical Facts",\
"Features",\
"How the product is used",\
"Where the product is used",\
"User Description",\
"Packaging",\
"Season",\
}

PRODUCT DESCRIPTIONS from an ecommerce site \
delimited with triple backticks. ``` \
If information isn't present, use "unknown" value. \
Product Summary  : summary to give feedback to the \
product development department, responsible for researching \
and developing the product. Max 100 words. \

PRODUCT DESCRIPTIONS: ```\
[" The included pencil works great to bring the magnet balls to the surface. \
It is very sturdy and has survived multiple drops."," \
Nice for a preschooler, love playing with it because of the satisfying \"click\" noises and the fun of a novel toy.\
"," Randomly doodles, practicing abcs and shapes, playing tic tac toe, or making more elaborate designs. Have fun with it!\
"," Magnetic boards are Really brilliant idea and innovative way of teaching child how to write, \
They are happy to trace the letters with this magnetic board and quickly learning letters and numbers.\
"," We bring it on long car rides, restaurant waiting for dinner, anywhere they need to be entertained. \
Much better for their brains than screen time! Each age uses it a bit differently which is neat to see.","\
MADE OF CHILD SAFE, NON-TOXIC, BPA-FREE and lead-free, tested in CPC accredited lab to ensure quality and safety"]\
```"""


# Few-shot "assistant" turn: the JSON answer expected for the example listing
# in User_Prompt_1. The backslash continuations make it a one-line JSON
# object. Two text slips in the example are corrected: the first summary
# sentence now ends with a space (it used to fuse into "fun.Includes"), and
# the stray space in "Magnetic board , pencil" is removed.
AI_Prompt_1 = """\
{\
"Product Summary": "A durable magnetic board for children's learning and fun. \
Includes a pencil for magnet balls, offering a satisfying 'click' sound. \
Suitable for various ages and healthier than screen time.",\
"What is in the box": "Magnetic board, pencil",\
"Technical Facts": "Child safe, non-toxic, BPA-free, lead-free, tested in CPC accredited lab",\
"Features": "Satisfying 'click' noises, various uses (doodling, practicing letters and shapes, playing games)",\
"How the product is used": "Doodling, practicing letters/shapes, playing games",\
"Where the product is used": "Car rides, restaurants, entertainment situations",\
"User Description": "Preschoolers and children",\
"Packaging": "unknown",\
"Season": "unknown"\
}"""

In [117]:
# Ask the model for a structured description of every product row, using
# User_Prompt_1 / AI_Prompt_1 as a one-shot example.
chatbot_responses = dict()
for i in product.index:
    asin = product['asin'][i]
    bullets = product['feature_bullets'][i]
    description = product['description'][i]

    # Show the raw listing text that is about to be sent.
    print(bullets)
    print(description)

    # Leave out fields that are missing (NaN); formatting them directly would
    # put the literal text "nan" into the prompt. When both fields are
    # present the prompt text is exactly what it was before.
    listing_parts = [
        str(value)
        for value in (bullets, description)
        if not (isinstance(value, float) and np.isnan(value))
    ]
    User_Prompt_2 = f"PRODUCT DESCRIPTIONS: ```{','.join(listing_parts)}```"

    # Send the prompt to the chatbot and get the response
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "user", "content": User_Prompt_1},
            {"role": "assistant", "content": AI_Prompt_1},
            {"role": "user", "content": User_Prompt_2},
        ],
        temperature=0,
    )

    # Store the reply text both by ASIN and on the product row.
    product_description = response["choices"][0]["message"]["content"]
    chatbot_responses[asin] = product_description
    product.loc[i, 'product_description_data'] = product_description
    print(product_description)

[" A No-Mess Toy for Kids: Make long drives bearable and keep your youngsters entertained without getting messy! Our magnetic board drawing pad is all you need to occupy their hands, minds, and eyes for hours!"," Learning & Playing in One: Its unique design uses the concept of magnetism that stimulates various sensory receptors, disguising learning as play for a refreshing experience each time."," Portable & Travel-Friendly: With only an 8.8\" x 7\" magnet board and a magnetic stylus pen, you can easily take these toddler girl toys drawing supplies on a long drive, flights, or train rides without bulking the bag."," No Eraser Needed: Let your kids unleash their creative juices just by pointing the magnetic nib of the stylus on each hole to move the magnetic beads to the surface. To undo, just use the side of the pen to push them back down."," Fun & Unique Gift: Make toddler games and magnetic toys thrilling for everyone! Click 'Add to Cart' now and wrap it up as a gift that even the wh

In [None]:
from sqlalchemy import create_engine, text, MetaData, Table
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.sql import select

# Create a SQLAlchemy engine.
# The connection URL embeds the database password, so it is read from the
# DATABASE_URL environment variable (e.g. set in .env) when available. The
# previous local-development URL is kept only as a fallback so existing
# setups keep working.
DATABASE_URL = os.getenv(
    'DATABASE_URL',
    'postgresql://postgres:mysecretpassword@localhost:5432/postgres',
)
engine = create_engine(DATABASE_URL)

# Registry for tables that the helpers in this cell reflect or create.
metadata = MetaData()

def create_table(table_name, schema):
    """Return the table ``table_name``, creating it if reflection fails.

    Args:
        table_name: Name of the table to reflect or create.
        schema: Column definition(s) used only when the table has to be
            created: either a single SQLAlchemy schema item (as before) or a
            list/tuple of them.

    Returns:
        The ``Table`` object registered on the module-level ``metadata``.
        The earlier version built this object but returned ``None``.
    """
    try:
        # Reflect the existing definition from the database.
        return Table(table_name, metadata, autoload_with=engine)
    except SQLAlchemyError:
        # Reflection failed (e.g. the table does not exist yet): define it
        # from the supplied columns and emit CREATE TABLE.
        columns = schema if isinstance(schema, (list, tuple)) else (schema,)
        table = Table(table_name, metadata, *columns, extend_existing=True)
        metadata.create_all(engine)
        return table

def insert_data(table_name, dataframe):
    """Insert every row of ``dataframe`` into the existing table ``table_name``.

    Args:
        table_name: Name of a table that can be reflected through ``engine``.
        dataframe: Rows to insert; column names must match the table's
            column names.

    All rows are sent in one ``execute`` call inside a single transaction,
    so either every row is stored or none is. Missing values (NaN/NaT) are
    converted to ``None`` so they are stored as SQL NULL instead of a
    floating-point NaN.
    """
    # Cast to object first so that None can be stored in numeric columns.
    cleaned = dataframe.astype(object).where(dataframe.notna(), None)

    # Convert DataFrame to a list of dictionaries
    data = cleaned.to_dict(orient='records')

    # Get the table
    table = Table(table_name, metadata, autoload_with=engine)

    # Nothing to insert. An empty parameter list must not reach execute(),
    # which could otherwise run the INSERT once with no values.
    if not data:
        return

    # Insert the data: passing the whole list lets the driver batch the rows
    # instead of issuing one statement per row.
    with engine.begin() as connection:
        connection.execute(table.insert(), data)

def delete_duplicates(table_name, column_name, id):
    """Delete duplicate rows, keeping one row per distinct ``column_name`` value.

    For every group of rows sharing the same ``column_name`` value, the row
    with the smallest ``id`` is kept and the others are deleted.

    The previous statement selected the non-aggregated ``id`` column from a
    ``GROUP BY column_name`` subquery, which PostgreSQL rejects. Its
    ``HAVING COUNT(*) > 1`` / ``NOT IN`` combination also targeted exactly
    the rows that were *not* duplicated.

    Args:
        table_name: Table to clean up.
        column_name: Column whose values must end up unique.
        id: Name of the row-identifier column used to pick the survivor.

    The three names are interpolated into the SQL text as identifiers, so
    they must come from trusted code, never from user input.
    """
    with engine.begin() as connection:
        delete_query = text(f"""
            DELETE FROM {table_name}
            WHERE {id} NOT IN (
                SELECT MIN({id})
                FROM {table_name}
                GROUP BY {column_name}
            )
        """)
        connection.execute(delete_query)

def get_duplicate_asins(table_name, column_name):
    """List the ``column_name`` values that occur more than once in ``table_name``.

    Returns:
        All result rows as ``(value, count)`` pairs, fetched eagerly.

    Both names are interpolated into the SQL text as identifiers.
    """
    duplicates_query = text(f"""
            SELECT {column_name}, COUNT(*) as count
            FROM {table_name} 
            GROUP BY {column_name}
            HAVING COUNT(*) > 1
        """)
    with engine.begin() as connection:
        rows = connection.execute(duplicates_query).fetchall()
    return rows


In [None]:
from sqlalchemy import Column, Integer, String, Float, Boolean, Text
from sqlalchemy.ext.declarative import declarative_base

# Declarative base; its metadata collects the table definitions declared below.
Base = declarative_base()

class Product(Base):
    """Declarative model for the ``products`` table.

    ``index`` is the integer primary key. Every other column is a plain,
    nullable column with no constraints or indexes declared here.
    """
    __tablename__ = 'products'

    # Primary key.
    index = Column(Integer, primary_key=True)
    # Listing text and identity.
    title = Column(Text)
    description = Column(Text)
    feature_bullets = Column(Text)
    variants = Column(Text)
    categories = Column(Text)
    asin = Column(String(10))
    url = Column(Text)
    # Review statistics.
    reviews_total_reviews = Column(Integer)
    reviews_rating = Column(Float)
    reviews_answered_questions = Column(Integer)
    item_available = Column(Boolean)
    # Pricing.
    price_symbol = Column(Text)
    price_currency = Column(Text)
    price_current_price = Column(Float)
    price_discounted = Column(Boolean)
    price_before_price = Column(Float)
    price_savings_amount = Column(Float)
    price_savings_percent = Column(Float)
    bestsellers_rank = Column(Text)
    # Media.
    main_image = Column(Text)
    total_images = Column(Integer)
    images = Column(Text)
    total_videos = Column(Integer)
    videos = Column(Text)
    # NOTE(review): delivery_message and several product_information_* fields
    # below (dimensions, available_from*, manufacturer, model_number,
    # department, store_id) are declared Float although their names suggest
    # text or dates. Presumably the types were taken from columns that were
    # entirely empty in the source data; confirm the intended types.
    delivery_message = Column(Float)
    product_information_dimensions = Column(Float)
    product_information_weight = Column(Float)
    product_information_available_from = Column(Float)
    product_information_available_from_utc = Column(Float)
    product_information_available_for_months = Column(Integer)
    product_information_available_for_days = Column(Integer)
    product_information_manufacturer = Column(Float)
    product_information_model_number = Column(Float)
    product_information_department = Column(Float)
    product_information_qty_per_order = Column(Text)
    product_information_store_id = Column(Float)
    product_information_brand = Column(Text)
    # Badge flags.
    badges_amazon_choice = Column(Boolean)
    badges_amazon_prime = Column(Boolean)
    badges_best_seller = Column(Boolean)
    # Related listings.
    sponsored_products = Column(Text)
    also_bought = Column(Text)
    other_sellers = Column(Text)
    # Filled by the description-extraction loop (stored on `product` as
    # 'product_description_data').
    product_description_data = Column(Text)

# Emit CREATE TABLE for every model registered on Base (here: `products`).
Base.metadata.create_all(engine)


In [None]:
    # create a dictionary with key-value pairs for renaming
# Maps the dotted source column names to the underscore names used by the
# `products` table. Names that are already valid map to themselves.
rename_dict = {
    'price.symbol': 'price_symbol',
    # Fixed: this used to map to 'bestsellers_rank', which is a different
    # (text) column. The boolean badge belongs in 'badges_best_seller'.
    'badges.best_seller': 'badges_best_seller',
    'badges.amazon_prime': 'badges_amazon_prime',
    # NOTE(review): the source key on the next line appears to contain a
    # non-ASCII look-alike of the letter "c". Presumably it mirrors the
    # spelling in the scraped data, so it is kept exactly as it was; verify
    # against the raw CSV header.
    'badges.amazon_сhoice': 'badges_amazon_choice',
    'reviews.total_reviews': 'reviews_total_reviews',
    'reviews.answered_questions': 'reviews_answered_questions',
    'reviews.rating': 'reviews_rating',
    'product_information.available_from': 'product_information_available_from',
    'product_information.available_from_utc': 'product_information_available_from_utc',
    'product_information.available_for_days': 'product_information_available_for_days',
    'product_information.available_for_months': 'product_information_available_for_months',
    'product_information.brand': 'product_information_brand',
    'product_information.department': 'product_information_department',
    'product_information.dimensions': 'product_information_dimensions',
    'product_information.manufacturer': 'product_information_manufacturer',
    'product_information.model_number': 'product_information_model_number',
    'product_information.qty_per_order': 'product_information_qty_per_order',
    'product_information.store_id': 'product_information_store_id',
    'product_information.weight': 'product_information_weight',
    'price.before_price': 'price_before_price',
    'price.currency': 'price_currency',
    'price.current_price': 'price_current_price',
    'price.discounted': 'price_discounted',
    'price.savings_amount': 'price_savings_amount',
    'price.savings_percent': 'price_savings_percent',
    'url': 'url',
    'title': 'title',
    'description': 'description',
    'feature_bullets': 'feature_bullets',
    'variants': 'variants',
    'categories': 'categories',
    'asin': 'asin',
    'item_available': 'item_available',
    'main_image': 'main_image',
    'total_images': 'total_images',
    'images': 'images',
    'total_videos': 'total_videos',
    'videos': 'videos',
    'delivery_message': 'delivery_message',
    'sponsored_products': 'sponsored_products',
    'also_bought': 'also_bought',
    'other_sellers': 'other_sellers',
    'product_description_data': 'product_description_data',
}
    
# Rename the columns of `product` in place; a column without an entry in
# rename_dict keeps its current name.
product.rename(
    columns=lambda column: rename_dict.get(column, column),
    inplace=True,
)

In [None]:
# Write the rows of `product` to the 'products' table.
insert_data(table_name='products', dataframe=product)

# Report the ASIN values that occur more than once, with their counts.
duplicates = get_duplicate_asins(table_name='products', column_name='asin')
print(duplicates)

# Run the de-duplication pass on 'asin', with 'index' as the row identifier.
delete_duplicates(table_name='products', column_name='asin', id='index')