In [1]:
import requests
import urllib.parse
import json
import pandas as pd
import random

#define api key (placeholder: substitute your own Keepa key; do not commit the real value)
api_key = 'your_api_key'

#define marketplace domain ID (1 = US)
domain_id = 1

#generate page number
#Keepa API uses pagination to help control the number of results returned. Max results is 10000.
#Be mindful that page number and page size combined do not exceed 10000.
#If you request a page past the last one that contains results, the query will return zero results.
#With this query, 15 was noted as the max number of pages that can be requested with a page size of 200.
#Because the query returns the same products if run sequentially, a random page is picked to diversify results.
#This is in lieu of actually checking whether an ASIN has been requested before. Will revisit if needed.
pg_number = random.randint(1, 15)


#create query parameters
query_params = {
    "categories_exclude": [ #filter unwanted categories. Check Keepa category IDs for more info
        "668145011", #aprons
        "374742011", #sports apparel
        "5768995011", #caps and hats
        "13727921011", #Alexa Skills
        "2350149011", #Apps & Games
        "18145289011", #Audible Books & Originals
        "283155", #Books
        "5174", #CDs & Vinyl
        "7141123011", #Clothing, Shoes & Jewelry
        "4991425011", #Collectibles & Fine Art
        "163856011", #Digital Music
        "2238192011", #Gift Cards
        "11260432011", #Handmade Products
        "133140011", #Kindle Store
        "599858", #Magazine Subscriptions
        "2625373011", #Movies & TV
        "229534", #Software
        "468642", #Video Games
        "9013971011" #Video Shorts
    ],
    #physical products only
    #NOTE: this key used to appear twice in the literal ("0" here and ["0"] further down);
    #Python keeps only the last value, so the single entry below carries that effective value
    #at the original key position and the serialized query is unchanged.
    "productType": [
        "0"
    ],
    "avg180_NEW_gte": 15, #minimum price
    "imageCount_gte": 1, #min image count
    "imageCount_lte": 4, #max image count
    "current_SALES_gte": 1000, #min sales rank
    "current_SALES_lte": 100000, #max sales rank
    "avg30_SALES_gte": 1000, #min average 30 day sales rank
    "avg30_SALES_lte": 100000, #max average 30 day sales rank
    "monthlySold_gte": 50, #min monthly sales
    "current_RATING_gte": 40, #min rating
    "current_COUNT_REVIEWS_gte": 25, #min reviews
    "brand": "✜Bath & Body Works", #exclude Bath & Body Works because there are so many listings
    "buyBoxSellerId": [
        "-ATVPDKIKX0DER", #exclude
        "-A3SLTBYT1P4ASM" #exclude
    ],
    "current_BUY_BOX_SHIPPING_gte": 0,
    "offerCountFBA_gte": 5, #min FBA offers
    "launchpad": False,
    "itemWeight_gte": 0,
    "itemWeight_lte": 20, #max package weight
    "isHazMat": False,
    "isAdultProduct": False,
    "singleVariation": True, #only return 1 asin per variation
    "sort": [ #sort by sales
        [
            "current_SALES",
            "asc"
        ]
    ],
    "lastOffersUpdate_gte": 6969326,
    "lastRatingUpdate_gte": 6844106,
    "page": pg_number,
    "perPage": 200
}

#serialize the query to JSON and percent-encode it so it can be embedded in the URL
query_json = urllib.parse.quote(json.dumps(query_params))

#construct get request URL
api_endpoint = f'https://api.keepa.com/query?key={api_key}&domain={domain_id}&selection={query_json}'


In [2]:
#send HTTP GET request to the Keepa query endpoint
#the timeout keeps the cell from hanging forever on a stalled connection
response = requests.get(api_endpoint, timeout=60)

#fail loudly on an HTTP error status instead of hitting a KeyError on 'asinList' later
response.raise_for_status()

# Parse the response JSON (once)
response_data = response.json()

In [3]:
#build a single-column dataframe from the ASIN list in the parsed response
df = pd.DataFrame(response_data['asinList'], columns=['asinList'])

#add a column holding the Amazon product page URL of each ASIN
df['asinLink'] = df['asinList'].map(lambda asin: "https://www.amazon.com/dp/" + asin)


In [4]:
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` is available on pandas objects;
# it is used like `apply` and displays a progress bar labelled with this description
tqdm.pandas(desc="Checking brand registration")

def is_brand_registered(asin_url, api_key='your_api_key', timeout=30):
    """Return True if the product page shows a "Visit the <brand> Store" link.

    The page is fetched through the ScrapeOps proxy endpoint and parsed with
    BeautifulSoup. The check looks for an <a> element with id "bylineInfo" and
    class "a-link-normal" whose text contains both "Visit the " and " Store".

    Parameters
    ----------
    asin_url : str
        Product page URL to fetch (passed to the proxy as its ``url`` parameter).
    api_key : str, optional
        ScrapeOps API key. The default is a placeholder; supply a real key.
    timeout : float, optional
        Seconds to wait for the proxy response before giving up (default 30).

    Returns
    -------
    bool
        True when the storefront link is found. False when it is not found OR
        when the request/parsing failed (the error is printed, not raised), so
        a False result does not distinguish "no store link" from "fetch failed".
    """
    try:
        # Fetch the HTML content of the product page with scrapeops proxy
        response = requests.get(
            url='https://proxy.scrapeops.io/v1/',
            params={
                'api_key': api_key,
                'url': asin_url,
            },
            timeout=timeout,  # avoid hanging requests
        )
        # An HTTP error status means the body is not the product page; report it
        # through the RequestException handler below instead of parsing an error page.
        response.raise_for_status()

        # Parse the html retrieved by scrapeops
        soup = BeautifulSoup(response.content, 'html.parser')

        # Look for the "Visit the [brand name] Store" byline link
        store_text = soup.find(
            "a",
            {"id": "bylineInfo", "class": "a-link-normal"},
            string=lambda text: text and "Visit the " in text and " Store" in text,
        )
        return bool(store_text)

    except requests.exceptions.RequestException as e:
        print(f"Request error for {asin_url}: {e}")
    except Exception as e:
        # Deliberate best-effort: any other failure is logged and treated as "not found"
        print(f"Error fetching or parsing page {asin_url}: {e}")

    return False





In [5]:
# Keep only the ASINs for which is_brand_registered returned False, i.e. drop the
# rows whose product page showed a brand store link (note the `~` negation).
# NOTE: is_brand_registered also returns False when fetching/parsing fails, so such rows are kept too.
# The check runs once per row via progress_apply, which shows a progress bar.
filtered_asins = df[~df['asinLink'].progress_apply(is_brand_registered)]


Checking brand registration: 100%|███████████████████████████████████████████████████| 200/200 [24:43<00:00,  7.42s/it]


In [6]:
import os
from datetime import datetime

def save_to_csv_with_unique_name(dataframe, base_path):
    """Export ``dataframe`` to CSV without overwriting an existing file.

    Parameters
    ----------
    dataframe : object exposing ``to_csv(path, index=False)``
        The dataframe to export.
    base_path : str
        Desired output FILE path (directory + filename + extension).

    Returns
    -------
    str
        The path actually written. This is ``base_path`` when it is free;
        otherwise ``<name>_<YYYYmmdd_HHMMSS>_<counter><ext>`` in the same
        directory, with the counter starting at 1.
    """
    folder, filename = os.path.split(base_path)
    stem, suffix = os.path.splitext(filename)

    # Create the destination folder if it is missing (skipped for a bare filename)
    if folder and not os.path.exists(folder):
        os.makedirs(folder)

    # Start from the requested path and move to a timestamped,
    # numbered alternative for as long as the candidate is already taken
    target = base_path
    attempt = 0
    while os.path.exists(target):
        attempt += 1
        stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        target = os.path.join(folder, f"{stem}_{stamp}_{attempt}{suffix}")

    # Write the file (row index omitted) and report where it went
    dataframe.to_csv(target, index=False)
    print(f"File saved as: {target}")
    return target

In [7]:
#file path for export: "filtered_asins.csv" inside the current user's Downloads folder
#(derived from the home directory rather than a hard-coded, machine-specific absolute path)
base_path = os.path.join(os.path.expanduser('~'), 'Downloads', 'filtered_asins.csv')


save_to_csv_with_unique_name(filtered_asins, base_path)

File saved as: C:\Users\benol\Downloads\filtered_asins_20241125_165451_1.csv


'C:\\Users\\benol\\Downloads\\filtered_asins_20241125_165451_1.csv'