In [2]:
# Imports
import re
import os
import csv
import sys
import time
import requests


In [3]:
def dk_code_extractor(dk_link):
    """Extract the numeric Digikala product code from a link or a bare ID.

    An explicit ``dkp-<digits>`` marker takes precedence over any other digits
    in the string (for example the ``1`` in ``/v1/``). Without a marker the
    first run of digits is used, so plain IDs such as ``"123"`` keep working.

    Args:
        dk_link: Product URL, ``dkp-<digits>`` slug, or bare numeric ID.
            Non-string values are converted with ``str()`` first.

    Returns:
        str: The product code as a string of digits.

    Raises:
        ValueError: If ``dk_link`` contains no digits at all.
    """
    text = str(dk_link)
    # Look for the explicit marker first; fall back to the first digit run.
    match = re.search(r"dkp-(?P<code>\d+)", text) or re.search(r"(?P<code>\d+)", text)
    if match is None:
        raise ValueError(f"No product code found in {dk_link!r}")
    return match.group("code")


<h4>Getting & saving the products list</h4>

In [4]:
def get_products_of_category(category):
    """Collect the IDs of every product listed under a Digikala category.

    Pages are requested one after another from the category search endpoint
    until the page count reported by the API, or the internal page limit
    (100), is reached. Every page gets its own retry budget. When a page still
    cannot be fetched after all retries, pagination stops and the products
    gathered so far are returned instead of raising.

    Args:
        category: Category slug used in the API URL (e.g. ``"laptop"``).

    Returns:
        dict: ``{"products": [{"id": ...}, ...], "category": category}``.
    """
    url = f"https://api.digikala.com/v1/categories/{category}/search/"

    max_page_limit = 100
    max_retry_on_error = 5  # attempts per page
    retry_delay = 5  # seconds to wait between attempts
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    def fetch_page(page):
        """Return ``(products, total_pages)`` for one page, or ``None`` if every attempt failed."""
        for retries_left in range(max_retry_on_error - 1, -1, -1):
            try:
                response = requests.get(url, params={"page": page}, timeout=request_timeout)
                payload = response.json()
                if not payload:
                    raise ValueError("Empty response")
                data = payload["data"]
                # Extract inside the try block so a malformed payload is retried too.
                products = [{"id": product["id"]} for product in data["products"]]
                return products, data["pager"]["total_pages"]
            except Exception:
                # Deliberately broad: network, JSON and payload-shape errors are all retried.
                if retries_left > 0:
                    print(f"🔴Error occurred, ({retries_left}) retying...")
                    time.sleep(retry_delay)
        return None

    total = []
    cur_page = 1
    total_pages = 1  # refreshed from the pager of every fetched page

    sys.stdout.write(f"\n{category} pages:\n")

    while cur_page <= total_pages and cur_page <= max_page_limit:
        fetched = fetch_page(cur_page)
        if fetched is None:
            print(f"\n🔴Giving up on page {cur_page}; keeping the products collected so far.")
            break

        products, total_pages = fetched
        total.extend(products)

        # Progress indicator, wrapped every 20 pages.
        sys.stdout.write(f"✅{format(cur_page, '02d')} ")
        if cur_page % 20 == 0:
            sys.stdout.write("\n")

        cur_page += 1

    return {"products": total, "category": category}


def save_products(category):
    """Fetch the product IDs of ``category`` and store them as a CSV file.

    The file is written to ``products/<category> (<count>).csv`` with one ID
    per row and no header row; the ``products`` directory is created on demand.
    """
    fetched = get_products_of_category(category)["products"]
    count = len(fetched)

    print(f"\n{count} products of {category} category retrieved!")

    target = f"products/{category} ({count}).csv"
    os.makedirs(os.path.dirname(target), exist_ok=True)

    with open(target, "w", encoding="UTF8", newline="") as out:
        csv.DictWriter(out, fieldnames=["id"]).writerows(fetched)


# categories = ["electronic-devices", "laptop", "mobile-phone"]
# for category in categories:
#     save_products(category)


<h4>Reading the saved products</h4>

In [5]:
# Reading the product IDs
def get_all_products(path='./products/'):
    """Read every saved product ID from the files in ``path``.

    Args:
        path: Directory whose regular files hold one product ID per line.
            Defaults to ``'./products/'``. Sub-directories are ignored.

    Returns:
        list[str]: All IDs with trailing whitespace removed, files visited in
        sorted filename order. Blank lines are skipped.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    ids = []
    # Sort so the result does not depend on the directory listing order.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue
        # Same encoding the product files are written with.
        with open(file_path, "r", encoding="UTF8") as file:
            for line in file:
                product_id = line.rstrip()
                if product_id:  # a blank line would otherwise become an empty ID
                    ids.append(product_id)
    return ids
    
# get_all_products()

<h4>Getting & saving the comments</h4>

In [6]:
def get_comments(dk_link):
    """Download the text comments of one Digikala product.

    At most 5 pages are requested. Every page gets its own retry budget. When a
    page still cannot be fetched after all retries, pagination stops and the
    comments gathered so far are returned instead of raising.

    Args:
        dk_link: Product link or ID; passed to ``dk_code_extractor`` to obtain
            the product code used in the API URL.

    Returns:
        dict: ``{"comments": [{"rate": ..., "text": ...}, ...], "code": code}``.
        Comments whose body is ``None`` are dropped; newlines and tabs inside a
        body are replaced by single spaces.
    """
    code = dk_code_extractor(dk_link)
    url = f"https://api.digikala.com/v1/product/{code}/comments/"

    max_page_limit = 5
    max_retry_on_error = 3  # attempts per page
    retry_delay = 5  # seconds to wait between attempts
    request_timeout = 30  # seconds; keeps a stalled connection from hanging the run

    def fetch_page(page):
        """Return ``(comments, total_pages)`` for one page, or ``None`` if every attempt failed."""
        for retries_left in range(max_retry_on_error - 1, -1, -1):
            try:
                response = requests.get(url, params={"page": page}, timeout=request_timeout)
                payload = response.json()
                if "data" not in payload:
                    raise ValueError("Empty response")
                data = payload["data"]
                if "comments" not in data:
                    raise ValueError("Empty response")

                # Extract inside the try block so a malformed payload is retried too.
                rows = []
                for comment in data["comments"]:
                    text = comment["body"]
                    if text is not None:
                        text = text.replace("\n", " ").replace("\t", " ")
                        rows.append({"rate": comment["rate"], "text": text})
                return rows, data["pager"]["total_pages"]
            except Exception:
                # Deliberately broad: network, JSON and payload-shape errors are all retried.
                if retries_left > 0:
                    print(f"🔴Error occurred {code}, ({retries_left}) retying...")
                    time.sleep(retry_delay)
        return None

    total = []
    cur_page = 1
    total_pages = 1  # refreshed from the pager of every fetched page

    while cur_page <= total_pages and cur_page <= max_page_limit:
        fetched = fetch_page(cur_page)
        if fetched is None:
            # Stop here: later pages are unlikely to work and must not crash the run.
            break

        rows, total_pages = fetched
        total.extend(rows)

        # Progress indicator, wrapped every 20 pages.
        sys.stdout.write(f"✅{format(cur_page, '02d')} ")
        if cur_page % 20 == 0:
            sys.stdout.write("\n")

        cur_page += 1

    return {"comments": total, "code": code}


def save_comments(dk_link):
    """Fetch the comments of one product and store them as a CSV file.

    The file is written to ``comments/<code> (<count>).csv`` with a rate and a
    text column per row and no header row; the ``comments`` directory is
    created on demand.
    """
    print(f"\nGetting the comments of `dkp-{dk_link}` ...")

    fetched = get_comments(dk_link)
    code, rows = fetched["code"], fetched["comments"]
    count = len(rows)

    print(f"\n{count} comments of dkp-{code} retrieved!")

    target = f"comments/{code} ({count}).csv"
    os.makedirs(os.path.dirname(target), exist_ok=True)

    with open(target, "w", encoding="UTF8", newline="") as out:
        csv.DictWriter(out, fieldnames=["rate", "text"]).writerows(rows)


# comments = get_all_products()
# print(f"✅ {len(comments)} Product IDs fetched.")
# for id in comments:
#     save_comments(id)


In [17]:
# Reading the saved comments
def aggregate_saved_comments(path="./comments/"):
    """Load the rows of every saved comments CSV file in ``path``.

    Args:
        path: Directory containing the CSV files. Defaults to ``"./comments/"``.
            Sub-directories are ignored.

    Returns:
        list[list[str]]: All non-empty CSV rows, files visited in sorted
        filename order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    comments = []
    # Sort so the result does not depend on the directory listing order.
    for filename in sorted(os.listdir(path)):
        file_path = os.path.join(path, filename)
        if not os.path.isfile(file_path):
            continue
        # newline="" lets the csv module handle line endings itself, so line
        # breaks embedded in quoted fields are read back unchanged.
        with open(file_path, encoding="utf8", mode="r", newline="") as file:
            # csv.reader yields [] for blank lines; those carry no comment.
            comments.extend(row for row in csv.reader(file) if row)
    return comments


def save_comments_dataset(output_path="../comments.csv"):
    """Merge all saved comments into a single labeled CSV dataset.

    Rows come from ``aggregate_saved_comments()``. A row is kept only when it
    has at least a rate and a text column and its rate is not ``"0"``
    (un-labeled). Only the first two columns of each kept row are written.

    Args:
        output_path: Destination CSV file. Defaults to ``"../comments.csv"``.

    Returns:
        None. The number of written rows is printed.
    """
    saved_comments = aggregate_saved_comments()

    # Remove un-labeled comments; also drop empty/short rows, which would
    # otherwise raise IndexError when their columns are accessed.
    labeled_comments = [
        row for row in saved_comments if len(row) >= 2 and row[0] != "0"
    ]

    with open(output_path, "w", encoding="UTF8", newline="") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerows([row[0], row[1]] for row in labeled_comments)
    print(f"✅{len(labeled_comments)} comments saved.")


# Build the final dataset: merge the saved files under ./comments/ and write
# the labeled rows to ../comments.csv.
save_comments_dataset()

✅144715 comments saved.
