In [None]:
# Colab-specific setup: mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import re
import json
import requests
from requests.adapters import Retry
import pandas as pd

def crawl_shopee_data(url, max_ratings=500, timeout=30, max_consecutive_failures=3):
    """Download the ratings of one Shopee product into a DataFrame.

    The shop id and item id are read from the ``i.<shop_id>.<item_id>`` part
    of ``url``; the ratings endpoint is then paged through 20 entries at a
    time until ``max_ratings`` is reached or a page comes back without
    ratings.

    Args:
        url: Product URL containing ``i.<shop_id>.<item_id>``.
        max_ratings: Upper bound on the number of ratings returned.
        timeout: Seconds to wait for each HTTP response before the request
            counts as failed.
        max_consecutive_failures: Number of failed requests in a row (after
            the adapter-level retries) tolerated before the crawl for this
            product stops.

    Returns:
        DataFrame with the columns ``username``, ``rating``, ``rating_time``
        and ``item_id``. If the crawl stops early because of repeated
        request failures, the ratings collected so far are returned.

    Raises:
        ValueError: If ``url`` does not contain ``i.<shop_id>.<item_id>``.
    """
    match = re.search(r"i\.(\d+)\.(\d+)", url)
    if match is None:
        raise ValueError(f"Cannot find 'i.<shop_id>.<item_id>' in URL: {url!r}")
    shop_id, item_id = match[1], match[2]

    page_size = 20  # must stay in sync with "limit=20" in ratings_url
    ratings_url = "https://shopee.vn/api/v2/item/get_ratings?filter=0&flag=1&itemid={item_id}&limit=20&offset={offset}&shopid={shop_id}&type=0"

    d = {"username": [], "rating": [], "rating_time": [], "item_id": []}

    # Adapter-level retry strategy for transient HTTP status codes
    retries = Retry(total=5, backoff_factor=1, status_forcelist=[429, 500, 502, 503, 504])

    offset = 0
    failures = 0

    # The context manager closes the session's connections when the crawl ends
    with requests.Session() as session:
        session.mount('https://', requests.adapters.HTTPAdapter(max_retries=retries))

        while offset < max_ratings:
            try:
                data = session.get(
                    ratings_url.format(shop_id=shop_id, item_id=item_id, offset=offset),
                    timeout=timeout,
                ).json()
            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers JSON decoding errors on requests versions
                # where they are not a RequestException subclass.
                failures += 1
                print(f"Error: {e}")
                if failures >= max_consecutive_failures:
                    # Without this bound a persistent failure would retry the
                    # same offset forever.
                    print(f"Giving up on item {item_id} after {failures} consecutive failed requests")
                    break
                continue  # Retry the same offset

            failures = 0

            payload = data.get("data") if isinstance(data, dict) else None
            ratings = payload.get("ratings") if isinstance(payload, dict) else None
            if not ratings:
                break  # Exit the loop if there are no ratings

            for rating in ratings:
                d["username"].append(rating["author_username"])
                d["rating"].append(rating["rating_star"])
                d["rating_time"].append(rating["ctime"])
                d["item_id"].append(item_id)

            offset += page_size

    df = pd.DataFrame(d)
    # The last page can overshoot when max_ratings is not a multiple of the
    # page size; trim so the limit is actually honoured.
    if len(df) > max_ratings:
        df = df.iloc[: int(max_ratings)]
    return df

def crawl_data_from_csv(csv_file_path):
    """Crawl the ratings of every product URL listed in a CSV file.

    Args:
        csv_file_path: Path to a CSV file that has a 'Product Link' column.

    Returns:
        One DataFrame holding the ratings of all products, re-indexed from
        zero. If the file contains no usable links, an empty frame with the
        columns ``username``, ``rating``, ``rating_time`` and ``item_id`` is
        returned.
    """
    df_urls = pd.read_csv(csv_file_path, encoding='latin-1')  # or encoding='cp1252'

    # Empty cells are read as NaN; handing those to the URL regex would raise
    # and abort the whole crawl, so they are dropped up front.
    links = df_urls['Product Link'].dropna()
    skipped = len(df_urls) - len(links)
    if skipped:
        print(f"Skipping {skipped} row(s) with an empty 'Product Link'")

    dfs = []
    for url in links:
        print(f"Crawling data for URL: {url}")
        dfs.append(crawl_shopee_data(url))

    if not dfs:
        # pd.concat raises ValueError when given an empty list
        return pd.DataFrame(columns=["username", "rating", "rating_time", "item_id"])

    return pd.concat(dfs, ignore_index=True)

# Example usage: crawl the ratings for every product link in links.csv,
# print the combined frame, and save it to data.csv without the index column.
csv_file_path = 'links.csv'
result_df = crawl_data_from_csv(csv_file_path)
print(result_df)
result_df.to_csv("data.csv", index=False)

Crawling data for URL: https://shopee.vn/Áo-ki?u-Lovito-thêu-h?a-ti?t-ph?i-l??i-t??ng-ph?n-màu-tr?n-th??ng-ngày-cho-n?-LNE19119-(Màu-m?)-i.446089250.21192160813?sp_atk=d30b9ed4-bea0-4846-a4c9-a58b6bf9dbb0&xptdk=d30b9ed4-bea0-4846-a4c9-a58b6bf9dbb0
Crawling data for URL: https://shopee.vn/MONATA-BLUELIGHT-Tee-Retro-Cassette-Áo-thun-unisex-form-r?ng-i.104170321.20604319586?sp_atk=8186acef-acae-4af4-818b-5aeb3fb95790&xptdk=8186acef-acae-4af4-818b-5aeb3fb95790
Crawling data for URL: https://shopee.vn/Áo-thun-n?-nam-unisex-tay-l?-phông-local-brand-form-r?ng-teen-c?-tròn-oversize-cotton-màu-?en-tr?ng-CLOUDZY-BASIC-TEE-2-i.909107171.22422234055?sp_atk=57206b13-ba07-42aa-b149-a06b050cfc71&xptdk=57206b13-ba07-42aa-b149-a06b050cfc71
Crawling data for URL: https://shopee.vn/Áo-thun-nam-n?-tay-l?-unisex-form-r?ng-Áo-thun-SOAT-Áo-phông-Ng??i-??p-Vì-L?a-i.714509866.22676261984?sp_atk=fe4aed60-c873-4642-857c-95e9af0c2360&xptdk=fe4aed60-c873-4642-857c-95e9af0c2360
Crawling data for URL: https://shopee

In [None]:
import re
import requests
from requests.adapters import Retry
import pandas as pd

def crawl_shopee_item_id(url):
    """Extract the item id from a Shopee product URL.

    Args:
        url: Product URL containing ``i.<shop_id>.<item_id>``.

    Returns:
        The item id as a string of digits.

    Raises:
        ValueError: If ``url`` does not contain ``i.<shop_id>.<item_id>``.
    """
    match = re.search(r"i\.(\d+)\.(\d+)", url)
    if match is None:
        raise ValueError(f"Cannot find 'i.<shop_id>.<item_id>' in URL: {url!r}")
    # Group 1 is the shop id; only the item id (group 2) is needed here.
    return match[2]

def crawl_data_from_csv(csv_file_path):
    """Collect the distinct item ids of the product URLs listed in a CSV file.

    Note: this function has the same name as the ratings crawler defined in
    an earlier cell and replaces it once this cell has run.

    Args:
        csv_file_path: Path to a CSV file that has a 'Product Link' column.

    Returns:
        DataFrame with a single ``item_id`` column holding each distinct item
        id once, in the order it first appears in the file.
    """
    df_urls = pd.read_csv(csv_file_path, encoding='latin-1')  # or encoding='cp1252'

    # Empty cells are read as NaN; handing those to the URL regex would raise,
    # so they are dropped up front.
    links = df_urls['Product Link'].dropna()
    skipped = len(df_urls) - len(links)
    if skipped:
        print(f"Skipping {skipped} row(s) with an empty 'Product Link'")

    item_ids = []
    for url in links:
        print(f"Crawling item ID for URL: {url}")
        item_ids.append(crawl_shopee_item_id(url))

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is the same on every run (a set's iteration order is not).
    unique_item_ids = list(dict.fromkeys(item_ids))

    return pd.DataFrame({"item_id": unique_item_ids})

# Example usage: collect the distinct item ids from links.csv, print them,
# and save them to unique_item_ids.csv without the index column.
csv_file_path = 'links.csv'  # Replace with the actual path to your CSV file
result_df = crawl_data_from_csv(csv_file_path)
print(result_df)
result_df.to_csv("unique_item_ids.csv", index=False)

Crawling item ID for URL: https://shopee.vn/Áo-ki?u-Lovito-thêu-h?a-ti?t-ph?i-l??i-t??ng-ph?n-màu-tr?n-th??ng-ngày-cho-n?-LNE19119-(Màu-m?)-i.446089250.21192160813?sp_atk=d30b9ed4-bea0-4846-a4c9-a58b6bf9dbb0&xptdk=d30b9ed4-bea0-4846-a4c9-a58b6bf9dbb0
Crawling item ID for URL: https://shopee.vn/MONATA-BLUELIGHT-Tee-Retro-Cassette-Áo-thun-unisex-form-r?ng-i.104170321.20604319586?sp_atk=8186acef-acae-4af4-818b-5aeb3fb95790&xptdk=8186acef-acae-4af4-818b-5aeb3fb95790
Crawling item ID for URL: https://shopee.vn/Áo-thun-n?-nam-unisex-tay-l?-phông-local-brand-form-r?ng-teen-c?-tròn-oversize-cotton-màu-?en-tr?ng-CLOUDZY-BASIC-TEE-2-i.909107171.22422234055?sp_atk=57206b13-ba07-42aa-b149-a06b050cfc71&xptdk=57206b13-ba07-42aa-b149-a06b050cfc71
Crawling item ID for URL: https://shopee.vn/Áo-thun-nam-n?-tay-l?-unisex-form-r?ng-Áo-thun-SOAT-Áo-phông-Ng??i-??p-Vì-L?a-i.714509866.22676261984?sp_atk=fe4aed60-c873-4642-857c-95e9af0c2360&xptdk=fe4aed60-c873-4642-857c-95e9af0c2360
Crawling item ID for URL: