In [None]:
#import libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random
import time
import json
import logging
from dotenv import load_dotenv
import os
from utils import setup_logging
from itertools import cycle
import datetime

In [None]:
# Fetch the AliExpress product categories and save them to disk
def get_category():
    """Download the AliExpress category list from RapidAPI and save it as JSON.

    The RapidAPI host and key are read from the ``RAPIDAPI_HOST`` and
    ``RAPIDAPI_KEY`` environment variables. The response is written to
    ``raw_data/aliexpress-categories-<timestamp>.json``.

    Returns:
        The path of the written file, or ``None`` when the download or the
        write failed (the failure is logged, never raised).
    """
    setup_logging()
    logger = logging.getLogger(__name__)
    logger.info("Aliexpress_api_get_categories initialised")

    url = 'https://ali-express1.p.rapidapi.com/categories'
    headers = {
        "X-RapidAPI-Host": os.getenv("RAPIDAPI_HOST"),
        "X-RapidAPI-Key": os.getenv("RAPIDAPI_KEY"),
    }

    try:
        # A timeout keeps the call from hanging forever on a stalled connection.
        response = requests.get(url, headers=headers, timeout=30)
        # Without this, a 4xx/5xx error body would be saved as if it were data.
        response.raise_for_status()
        categories = response.json()
    except requests.exceptions.RequestException as e:
        logger.error(f"Error fetching Aliexpress categories: {e}")
        return None
    except ValueError as e:
        # Older requests versions raise a plain ValueError for a non-JSON body.
        logger.error(f"Error fetching Aliexpress categories: invalid JSON response: {e}")
        return None

    parent_dir = "raw_data"
    datetimestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    path = os.path.join(parent_dir, f"aliexpress-categories-{datetimestamp}.json")

    try:
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(parent_dir, exist_ok=True)
        with open(path, "w") as f:
            json.dump(categories, f, indent=3)
    except (OSError, TypeError, ValueError) as e:
        logger.error(f"Error saving Aliexpress categories to {path}: {e}")
        return None

    logger.info(f"Aliexpress categories saved to {path}")
    return path



In [None]:
class Aliexpress_products_scraper:
    """Scrape AliExpress search listings page by page and store the raw JSON.

    Request headers are read from the environment (``total_headers`` plus
    ``product_header_1`` .. ``product_header_N``) and rotated between requests.
    Each fetched page is written to a timestamped JSON file under ``rawdata/``.
    """

    # Seconds to wait on a single HTTP request before giving up.
    REQUEST_TIMEOUT = 30
    # Directory (relative to the working directory) that receives the dumps.
    OUTPUT_DIR = "rawdata"

    def __init__(self):
        self.url = "https://www.aliexpress.com/fn/search-pc/index"
        # setup_logging is the module-level helper imported from utils;
        # the class itself has no such method.
        setup_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Aliexpress_products_scraper initialised")
        self.headers_cycle = cycle(self.get_headers())

    @staticmethod
    def _parse_header(raw):
        """Convert one header environment value into a dict, or None if unusable."""
        import ast  # local import: only needed for the dict-literal fallback

        if not raw:
            return None
        try:
            parsed = json.loads(raw)
        except ValueError:
            # Fall back to a Python dict literal (single-quoted keys/values).
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError):
                return None
        return parsed if isinstance(parsed, dict) else None

    def get_headers(self):
        """Load the configured request headers from the environment.

        ``total_headers`` gives the number of header sets; each set is read
        from ``product_header_<i>``. Environment values are strings while
        ``requests`` needs a mapping, so every value is parsed into a dict
        (assumed to be stored as a JSON object - TODO confirm the .env format).

        Returns:
            A list of header dicts. Missing or unparsable entries are skipped
            with a warning; an unset/invalid ``total_headers`` yields ``[]``.
        """
        raw_total = os.getenv("total_headers")
        try:
            total_headers = int(raw_total)
        except (TypeError, ValueError):
            self.logger.error(
                f"Environment variable 'total_headers' must be an integer, got {raw_total!r}"
            )
            return []

        headers = []
        for i in range(1, total_headers + 1):
            name = f"product_header_{i}"
            parsed = self._parse_header(os.getenv(name))
            if parsed is None:
                self.logger.warning(f"Skipping {name}: missing or not a valid header mapping")
                continue
            headers.append(parsed)

        return headers

    def rotate_headers(self):
        """Return the next header dict in round-robin order.

        Raises:
            RuntimeError: when no headers were loaded (instead of the bare,
                message-less StopIteration an empty cycle would produce).
        """
        try:
            return next(self.headers_cycle)
        except StopIteration:
            raise RuntimeError(
                "No request headers configured; set total_headers and product_header_<i>"
            ) from None

    def get_payload(self, query, page):
        """Build the JSON body for one search request.

        Args:
            query: Search text; sent both as ``SearchText`` and inside the
                JSON-encoded ``categoryUrlParams`` string.
            page: 1-based page number to request.

        Returns:
            The payload dict to send as the request's JSON body.
        """
        # NOTE(review): pageVersion is a hard-coded hash; presumably it has to
        # match the version the site currently serves - confirm.
        payload = {
            "pageVersion": "7ece9c0cc9cf2052db74f0d1b26b7033",
            "target": "root",
            "data": {
                "isFromCategory": "y",
                "categoryUrlParams": json.dumps({
                    "q": query,
                    "s": "qp_nw",
                    "sg_search_params": "",
                    "searchBizScene": "openSearch",
                    "recog_lang": "en",
                    "guideModule": "category_navigate_vertical",
                }),
                "page": page,
                "g": "y",
                "SearchText": query,
                "origin": "y",
            },
            "eventName": "onChange",
            "dependency": [],
        }

        return payload

    def fetch_category_dim_from_csv(self, file):
        """Read the category dimension CSV at ``file`` and return it as a DataFrame."""
        # The original also called to_dict(orient="records") and discarded the
        # result; that no-op is removed and the DataFrame return is kept.
        return pd.read_csv(file)

    @staticmethod
    def _merge_into_json_file(path, payload):
        """Write ``payload`` to ``path``, merging with any JSON already there.

        A new file receives ``payload`` unchanged. If the file already exists,
        its content becomes a list (wrapped if it was not one) and ``payload``
        is appended to it (or extended into it when ``payload`` is a list).
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        if os.path.exists(path):
            with open(path, "r") as f:
                existing = json.load(f)
            merged = existing if isinstance(existing, list) else [existing]
            if isinstance(payload, list):
                merged.extend(payload)
            else:
                merged.append(payload)
        else:
            merged = payload

        # Rewriting in "w" mode truncates, so no stale tail can survive.
        with open(path, "w") as f:
            json.dump(merged, f, indent=3)

    def write_products_details_to_file(self, category, subcategory, products_results):
        """Write one page of product results to a timestamped JSON file.

        The file name is ``rawdata/<category>_<subcategory>_<timestamp>.json``
        with second resolution; two writes landing in the same second are
        merged into one list instead of failing. Errors are logged, not raised.
        """
        datetimestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        path = os.path.join(self.OUTPUT_DIR, f"{category}_{subcategory}_{datetimestamp}.json")
        try:
            self._merge_into_json_file(path, products_results)
        except Exception as e:
            self.logger.error(f"Error writing to {path}: {e}")

    def _fetch_listing_page(self, query, page, category, subcategory):
        """Request one listings page; return the decoded JSON or None on failure."""
        payload = self.get_payload(query, page)
        try:
            response = requests.post(
                self.url,
                headers=self.rotate_headers(),
                json=payload,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            self.logger.error(
                f"RequestException: {e} in {self.url} for query '{query}' "
                f"and page {page} for category '{category}' and subcategory '{subcategory}'"
            )
        except ValueError as e:
            # Older requests versions raise a plain ValueError for a non-JSON body.
            self.logger.error(f"Invalid JSON for query '{query}' on page {page}: {e}")
        return None

    def extract_products_listings(self, query, category=None, subcategory=None):
        """Fetch every listings page for ``query`` and save each page to disk.

        Args:
            query: Search text.
            category: Label used in the output file name. Defaults to
                ``"uncategorised"`` when omitted.
            subcategory: Second label used in the output file name; same
                default as ``category``.

        The first page supplies the total page count. A failure on the first
        page stops the run; a failure on a later page is logged and skipped so
        the remaining pages are still collected. Nothing is raised.
        """
        category = "uncategorised" if category is None else category
        subcategory = "uncategorised" if subcategory is None else subcategory

        try:
            first_page = self._fetch_listing_page(query, 1, category, subcategory)
            if first_page is None:
                return

            try:
                total_page_number = int(
                    first_page["data"]["result"]["pageInfo"]["totalPage"]
                )
            except (KeyError, TypeError, ValueError) as e:
                self.logger.error(
                    f"Error in extract_products_listings: cannot read totalPage "
                    f"for query '{query}': {e!r}"
                )
                return

            self.write_products_details_to_file(category, subcategory, first_page)

            for page in range(2, total_page_number + 1):
                # Random pause between requests to avoid hammering the site.
                time.sleep(random.randint(1, 7))
                results = self._fetch_listing_page(query, page, category, subcategory)
                if results is None:
                    continue
                self.write_products_details_to_file(category, subcategory, results)

        except Exception as e:
            # Best-effort boundary: log unexpected errors instead of crashing.
            self.logger.error(f"Error in extract_products_listings: {e}")

if __name__ == "__main__":
    # Load the .env file first: the scraper's constructor reads its request
    # headers from environment variables.
    load_dotenv()
    scraper = Aliexpress_products_scraper()
    # NOTE(review): three positional arguments are passed here - confirm they
    # match the parameters extract_products_listings accepts.
    listing_args = ("laptop", "electronics", "computers")
    scraper.extract_products_listings(*listing_args)


In [None]:
class Aliexpress_reviews_scraper():
    """Scrape AliExpress product reviews and store the raw JSON pages.

    Product IDs come from a CSV file; request headers are read from the
    environment (``total_headers`` plus ``product_header_1`` ..
    ``product_header_N``) and rotated between requests. Each fetched page is
    written to a timestamped JSON file under ``rawdata/``.
    """

    # Seconds to wait on a single HTTP request before giving up.
    REQUEST_TIMEOUT = 30
    # Directory (relative to the working directory) that receives the dumps.
    OUTPUT_DIR = "rawdata"
    # Maximum number of products processed per run.
    BATCH_SIZE = 60
    REVIEWS_URL = "https://feedback.aliexpress.com/pc/searchEvaluation.do"

    def __init__(self):
        # setup_logging is the module-level helper imported from utils;
        # the class itself has no such method.
        setup_logging()
        self.logger = logging.getLogger(__name__)
        self.logger.info("Aliexpress_reviews_scraper initialised")
        self.header_cycle = cycle(self.get_headers())

    @staticmethod
    def _parse_header(raw):
        """Convert one header environment value into a dict, or None if unusable."""
        import ast  # local import: only needed for the dict-literal fallback

        if not raw:
            return None
        try:
            parsed = json.loads(raw)
        except ValueError:
            # Fall back to a Python dict literal (single-quoted keys/values).
            try:
                parsed = ast.literal_eval(raw)
            except (ValueError, SyntaxError, TypeError):
                return None
        return parsed if isinstance(parsed, dict) else None

    def get_headers(self):
        """Load the configured request headers from the environment.

        ``total_headers`` gives the number of header sets; each set is read
        from ``product_header_<i>``. Environment values are strings while
        ``requests`` needs a mapping, so every value is parsed into a dict
        (assumed to be stored as a JSON object - TODO confirm the .env format).

        Returns:
            A list of header dicts. Missing or unparsable entries are skipped
            with a warning; an unset/invalid ``total_headers`` yields ``[]``.
        """
        raw_total = os.getenv("total_headers")
        try:
            total_headers = int(raw_total)
        except (TypeError, ValueError):
            self.logger.error(
                f"Environment variable 'total_headers' must be an integer, got {raw_total!r}"
            )
            return []

        headers = []
        for i in range(1, total_headers + 1):
            name = f"product_header_{i}"
            parsed = self._parse_header(os.getenv(name))
            if parsed is None:
                self.logger.warning(f"Skipping {name}: missing or not a valid header mapping")
                continue
            headers.append(parsed)

        return headers

    def rotate_headers(self):
        """Return the next header dict in round-robin order.

        Raises:
            RuntimeError: when no headers were loaded (instead of the bare,
                message-less StopIteration an empty cycle would produce).
        """
        try:
            return next(self.header_cycle)
        except StopIteration:
            raise RuntimeError(
                "No request headers configured; set total_headers and product_header_<i>"
            ) from None

    def fetch_product_ids_from_csv(self, file):
        """Return up to 60 CSV rows whose ``reviews_retrieved`` column is False.

        Args:
            file: Path of the CSV to read. The caller later reads the
                ``productID``, ``category`` and ``subcategory`` keys of each row.

        Returns:
            A list of row dicts, or ``None`` when the file cannot be read or
            filtered (the error is logged).
        """
        try:
            df = pd.read_csv(file)
            unretrieved_reviews = df.query("reviews_retrieved == False")
            # head() already copes with fewer rows than the batch size.
            return unretrieved_reviews.head(self.BATCH_SIZE).to_dict(orient="records")
        except Exception as e:
            self.logger.error(f"Error fetching product IDs from CSV: {e}")
            return None

    @staticmethod
    def _merge_into_json_file(path, payload):
        """Write ``payload`` to ``path``, merging with any JSON already there.

        A new file receives ``payload`` unchanged. If the file already exists,
        its content becomes a list (wrapped if it was not one) and ``payload``
        is appended to it (or extended into it when ``payload`` is a list).
        """
        directory = os.path.dirname(path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        if os.path.exists(path):
            with open(path, "r") as f:
                existing = json.load(f)
            merged = existing if isinstance(existing, list) else [existing]
            if isinstance(payload, list):
                merged.extend(payload)
            else:
                # list.extend(dict) would add only the dict's keys.
                merged.append(payload)
        else:
            merged = payload

        # Rewriting in "w" mode truncates, so no stale tail can survive.
        with open(path, "w") as f:
            json.dump(merged, f, indent=3)

    def write_reviews_to_file(self, category, subcategory, new_reviews):
        """Write one page of scraped reviews to a timestamped JSON file.

        The file name is
        ``rawdata/<category>_<subcategory>_<timestamp>_reviews.json`` with
        second resolution; two writes landing in the same second are merged
        into one list instead of failing. Errors are logged, not raised.
        """
        datetimestamp = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
        path = os.path.join(
            self.OUTPUT_DIR, f"{category}_{subcategory}_{datetimestamp}_reviews.json"
        )
        try:
            self._merge_into_json_file(path, new_reviews)
        except Exception as e:
            self.logger.error(f"Error writing to {path}: {e}")

    def _fetch_reviews_page(self, productID, page):
        """Request one reviews page and return the decoded JSON.

        Raises:
            requests.exceptions.RequestException: on network or HTTP errors.
            ValueError: when the body is not valid JSON.
        """
        params = {
            "productId": productID,
            "lang": "en_US",
            "country": "UK",
            "page": page,
            "pageSize": 10,
            "filter": "all",
            "sort": "complex_default",
        }
        response = requests.get(
            self.REVIEWS_URL,
            params=params,
            headers=self.rotate_headers(),
            timeout=self.REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        return response.json()

    def _scrape_product(self, productID, category, subcategory):
        """Fetch and store every reviews page for a single product."""
        page_1_result = self._fetch_reviews_page(productID, 1)
        time.sleep(random.randint(1, 3))

        if not isinstance(page_1_result, dict) or not page_1_result.get("records"):
            self.logger.info(f"No reviews found for Product ID: {productID}")
            return

        self.write_reviews_to_file(category, subcategory, page_1_result)

        try:
            total_page_number = int(page_1_result.get("totalPage") or 1)
        except (TypeError, ValueError):
            self.logger.error(f"Invalid totalPage for productID {productID}; keeping page 1 only")
            return

        for page in range(2, total_page_number + 1):
            try:
                results = self._fetch_reviews_page(productID, page)
            except (requests.exceptions.RequestException, ValueError) as e:
                # A bad page is skipped so the remaining pages are still fetched.
                self.logger.error(
                    f"Error fetching product reviews on page{page} for productID {productID}: {e}"
                )
                continue
            finally:
                # Random pause between requests, whether or not this one worked.
                time.sleep(random.randint(1, 7))
            self.write_reviews_to_file(category, subcategory, results)

    def extract_product_reviews(self, reviews_file_csv):
        """Extract the reviews of every pending product listed in a CSV file.

        Args:
            reviews_file_csv: Path of the CSV passed to
                ``fetch_product_ids_from_csv``.

        A failure for one product is logged and the next product is processed;
        nothing is raised. If the CSV cannot be read, no products are processed.
        """
        # fetch_product_ids_from_csv returns None on failure; treat it as empty.
        products = self.fetch_product_ids_from_csv(reviews_file_csv) or []

        for value in products:
            try:
                productID = value["productID"]
                category = value["category"]
                subcategory = value["subcategory"]
            except KeyError as e:
                self.logger.error(f"Skipping CSV row with missing column {e}")
                continue

            try:
                self._scrape_product(productID, category, subcategory)
            except requests.exceptions.RequestException as e:
                self.logger.error(f"Error fetching reviews for product id: {productID}- {e}")
                continue
            except Exception as e:
                # Best-effort boundary: one bad product must not stop the batch.
                self.logger.error(f"Error fetching reviews for product id: {productID}- {e}")
                continue
        
if __name__ == "__main__":
    # Load the .env file first: the scraper's constructor reads its request
    # headers from environment variables.
    load_dotenv()
    scraper = Aliexpress_reviews_scraper()
    # NOTE(review): this value looks like a product ID, but
    # extract_product_reviews forwards its argument to
    # fetch_product_ids_from_csv as a CSV path - confirm the intended input.
    reviews_source = "1005001772520001"
    scraper.extract_product_reviews(reviews_source)
