In [None]:
import pandas as pd
import json
import os
import sys
import logging
from utils import setup_logging
import re
import glob

In [None]:
# create category mapping
def create_category_mapping():
    """Load the newest AliExpress categories dump found under ``rawdata/``.

    Candidate files are named ``aliexpress-categories-*<timestamp>.json``
    where ``<timestamp>`` looks like ``YYYY-MM-DD_HH-MM-SS``.  ``glob`` only
    understands shell wildcards, so the timestamp is checked with a regular
    expression after globbing rather than being embedded in the glob pattern.

    Returns:
        The parsed JSON content of the file with the latest timestamp, or
        ``None`` (after logging an error) when no matching file exists.

    NOTE(review): building the actual category mapping from the loaded JSON
    is not implemented yet; this function currently only loads the data.
    """
    # read categories json file
    timestamp = re.compile(r"(\d{4}-\d{2}-\d{2}_\d{2}-\d{2}-\d{2})\.json$")
    candidates = glob.glob(os.path.join("rawdata", "aliexpress-categories-*.json"))

    stamped = []
    for candidate in candidates:
        found = timestamp.search(os.path.basename(candidate))
        if found:
            stamped.append((found.group(1), candidate))

    if not stamped:
        logging.getLogger(__name__).error(
            "No aliexpress-categories JSON file found in rawdata"
        )
        return None

    # The timestamp format sorts lexicographically in chronological order,
    # so max() picks the most recent dump.
    _, latest = max(stamped)

    with open(latest, "r", encoding="utf-8") as f:
        return json.load(f)


    

In [None]:
# extract product details

class TransformAliexpressProductsDetails:
    """Transform a saved AliExpress product-listing JSON file into CSV tables.

    Intended flow: ``read_json_file`` -> ``extract_product_details`` ->
    ``create_df`` -> ``clean_data`` -> ``save_df_to_csv``.
    """

    # Names of the nine values in each tuple built by
    # extract_product_details, in tuple order.
    PRODUCT_COLUMNS = [
        "productID",
        "display_title",
        "product_url",
        "image_url",
        "store_name",
        "store_url",
        "original_price",
        "sale_price",
        "discount",
    ]

    def __init__(self, path):
        """Remember the JSON file path and obtain a module logger.

        Logging configuration is left to the application entry point: the
        class has no ``setup_logging`` method, so calling one here raised
        ``AttributeError`` on construction.
        """
        self.path = path
        self.logger = logging.getLogger(__name__)
        self.logger.info("TransformAliexpressProductsDetails initialised")

    def read_json_file(self):
        """Return the parsed JSON at ``self.path``, or ``None`` on failure.

        The file is read as UTF-8 so non-ASCII text does not depend on the
        platform's default encoding.
        """
        try:
            with open(self.path, "r", encoding="utf-8") as file:
                return json.load(file)
        except (OSError, ValueError, TypeError) as e:
            self.logger.error(f"Error reading JSON file: {e}")
            return None

    @staticmethod
    def _price_field(prices, price_key, field_names, default="0"):
        """Return the first of ``field_names`` present under ``prices[price_key]``."""
        entry = prices.get(price_key) or {}
        for name in field_names:
            if name in entry:
                return entry[name]
        return default

    def extract_product_details(self, data):
        """Extract product details from the JSON data.

        Args:
            data: Parsed JSON whose product entries live at
                ``data["data"]["result"]["mods"]["itemList"]["content"]``.

        Returns:
            A list of 9-tuples ordered as ``PRODUCT_COLUMNS``, or ``None``
            (after logging) when the expected structure is missing.  Missing
            nested sections (title, image, store, prices) yield ``None`` /
            ``"0"`` for the affected fields instead of aborting the whole run.
        """
        try:
            contents = data["data"]["result"]["mods"]["itemList"]["content"]
            products = []

            for content in contents:
                # ``or {}`` guards against an absent or null nested section.
                title = content.get("title") or {}
                image = content.get("image") or {}
                store = content.get("store") or {}
                prices = content.get("prices") or {}

                # NOTE(review): the original code looked up the one-t
                # spelling "formatedPrice" for originalPrice only; both
                # spellings are accepted here -- confirm against a real payload.
                original_price = self._price_field(
                    prices, "originalPrice", ("formattedPrice", "formatedPrice")
                )
                sale_price = self._price_field(prices, "salePrice", ("formattedPrice",))
                discount = self._price_field(prices, "salePrice", ("discount",))

                products.append(
                    (
                        content.get("productId"),
                        title.get("displayTitle"),
                        content.get("productDetailUrl"),
                        image.get("imgUrl"),
                        store.get("storeName"),
                        store.get("storeUrl"),
                        original_price,
                        sale_price,
                        discount,
                    )
                )

            return products

        except (KeyError, TypeError, AttributeError) as e:
            self.logger.error(f"Error extracting product details from Aliexpress: {e}")
            return None

    def create_df(self, products, categoryID=None, subcategoryID=None):
        """Create the product DataFrame and the product-ID tracking DataFrame.

        Args:
            products: List of 9-tuples from ``extract_product_details``
                (``None`` or empty produces empty frames).
            categoryID: Value stored in every row's ``categoryID`` column.
            subcategoryID: Value stored in every row's ``subcategoryID`` column.

        Returns:
            ``(df, productIDs_df)`` where ``df`` has the two category columns
            followed by ``PRODUCT_COLUMNS``, and ``productIDs_df`` has
            ``categoryID``, ``subcategoryID``, ``productID`` and a
            ``reviews_retrieved`` flag initialised to ``False``.
        """
        # The tuples carry only the nine product fields, so build the frame
        # from those and add the category columns afterwards.
        df = pd.DataFrame(products, columns=self.PRODUCT_COLUMNS)
        df.insert(0, "categoryID", categoryID)
        df.insert(1, "subcategoryID", subcategoryID)

        # .copy() gives an independent frame so adding a column does not
        # trigger pandas' SettingWithCopyWarning.
        productIDs_df = df[["categoryID", "subcategoryID", "productID"]].copy()
        productIDs_df["reviews_retrieved"] = False

        return df, productIDs_df

    def clean_data(self, df, productIDs_df):
        """Clean the product DataFrame.

        Both frames are modified in place: duplicate ``productID`` rows are
        dropped, and in ``df`` the ``£`` sign and thousands separators are
        stripped from the price columns before the price and discount columns
        are converted to numbers (unparseable values become 0).

        Returns:
            ``(df, productIDs_df)``, or ``None`` (after logging) on failure.
        """
        try:
            # remove duplicates
            df.drop_duplicates(subset="productID", inplace=True)
            productIDs_df.drop_duplicates(subset="productID", inplace=True)

            # remove £ and thousands separators from original_price and sale_price
            for col in ("original_price", "sale_price"):
                df[col] = (
                    df[col]
                    .astype(str)
                    .str.replace("£", "", regex=False)
                    .str.replace(",", "", regex=False)
                )

            # convert prices to numeric
            for col in ("original_price", "sale_price", "discount"):
                df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)

            return df, productIDs_df
        except Exception as e:
            # pandas can raise many exception types here; log and signal failure.
            self.logger.error(f"Error cleaning data: {e}")
            return None

    def save_df_to_csv(self, df, file_name, productIDs_df=None):
        """Save the DataFrame to a CSV file.

        Args:
            df: Frame written to ``file_name``.
            file_name: Destination path for ``df``.
            productIDs_df: Optional frame additionally written to
                ``productIDs.csv``.  It is an explicit argument because the
                method previously read an undefined global of this name.
        """
        try:
            df.to_csv(file_name, index=False)
            if productIDs_df is not None:
                productIDs_df.to_csv("productIDs.csv", index=False)
            self.logger.info(f"Data saved to {file_name}")
        except Exception as e:
            self.logger.error(f"Error saving DataFrame to CSV: {e}")


# Script entry point: read the product-listing JSON, extract and clean the
# product rows, then write the product table and the product-ID table to CSV.
if __name__ == "__main__":
    setup_logging()
    aliexpress_products_details = TransformAliexpressProductsDetails("aliexpress_products_details.json")
    data = aliexpress_products_details.read_json_file()

    if data:
        # The extraction method defined on the class is
        # extract_product_details; it returns None when the JSON is malformed.
        products = aliexpress_products_details.extract_product_details(data)

        if products:
            # NOTE(review): category/subcategory IDs are not available in this
            # script yet, so None is passed explicitly -- wire in the real IDs
            # once the category mapping exists.
            df, productIDs_df = aliexpress_products_details.create_df(products, None, None)

            # clean_data returns None on failure, so check before unpacking.
            cleaned = aliexpress_products_details.clean_data(df, productIDs_df)
            if cleaned is not None:
                df, productIDs_df = cleaned
                # Each table goes to its own file so the second write cannot
                # overwrite the first.
                aliexpress_products_details.save_df_to_csv(df, "aliexpress_products_details.csv")
                aliexpress_products_details.save_df_to_csv(productIDs_df, "productIDs.csv")
    

In [None]:
class TransformAliexpressProductReviews:
    """Transform a saved AliExpress product-review JSON file into CSV tables.

    The extraction helpers are static methods, so they can be called either
    on the class or on an instance; each takes the parsed JSON and returns a
    list of plain tuples.  ``create_reviews_df`` turns those lists into
    DataFrames, ``clean_data`` tidies them and ``save_df_to_csv`` writes them.
    """

    # (output column, JSON key, default) for each review field, in output order.
    _REVIEW_FIELDS = (
        ("evaluationId", "evaluationIdStr", None),
        ("is_aigc", "aigc", False),
        ("is_anonymous", "anonymous", False),
        ("skuInfo", "skuInfo", None),
        ("buyerCountry", "buyerCountry", "Unknown"),
        ("buyerEval", "buyerEval", 0),
        ("evalDate", "evalDate", None),
        ("buyerFeedback", "buyerFeedback", None),
        ("buyerTranslationFeedback", "buyerTranslationFeedback", None),
        ("buyerGender", "buyerGender", None),
        ("reviewLabel1", "reviewLabel1", None),
        ("reviewLabel2", "reviewLabel2", None),
        ("reviewLabel3", "reviewLabel3", None),
        ("reviewLabelValue1", "reviewLabelValue1", None),
        ("reviewLabelValue2", "reviewLabelValue2", None),
        ("reviewLabelValue3", "reviewLabelValue3", None),
        ("downVoteCount", "downVoteCount", 0),
        ("upVoteCount", "upVoteCount", 0),
        ("logistics", "logistics", None),
    )

    # (output column, JSON key, default) for each impression field.
    _IMPRESSION_FIELDS = (
        ("evaluationID", "id", None),
        ("productID", "productId", None),
        ("content", "content", None),
        ("emotion", "emotion", 0),
        ("num_of_impressions", "num", 0),
    )

    # JSON keys of the product statistic; also used as the output column
    # names.  The "evarage..." spellings are the keys the code reads.
    _PRODUCT_STAT_FIELDS = (
        "evarageStar", "evarageStarRage", "fiveStarNum", "fiveStarRate",
        "fourStarNum", "fourStarRate", "negativeNum", "negativeRate",
        "neutralNum", "neutralRate", "oneStarNum", "oneStarRate",
        "positiveNum", "positiveRate", "threeStarNum", "threeStarRate",
        "twoStarNum", "twoStarRate", "totalNum",
    )

    # Compiled once for the class instead of on every remove_emojis call.
    _EMOJI_PATTERN = re.compile("["
                                u"\U0001F600-\U0001F64F"  # emoticons
                                u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                                u"\U0001F680-\U0001F6FF"  # transport & map symbols
                                u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                                "]+", flags=re.UNICODE)

    def __init__(self, path):
        """Remember the JSON file path and obtain a module logger.

        Logging configuration is left to the application entry point: the
        class has no ``setup_logging`` method, so calling one here raised
        ``AttributeError`` on construction.
        """
        self.path = path
        self.logger = logging.getLogger(__name__)
        self.logger.info("TransformAliexpressProductReviews initialised")

    def read_json_file(self):
        """Return the parsed JSON at ``self.path``, or ``None`` on failure."""
        try:
            with open(self.path, "r", encoding="utf-8") as file:
                return json.load(file)
        except (OSError, ValueError, TypeError) as e:
            self.logger.error(f"Error reading JSON file: {e}")
            return None

    @staticmethod
    def _data_section(json_file, key, default):
        """Return ``json_file["data"][key]``, or ``default`` when absent or empty."""
        payload = (json_file or {}).get("data") or {}
        return payload.get(key) or default

    @staticmethod
    def transform_products_reviews(json_file):
        """Return one tuple per entry of ``json_file["data"]["evaViewList"]``.

        Tuple fields follow ``_REVIEW_FIELDS``; a missing key takes that
        field's default.  A missing list yields ``[]``.
        """
        fields = TransformAliexpressProductReviews._REVIEW_FIELDS
        entries = TransformAliexpressProductReviews._data_section(json_file, "evaViewList", [])
        return [
            tuple(review.get(key, default) for _, key, default in fields)
            for review in entries
        ]

    # Fetch impressions
    @staticmethod
    def transform_impressions(json_file):
        """Return one tuple per entry of ``json_file["data"]["impressionDTOList"]``.

        Each tuple is ``(evaluationID, productID, content, emotion,
        num_of_impressions)``, read from the keys ``id``, ``productId``,
        ``content``, ``emotion`` and ``num``.
        """
        fields = TransformAliexpressProductReviews._IMPRESSION_FIELDS
        entries = TransformAliexpressProductReviews._data_section(json_file, "impressionDTOList", [])
        return [
            tuple(impression.get(key, default) for _, key, default in fields)
            for impression in entries
        ]

    # get get_product_stat
    @staticmethod
    def get_product_stat(json_file):
        """Return statistic tuples from ``json_file["data"]["productEvaluationStatistic"]``.

        The section may be a list of objects or a single object; a single
        object is wrapped in a list, because iterating a dict directly yields
        its keys (strings), on which ``.get`` fails.
        """
        fields = TransformAliexpressProductReviews._PRODUCT_STAT_FIELDS
        stats = TransformAliexpressProductReviews._data_section(json_file, "productEvaluationStatistic", [])
        if isinstance(stats, dict):
            stats = [stats]
        return [tuple(stat.get(key) for key in fields) for stat in stats]

    # get helpful reviews
    @staticmethod
    def get_helpful_reviews(json_file):
        """Return ``(evaluationID, showButton, useful, useless)`` tuples.

        The parsed JSON is now passed in explicitly; the method previously
        took no arguments and read a global variable named ``data``.

        NOTE(review): the original lookup was ``data["helpful"]`` while every
        sibling reads below ``json_file["data"]``; the top-level key is tried
        first and ``json_file["data"]["helpful"]`` second -- confirm the real
        location against a payload.
        """
        payload = json_file or {}
        helpful_reviews = payload.get("helpful")
        if helpful_reviews is None:
            helpful_reviews = (payload.get("data") or {}).get("helpful")

        return [
            (evaluation_id, val.get("showButton"), val.get("useful"), val.get("useless"))
            for evaluation_id, val in (helpful_reviews or {}).items()
        ]

    # get reviewStructuredLabelDTOList
    @staticmethod
    def get_reviewStructuredLabelDTOList(productIDs):
        """Flatten ``productIDs["data"]["reviewStructuredLabelDTOList"]``.

        Args:
            productIDs: The parsed review JSON (the parameter name is kept
                for compatibility with existing callers).

        Returns:
            ``(counts_list, category_reviewList, labels_list)`` holding
            ``(count, labelID, valueID)``,
            ``(categoryID, labelID, labelName)`` and
            ``(labelID, countPercentage, displayOption, valueID,
            labelValueName, score)`` tuples respectively.
        """
        labelsDTO = TransformAliexpressProductReviews._data_section(
            productIDs, "reviewStructuredLabelDTOList", []
        )

        counts_list = []
        category_reviewList = []
        labels_list = []

        for entry in labelsDTO:
            counts_list.append(
                (entry.get("count", 0), entry.get("labelId", None), entry.get("valueID", None))
            )

            for category in entry.get("categoryReviewLabelDTOList") or []:
                category_label_id = category.get("labelId", None)
                category_reviewList.append(
                    (category.get("categoryId", None), category_label_id, category.get("labelName", None))
                )

                for option in category.get("labelValueOptions") or []:
                    # Each option row is keyed by the enclosing category's labelId.
                    labels_list.append(
                        (
                            category_label_id,
                            option.get("countPercentage", 0),
                            option.get("displayOption", None),
                            option.get("labelValueId", None),
                            option.get("labelValueName", None),
                            option.get("score", 0),
                        )
                    )

        return counts_list, category_reviewList, labels_list

    def create_reviews_df(self, reviews_list, impressions_list, product_stats_list, helpful_reviews_list, counts_list, category_reviewList, labels_list):
        """Build one DataFrame per tuple list produced by the extraction methods.

        Returns:
            ``(reviews, impressions, product_stats, helpful_reviews, counts,
            category_reviewList, labels)``.
        """
        reviews_columns = [column for column, _, _ in self._REVIEW_FIELDS]
        reviews = pd.DataFrame(reviews_list, columns=reviews_columns)

        impressions_columns = [column for column, _, _ in self._IMPRESSION_FIELDS]
        impressions = pd.DataFrame(impressions_list, columns=impressions_columns)

        product_stats = pd.DataFrame(product_stats_list, columns=list(self._PRODUCT_STAT_FIELDS))

        helpful_reviews_columns = ["evaluationID", "showButton", "useful", "useless"]
        helpful_reviews = pd.DataFrame(helpful_reviews_list, columns=helpful_reviews_columns)

        counts_list_columns = ["count", "labelID", "valueID"]
        counts = pd.DataFrame(counts_list, columns=counts_list_columns)

        category_reviewList_columns = ["categoryID", "labelID", "labelName"]
        category_reviewList = pd.DataFrame(category_reviewList, columns=category_reviewList_columns)

        labels_list_columns = ["labelID", "countPercentage", "displayOption", "valueID", "labelValueName", "score"]
        labels = pd.DataFrame(labels_list, columns=labels_list_columns)

        return reviews, impressions, product_stats, helpful_reviews, counts, category_reviewList, labels

    def remove_emojis(self, col):
        """Return ``col`` with emoji characters stripped from its string values.

        Non-string values (e.g. ``None``) are passed through unchanged.
        """
        return col.map(lambda x: self._EMOJI_PATTERN.sub("", x) if isinstance(x, str) else x)

    def clean_data(self, reviews, impressions, product_stats, helpful_reviews, counts, category_reviewList, labels):
        """Clean the review tables in place and return them.

        Steps: drop duplicate rows, strip emojis from the feedback columns,
        replace ``skuInfo`` with ``colour`` / ``size`` columns parsed from it,
        and convert ``evalDate`` to datetime.

        Returns:
            The seven frames in the order they were passed, or ``None``
            (after logging) on failure.
        """
        try:
            # remove duplicates
            reviews.drop_duplicates(subset="evaluationId", inplace=True)
            impressions.drop_duplicates(subset="evaluationID", inplace=True)
            product_stats.drop_duplicates(inplace=True)
            helpful_reviews.drop_duplicates(subset="evaluationID", inplace=True)
            # A label can appear with several values/categories; deduplicating
            # on labelID alone kept only the first row per label and silently
            # dropped the rest.
            counts.drop_duplicates(subset=["labelID", "valueID"], inplace=True)
            category_reviewList.drop_duplicates(subset=["categoryID", "labelID"], inplace=True)
            labels.drop_duplicates(subset=["labelID", "valueID"], inplace=True)

            # remove emojis
            reviews["buyerFeedback"] = self.remove_emojis(reviews["buyerFeedback"])
            reviews["buyerTranslationFeedback"] = self.remove_emojis(reviews["buyerTranslationFeedback"])

            # extract colour and size from skuInfo
            colour = reviews["skuInfo"].str.extract(r"Color:(\S+)", expand=False)
            size = reviews["skuInfo"].str.extract(r"Size:(\S+)", expand=False)
            # DataFrame.insert works in place and returns None, so its result
            # must not be assigned back (that overwrote the new column).
            reviews.insert(3, "colour", colour)
            reviews.insert(4, "size", size)
            reviews.drop(columns=["skuInfo"], inplace=True)

            # convert evalDate to datetime
            reviews["evalDate"] = pd.to_datetime(reviews["evalDate"])

            return reviews, impressions, product_stats, helpful_reviews, counts, category_reviewList, labels
        except Exception as e:
            # pandas can raise many exception types here; log and signal failure.
            self.logger.error(f"Error cleaning data: {e}")
            return None

    def save_df_to_csv(self, reviews, impressions, product_stats, helpful_reviews, counts, category_reviewList, labels, file_name):
        """Save each DataFrame to its own CSV file.

        ``reviews`` is written to ``file_name``; every other table is written
        next to it as ``<stem>_<table><ext>`` (for example
        ``x_impressions.csv`` for ``x.csv``).  Writing all seven frames to
        ``file_name`` itself left only the last one on disk.
        """
        stem, ext = os.path.splitext(file_name)
        tables = {
            "reviews": reviews,
            "impressions": impressions,
            "product_stats": product_stats,
            "helpful_reviews": helpful_reviews,
            "counts": counts,
            "category_reviewList": category_reviewList,
            "labels": labels,
        }
        try:
            for table_name, frame in tables.items():
                target = file_name if table_name == "reviews" else f"{stem}_{table_name}{ext}"
                frame.to_csv(target, index=False)
                self.logger.info(f"Data saved to {target}")
        except Exception as e:
            self.logger.error(f"Error saving DataFrame to CSV: {e}")


# Script entry point: read the review JSON, flatten each section into tuple
# lists, build and clean the DataFrames, then write them to CSV.
if __name__ == "__main__":
    setup_logging()
    aliexpress_product_reviews = TransformAliexpressProductReviews("aliexpress_product_reviews.json")
    data = aliexpress_product_reviews.read_json_file()

    # read_json_file returns None when the file cannot be read or parsed.
    if data:
        reviews_list = aliexpress_product_reviews.transform_products_reviews(data)
        impressions_list = aliexpress_product_reviews.transform_impressions(data)
        product_stats_list = aliexpress_product_reviews.get_product_stat(data)
        helpful_reviews_list = aliexpress_product_reviews.get_helpful_reviews(data)
        counts_list, category_reviewList, labels_list = aliexpress_product_reviews.get_reviewStructuredLabelDTOList(data)

        frames = aliexpress_product_reviews.create_reviews_df(reviews_list, impressions_list, product_stats_list, helpful_reviews_list, counts_list, category_reviewList, labels_list)

        # clean_data returns None on failure, so check before unpacking.
        cleaned = aliexpress_product_reviews.clean_data(*frames)
        if cleaned is not None:
            reviews, impressions, product_stats, helpful_reviews, counts, category_reviewList, labels = cleaned
            aliexpress_product_reviews.save_df_to_csv(reviews, impressions, product_stats, helpful_reviews, counts, category_reviewList, labels, "aliexpress_product_reviews.csv")
