# Reviews Data Collection and Processing
### Objectives :
- Scrape "Backmarket" customers reviews from Trustpilot.
- Clean review data collected
- Processed reviews data collected


### Required packages

In [22]:
import os
import json
import requests
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
import time

import logging
import shutil

import re
from typing import List, Optional
import emoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords


### Global Logging Configuration
- All log messages (INFO, ERROR, etc.) will be written to the specified log file (concatenation.log).
- If StreamHandler is included, logs will also appear in the console.

In [2]:
# Ensure the logs directory exists
log_dir = "../logs_etl"
os.makedirs(log_dir, exist_ok=True)  # Create the directory if it doesn't exist
log_file = os.path.join(log_dir, "concatenation.log")

# Configure logging only if it hasn't been configured yet
if not logging.root.handlers:
    logging.basicConfig(
        level=logging.INFO,  # Fixed logging level
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[
            logging.FileHandler(log_file),  # Write logs to a file
            logging.StreamHandler()         # Optional: Keep console output
        ]
    )


### Data collection
Target company for review analysis : Back Market - A global marketplace for refurbished devices. 

Review data will be collected from Trustpilot, a platform for collecting verified customer reviews


#### Constants and Parameters

In [3]:
# Constants and parameters
COMPANY_NAME = "backmarket" # Backmarket
BASE_URL = f'https://fr.trustpilot.com/review/www.{COMPANY_NAME}.fr'
MAX_PAGES = 650
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

#### Function to scrape reviews from trustpilot
This Python function, extract_reviews, is designed to scrape and collect customer reviews from a paginated website ([trustpilot platform](https://fr.trustpilot.com/)) for a specific company. 
Here's a concise description of its functionality:

1. **Purpose** :
The function extracts structured review data (e.g., review ID, title, text, rating, reply, dates) from multiple pages of a website and saves the collected data into a CSV file.

2. **Key Steps** :
    - Input Parameters :
        - company_name: Name of the company (default is a predefined constant).
        - start_page: The page number to start scraping from (default is 1).
        - end_page: The last page to scrape (default is a predefined constant).
    - HTTP Requests :
        Sends GET requests to the website using requests with custom headers and handles potential HTTP errors.
    - HTML Parsing :
        Uses BeautifulSoup to parse the HTML content and extract embedded JSON data containing reviews from a script tag.
    - Data Extraction :
        Iterates through the JSON data, extracting specific fields (e.g., review text, rating, dates) and storing them in a list of dictionaries.
    -  Error Handling :
        Includes error handling for network issues, missing data, and parsing errors.
    - Rate Limiting :
        Implements a delay (time.sleep) after processing every 10 pages to avoid overloading the server.
    - Output :
    Converts the collected data into a Pandas DataFrame and saves it as a CSV file in a specified directory.
3. **Output** :
    - Returns the file path of the saved CSV file containing the extracted reviews.

In [4]:
def extract_reviews(company_name=COMPANY_NAME, start_page=1, end_page=2):
    """
    Scrapes customer reviews from a paginated website for a specific company. 
    Extracts key review details such as text, rating, and dates into a structured format. 
    Saves the collected data as a CSV file for further analysis.
    """
    headers = HEADERS
    base_url = BASE_URL
    keys = ["id", "title", "review", "rating", "reply", "experienceDate", "createdDateTime", "publishedDate"]
    
    reviews_list = []  # List to collect all review data
    for page in range(start_page, end_page + 1):
        logging.info(f"Processing page {page}")
        
        url_page = f"{base_url}?page={page}"
        
        try:
            response = requests.get(url_page, headers=headers, timeout=5)
            response.raise_for_status()  # Verify HTTP errors
        except requests.RequestException as e:
            logging.error(f"Error accessing page {page}: {e}")
            continue

        try:
            soup = BeautifulSoup(response.content, 'html.parser')
            script_content = soup.body.script.contents if soup.body and soup.body.script else None
            
            if not script_content:
                logging.warning(f"No data found in page {page}")
                continue

            raw_data = json.loads(script_content[0])
            raw_data = raw_data.get("props", {}).get("pageProps", {}).get("reviews", [])
            
            for review in raw_data:
                tmp = {}
                tmp["id"] = review.get("id")
                tmp["title"] = review.get("title")
                tmp["review"] = review.get("text")
                tmp["rating"] = review.get("rating")
                try:
                    tmp["reply"] = review.get("reply", {}).get("message")
                except:
                    tmp["reply"] = None
                
                tmp["experienceDate"] = review.get("dates", {}).get("experiencedDate")
                tmp["createdDateTime"] = review.get("labels", {}).get("verification", {}).get("createdDateTime")
                tmp["publishedDate"] = review.get("dates", {}).get("publishedDate")

                reviews_list.append({key: tmp.get(key) for key in keys})
            
        except Exception as e:
            logging.error(f"Error processing page {page}: {e}")
            continue
        
        # Avoid hitting the server too frequently
        if page > 10 and page % 10 == 0:
            logging.info("Sleeping for 100 seconds to avoid overloading the server.")
            time.sleep(100)

    if not reviews_list:
        logging.warning("No reviews collected.")
        return
    
    # Convert list of dicts to DataFrame and save to CSV
    df_raw_reviews = pd.DataFrame(reviews_list)

    # Save reviews data into a CSV file
    output_dir = "../data/raw"
    os.makedirs(output_dir, exist_ok=True)
    raw_file_path = os.path.join(output_dir, f"raw_reviews_{start_page}-{end_page}.csv")
    df_raw_reviews.to_csv(raw_file_path, index=False)
    logging.info(f"Saved reviews data to {raw_file_path}")
    return raw_file_path

In [37]:
# Test
extract_reviews(company_name=COMPANY_NAME, start_page=1, end_page=2)

2025-03-26 18:47:34,027 - INFO - Processing page 1
2025-03-26 18:47:35,094 - INFO - Processing page 2
2025-03-26 18:47:36,139 - INFO - Saved reviews data to ../data/raw/raw_reviews_1-2.csv


'../data/raw/raw_reviews_1-2.csv'

#### Initial data collection
There are over 3,000 review pages for Back Market, making it time-consuming to scrape all the data. To avoid overloading the server, we ran the extraction function in batches of 25 pages, incorporating significant delays between batches. The initial scraped data has been saved in the "data/raw" folder.

In [38]:
# Concatenate all extracted reviews into one csv file
def concatenate_reviews(input_dir="../data/raw", \
                        output_file="raw_reviews_0.csv", \
                        file_prefix="raw_reviews_"):
    """
    Concatenates multiple review CSV files into a single DataFrame,
    removes duplicates and missing values, 
    saves the cleaned data to a new CSV file,
    and moves processed files to an '.archive' folder.
    """
    # Validate input directory
    if not os.path.exists(input_dir):
        logging.error(f"Input directory '{input_dir}' does not exist.")
        return None

    # List files matching the prefix
    files = [f for f in os.listdir(input_dir) if f.startswith(file_prefix)]
    if not files:
        logging.error(f"No files found with prefix '{file_prefix}' in '{input_dir}'.")
        return None

    logging.info(f"Found {len(files)} files to process.")

    # Create .archive folder if it doesn't exist
    archive_dir = os.path.join(input_dir, ".archive")
    os.makedirs(archive_dir, exist_ok=True)
    logging.info(f"Created/verified '.archive' folder at: {archive_dir}")

    # Read and concatenate files efficiently
    try:
        df_list = []
        for f in files:
            file_path = os.path.join(input_dir, f)
            logging.info(f"Reading file: {file_path}")
            df_list.append(pd.read_csv(file_path))
        
        df = pd.concat(df_list, ignore_index=True)
        logging.info("Concatenated all files into a single DataFrame.")
    except Exception as e:
        logging.error(f"Error reading or concatenating files: {e}")
        return None

    # Log initial state
    logging.info(f"Initial DataFrame info:\n{df.info()}")

    # Remove duplicates
    df.drop_duplicates(subset=["id"], inplace=True)
    logging.info(f"Removed duplicates. New DataFrame info:\n{df.info()}")

    # Drop rows with missing values in critical columns
    critical_columns = ["id", "review", "rating", "experienceDate"]
    df.dropna(subset=critical_columns, inplace=True)
    logging.info(f"Removed rows with missing values. Final DataFrame info:\n{df.info()}")

    # Save cleaned DataFrame to CSV
    try:
        output_path = os.path.join(input_dir, output_file)
        df.to_csv(output_path, index=False)
        logging.info(f"Saved cleaned DataFrame to: {output_path}")
    except Exception as e:
        logging.error(f"Error saving cleaned DataFrame: {e}")
        return None

    # Move processed files to .archive folder
    try:
        for f in files:
            src_path = os.path.join(input_dir, f)
            dst_path = os.path.join(archive_dir, f)
            shutil.move(src_path, dst_path)
            logging.info(f"Moved file to archive: {dst_path}")
    except Exception as e:
        logging.error(f"Error moving files to archive: {e}")
        return None

    return df



In [40]:
# Concatenate all extracted reviews into one csv file
df = concatenate_reviews()
df.info()
df.head()

2025-03-26 18:51:53,962 - INFO - Found 102 files to process.
2025-03-26 18:51:53,963 - INFO - Created/verified '.archive' folder at: ../data/raw/.archive
2025-03-26 18:51:53,965 - INFO - Reading file: ../data/raw/raw_reviews_850-875.csv
2025-03-26 18:51:53,973 - INFO - Reading file: ../data/raw/raw_reviews_2025-2050.csv
2025-03-26 18:51:53,978 - INFO - Reading file: ../data/raw/raw_reviews_2000-2025.csv
2025-03-26 18:51:53,981 - INFO - Reading file: ../data/raw/raw_reviews_2700-2725.csv
2025-03-26 18:51:53,987 - INFO - Reading file: ../data/raw/raw_reviews_2725-2750.csv
2025-03-26 18:51:53,991 - INFO - Reading file: ../data/raw/raw_reviews_2900-2925.csv
2025-03-26 18:51:53,996 - INFO - Reading file: ../data/raw/raw_reviews_950-975.csv
2025-03-26 18:51:53,998 - INFO - Reading file: ../data/raw/raw_reviews_2075-2100.csv
2025-03-26 18:51:54,002 - INFO - Reading file: ../data/raw/raw_reviews_2250-2275.csv
2025-03-26 18:51:54,005 - INFO - Reading file: ../data/raw/raw_reviews_2925-2950.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52960 entries, 0 to 52959
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               52960 non-null  object
 1   title            52960 non-null  object
 2   review           52960 non-null  object
 3   rating           52960 non-null  int64 
 4   reply            8311 non-null   object
 5   experienceDate   52467 non-null  object
 6   createdDateTime  52960 non-null  object
 7   publishedDate    52960 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.2+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 50972 entries, 0 to 52939
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               50972 non-null  object
 1   title            50972 non-null  object
 2   review           50972 non-null  object
 3   rating           50972 non-null  int64 
 4   reply         

2025-03-26 18:51:54,515 - INFO - Saved cleaned DataFrame to: ../data/raw/raw_reviews_0.csv
2025-03-26 18:51:54,516 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_850-875.csv
2025-03-26 18:51:54,516 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_2025-2050.csv
2025-03-26 18:51:54,516 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_2000-2025.csv
2025-03-26 18:51:54,516 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_2700-2725.csv
2025-03-26 18:51:54,517 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_2725-2750.csv
2025-03-26 18:51:54,517 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_2900-2925.csv
2025-03-26 18:51:54,517 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_950-975.csv
2025-03-26 18:51:54,517 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_2075-2100.csv
2025-03-26 18:51:54,518 - INFO - Moved file to archive: ../data/raw/.archive/raw_reviews_

<class 'pandas.core.frame.DataFrame'>
Index: 50498 entries, 0 to 52939
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               50498 non-null  object
 1   title            50498 non-null  object
 2   review           50498 non-null  object
 3   rating           50498 non-null  int64 
 4   reply            7838 non-null   object
 5   experienceDate   50498 non-null  object
 6   createdDateTime  50498 non-null  object
 7   publishedDate    50498 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.5+ MB


Unnamed: 0,id,title,review,rating,reply,experienceDate,createdDateTime,publishedDate
0,640053299b64b1bdaf6661e9,Satisfait de mon choix avec backmarket,Satisfait de mon choix avec backmarket,5,,2023-02-28T23:00:00.000Z,2023-03-02T09:41:29.000Z,2023-03-02T09:41:29.000Z
1,640046cda2e3a177e9e8b614,"A fuir, aucun service client","A éviter ! Pas reçu le bon article, je demande...",1,,2023-03-01T00:00:00.000Z,2023-03-02T08:48:45.000Z,2023-03-02T08:48:45.000Z
2,6400387bd143b326fe43200d,"Parfait,rapide,bien emballé","Parfait,rapide,bien emballé",5,,2023-02-28T23:00:00.000Z,2023-03-02T07:47:39.000Z,2023-03-02T07:47:39.000Z
3,640034a7d143b326fe431f53,Pentax,Livraison très rapide,5,,2023-02-28T23:00:00.000Z,2023-03-02T07:31:19.000Z,2023-03-02T07:31:19.000Z
4,640030139b64b1bdaf665737,Mécontent car produit endommagéBien que le col...,Bien que le colis soit arrivé tôt j'ai globale...,1,,2023-02-27T23:00:00.000Z,2023-03-02T07:11:47.000Z,2023-03-02T07:11:47.000Z


In [10]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51018 entries, 0 to 51017
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               51018 non-null  object
 1   title            51018 non-null  object
 2   review           51018 non-null  object
 3   rating           51018 non-null  int64 
 4   reply            7838 non-null   object
 5   experienceDate   51018 non-null  object
 6   createdDateTime  51018 non-null  object
 7   publishedDate    51018 non-null  object
dtypes: int64(1), object(7)
memory usage: 3.1+ MB


Unnamed: 0,id,title,review,rating,reply,experienceDate,createdDateTime,publishedDate
0,640053299b64b1bdaf6661e9,Satisfait de mon choix avec backmarket,Satisfait de mon choix avec backmarket,5,,2023-02-28T23:00:00.000Z,2023-03-02T09:41:29.000Z,2023-03-02T09:41:29.000Z
1,640046cda2e3a177e9e8b614,"A fuir, aucun service client","A éviter ! Pas reçu le bon article, je demande...",1,,2023-03-01T00:00:00.000Z,2023-03-02T08:48:45.000Z,2023-03-02T08:48:45.000Z
2,6400387bd143b326fe43200d,"Parfait,rapide,bien emballé","Parfait,rapide,bien emballé",5,,2023-02-28T23:00:00.000Z,2023-03-02T07:47:39.000Z,2023-03-02T07:47:39.000Z
3,640034a7d143b326fe431f53,Pentax,Livraison très rapide,5,,2023-02-28T23:00:00.000Z,2023-03-02T07:31:19.000Z,2023-03-02T07:31:19.000Z
4,640030139b64b1bdaf665737,Mécontent car produit endommagéBien que le col...,Bien que le colis soit arrivé tôt j'ai globale...,1,,2023-02-27T23:00:00.000Z,2023-03-02T07:11:47.000Z,2023-03-02T07:11:47.000Z


### Process reviews data

In this step we preprocess raw text data for Natural Language Processing (NLP) tasks by performing a series of cleaning and normalization steps. We removes noise (e.g., hashtags, URLs, mentions, stopwords), converts text to lowercase, tokenizes it, filters out non-alphabetic tokens, and returns the cleaned text as a single string.

#### Text cleaning and normalization
- Noise Removal :
    - Removes hashtags, HTML entities, stock tickers, URLs, retweet tags, mentions, and special characters.
    - Replaces emojis with their textual descriptions and strips punctuation.

- Normalization :
    - Converts text to lowercase, replaces ampersands (&) with "and," and removes short words (≤3 characters).

- Tokenization :
    - Splits text into tokens using French-specific word tokenization.

- Filtering :
    - Removes numbers, non-alphabetic tokens, and stopwords (including custom additions to the French stopwords list).


In [41]:
# Function to clean and process review text

# os.environ["NLTK_DATA"] = "/Users/micheldpd/Projects/custrev/nltk_data"

# Preload stopwords outside the function to avoid repeated loading
STOP_WORDS_TO_ADD = ["être", "leur", "leurs", "avoir", "cela", "les", "de", "pour", "des", "cette", "a",
                   "j'ai", "car", "c'est", "chez", "tout", "fait", "chez", "donc", 
                   "n'est", "si", "alors", "n'ai", "faire", "deux", "comme", "jour", "tr", "si", "ue"

]
STOP_WORDS = set(stopwords.words('french')).union(set(STOP_WORDS_TO_ADD))


def clean_text(text: str) -> str:
    """
    Cleans raw text by removing noise (e.g., hashtags, URLs, stopwords) and normalizing content.
    Tokenizes, filters alphabetic tokens, and removes French stopwords for NLP tasks.
    Returns the cleaned and normalized text as a single string.
    """
    
    # Remove hashtags (keep text after #)
    text = re.sub(r'#', '', text)

    # Remove HTML special entities (e.g., &amp;)
    text = re.sub(r'&\w*;', '', text)

    # Remove stock tickers (e.g., $AAPL)
    text = re.sub(r'\$\w*', '', text)

    # Remove hyperlinks (covers various URL patterns)
    text = re.sub(r'https?://[^\s/$.?#].[^\s]*', '', text)
    text = re.sub(r'http(\S)+', '', text)  # Catch incomplete URLs
    text = re.sub(r'http\s*\.\.\.', '', text)  # Catch truncated URLs

    # Remove retweet tags and mentions
    text = re.sub(r'(RT|rt)\s*@\s*\S+', '', text)
    text = re.sub(r'RT\s?@', '', text)
    text = re.sub(r'@\S+', '', text)

    # Replace & with 'and', fix < and > (assuming intent was to escape them)
    text = re.sub(r'&', 'and', text)

    # Remove words with 3 or fewer letters (e.g., "the", "cat")
    text = re.sub(r'\b\w{1,3}\b', ' ', text)

    # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode
    text = ''.join(c for c in text if ord(c) <= 0xFFFF)

    # Strip leading/trailing whitespace
    text = text.strip()

    # Convert emojis to text descriptions (e.g., 😊 -> :smiling_face:)
    text = emoji.demojize(text)

    # Remove punctuation, keeping alphanumeric characters and spaces
    text = re.sub(r'[^\w\s]', ' ', text)

    # Tokenize text (lowercase for consistency)
    tokens: List[str] = word_tokenize(text.lower(), language='french')

    # Filter out numbers and keep only alphabetic tokens
    tokens_alpha = [token for token in tokens if token.isalpha()]

    # Remove stopwords
    tokens_cleaned = [token for token in tokens_alpha if token not in STOP_WORDS]

    # Join tokens back into a single string
    cleaned_text = ' '.join(tokens_cleaned)

    return cleaned_text


# Example usage
if __name__ == "__main__":
    sample_text = "RT @user: I love this product! < and 16§789> #great https://example.com 😊 &amp; fast service"
    cleaned = clean_text(sample_text)
    print(cleaned)

love this product great fast service


#### Review data cleaning and transformation

In [50]:
import os
import pandas as pd
from datetime import datetime
import logging

def process_reviews(raw_file):
    """
    Processes raw review data by standardizing dates, extracting temporal features, and cleaning text.
    Removes rows with missing values, invalid ratings, or short reviews, ensuring data quality.
    Saves the cleaned and filtered reviews to a timestamped CSV file for further analysis.
    """
    
    try:
        # Load raw data
        logging.info(f"Loading raw data from: {raw_file}")
        df = pd.read_csv(raw_file)
        logging.info(f"Successfully loaded {len(df)} rows of data.")
    except Exception as e:
        logging.error(f"Error loading raw data: {e}")
        return None

    try:
        # Standardize date formats
        logging.info("Standardizing date formats...")
        df["experienceDate"] = pd.to_datetime(df["experienceDate"]).dt.strftime('%Y-%m-%d %H:%M:%S')
        df["createdDateTime"] = pd.to_datetime(df["createdDateTime"]).dt.strftime('%Y-%m-%d %H:%M:%S')
        df["publishedDate"] = pd.to_datetime(df["publishedDate"]).dt.strftime('%Y-%m-%d %H:%M:%S')
        logging.info("Date formats standardized successfully.")
    except Exception as e:
        logging.error(f"Error standardizing date formats: {e}")
        return None

    try:
        # Extract temporal features
        logging.info("Extracting temporal features (year, month, day, hour)...")
        df['year'] = pd.to_datetime(df['createdDateTime']).dt.year
        df["year_quarter"] = pd.to_datetime(df['createdDateTime']).dt.year.astype(str) + \
            "-Q" + pd.to_datetime(df['createdDateTime']).dt.quarter.astype(str)
        df['month'] = pd.to_datetime(df['createdDateTime']).dt.month
        df['month_name'] = pd.to_datetime(df['createdDateTime']).dt.month_name()
        df['day'] = pd.to_datetime(df['createdDateTime']).dt.day
        df['day_name'] = pd.to_datetime(df['createdDateTime']).dt.day_name()
        df['hour'] = pd.to_datetime(df['createdDateTime']).dt.hour
        logging.info("Temporal features extracted successfully.")
    except Exception as e:
        logging.error(f"Error extracting temporal features: {e}")
        return None

    try:
        # Remove rows with missing values
        logging.info("Removing rows with missing values...")
        initial_rows = len(df)
        df.dropna(inplace=True, subset=["id", "review", "rating", "experienceDate", "createdDateTime", "publishedDate"])
        removed_rows = initial_rows - len(df)
        logging.info(f"Removed {removed_rows} rows with missing values. Remaining rows: {len(df)}")
    except Exception as e:
        logging.error(f"Error removing rows with missing values: {e}")
        return None

    try:
        # Remove rows with invalid ratings
        logging.info("Removing rows with invalid ratings...")
        initial_rows = len(df)
        df = df[df["rating"].isin([1, 2, 3, 4, 5])]
        removed_rows = initial_rows - len(df)
        logging.info(f"Removed {removed_rows} rows with invalid ratings. Remaining rows: {len(df)}")
    except Exception as e:
        logging.error(f"Error removing rows with invalid ratings: {e}")
        return None

    try:
        # Clean review text
        logging.info("Cleaning review text...")
        df["review"] = df["review"].apply(clean_text)
        logging.info("Review text cleaned successfully.")
    except Exception as e:
        logging.error(f"Error cleaning review text: {e}")
        return None

    try:
        # Remove short reviews
        logging.info("Removing short reviews...")
        initial_rows = len(df)
        df = df[df["review"].str.len() > 4]
        removed_rows = initial_rows - len(df)
        logging.info(f"Removed {removed_rows} short reviews. Remaining rows: {len(df)}")
    except Exception as e:
        logging.error(f"Error removing short reviews: {e}")
        return None

    try:
        # Save cleaned data to CSV
        logging.info("Saving cleaned data to CSV...")
        output_dir = "../data/cleaned"
        os.makedirs(output_dir, exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
        cleaned_file_path = os.path.join(output_dir, f"cleaned_reviews_{timestamp}.csv")
        df.to_csv(cleaned_file_path, index=False)
        logging.info(f"Cleaned data saved to: {cleaned_file_path}")
    except Exception as e:
        logging.error(f"Error saving cleaned data: {e}")
        return None

    return cleaned_file_path


In [None]:
# Processed raw review
process_reviews("../data/raw/raw_reviews_0.csv")

2025-03-26 18:52:42,983 - INFO - Loading raw data from: ../data/raw/raw_reviews_0.csv
2025-03-26 18:52:43,151 - INFO - Successfully loaded 50498 rows of data.
2025-03-26 18:52:43,151 - INFO - Standardizing date formats...
2025-03-26 18:52:43,466 - INFO - Date formats standardized successfully.
2025-03-26 18:52:43,467 - INFO - Extracting temporal features (year, month, day, hour)...
2025-03-26 18:52:43,501 - INFO - Temporal features extracted successfully.
2025-03-26 18:52:43,502 - INFO - Removing rows with missing values...
2025-03-26 18:52:43,516 - INFO - Removed 0 rows with missing values. Remaining rows: 50498
2025-03-26 18:52:43,517 - INFO - Removing rows with invalid ratings...
2025-03-26 18:52:43,520 - INFO - Removed 0 rows with invalid ratings. Remaining rows: 50498
2025-03-26 18:52:43,521 - INFO - Cleaning review text...
2025-03-26 18:52:47,859 - INFO - Review text cleaned successfully.
2025-03-26 18:52:47,860 - INFO - Removing short reviews...
2025-03-26 18:52:47,869 - INFO - 

'../data/cleaned/cleaned_reviews_20250326185247.csv'

### Loading cleaned review into the full review base

After cleaning and processing, the extracted raw reviews are consolidated into the full review database.
Before consolidation, the pipeline checks for the presence of new data. If no new data is found, the full review database remains unchanged, and the pipeline stops at this stage.
Once the loading process is complete, the cleaned review file is archived to ensure proper organization and to maintain a history of processed data.

In [44]:
def load_reviews(cleaned_file) -> Optional[pd.DataFrame]:
    """
    Processes review data by merging new data with existing cleaned data.
    
    Args:
        cleaned_file (str): Path to the new review data file
        
    Returns:
        pd.DataFrame: Updated DataFrame after merging and deduplication
        None: If an error occurs during processing or if no new data is added
        
    Key Features:
        - Ensures output directory exists
        - Handles existing data loading with error checking
        - Updates data only if new reviews are added
        - Writes "1" to ../parameters/new_reviews.txt if new reviews exist, otherwise "0"
        - Logs percentage increase in data when new reviews are added
        - Saves updated data with timestamp backup only if necessary
        - Archives the cleaned review file after processing
    """

    try:
        # Load new review data
        df = pd.read_csv(cleaned_file)

        # Define paths
        base_dir = "../data"
        full_reviews_folder = os.path.join(base_dir, "full")
        archive_folder = os.path.join(base_dir, "archive")  # Archive folder for processed files
        full_reviews_file = "full_reviews.csv"
        full_reviews_path = os.path.join(full_reviews_folder, full_reviews_file)
        new_reviews_flag_path = os.path.join("../parameters", "new_reviews.txt")

        # Ensure directories exist
        os.makedirs(full_reviews_folder, exist_ok=True)
        os.makedirs(archive_folder, exist_ok=True)  # Create archive folder
        os.makedirs(os.path.dirname(new_reviews_flag_path), exist_ok=True)
        logging.info(f"Directories ensured: {full_reviews_folder}, {archive_folder}, {os.path.dirname(new_reviews_flag_path)}")

        # Load existing data with error handling
        df_full = pd.DataFrame()
        if os.path.isfile(full_reviews_path):
            try:
                df_full = pd.read_csv(full_reviews_path, low_memory=False)
                logging.info(f"Loaded existing data from {full_reviews_path}")
            except pd.errors.EmptyDataError:
                logging.warning(f"Empty CSV file found at {full_reviews_path}")
            except pd.errors.ParserError:
                logging.error(f"Parsing error in {full_reviews_path}")
        else:
            logging.info(f"No existing file found at {full_reviews_path}, initializing empty DataFrame")

        # Validate input DataFrame
        if df.empty:
            logging.warning("Input DataFrame is empty")
            return None

        # Merge and deduplicate
        initial_length = len(df_full)
        df_full_updated = pd.concat([df_full, df], ignore_index=True)
        df_full_updated = df_full_updated.drop_duplicates(subset=df.columns, keep='last')
        final_length = len(df_full_updated)

        # Log DataFrame sizes
        logging.info(f"Original records: {initial_length}")
        logging.info(f"Updated records: {final_length}")
        new_records_added = final_length - initial_length
        logging.info(f"New records added: {new_records_added}")

        # Check if there are new records
        has_new_reviews = final_length > initial_length
        flag_value = "1" if has_new_reviews else "0"
        with open(new_reviews_flag_path, "w") as f:
            f.write(flag_value)
        logging.info(f"Wrote '{flag_value}' to {new_reviews_flag_path}")

        # Exit early if no new reviews are added
        if not has_new_reviews:
            logging.info("No new reviews to process. Exiting without updates.")
            return df_full_updated

        # Calculate percentage increase
        if initial_length == 0 and final_length > 0:
            percentage_increase = 100
        elif initial_length > 0:
            percentage_increase = ((final_length - initial_length) / initial_length * 100) 
        else: percentage_increase = 0
        logging.info(f"Percentage increase in data: {percentage_increase:.2f}%")

        # Create backup before overwriting
        if os.path.isfile(full_reviews_path):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_path = os.path.join(full_reviews_folder, f"full_reviews_backup_{timestamp}.csv")
            df_full.to_csv(backup_path, index=False)
            logging.info(f"Backup created at {backup_path}")

        # Save updated data
        df_full_updated.to_csv(full_reviews_path, index=False)
        logging.info(f"Updated data saved to {full_reviews_path}")

        # Archive the cleaned review file
        try:
            archive_path = os.path.join(archive_folder, os.path.basename(cleaned_file))
            shutil.move(cleaned_file, archive_path)
            logging.info(f"Archived cleaned review file to: {archive_path}")
        except Exception as e:
            logging.error(f"Error archiving cleaned review file: {e}")

        return df_full_updated

    except Exception as e:
        logging.error(f"Error processing reviews: {str(e)}")
        return None

### Application

In [45]:
# application
if __name__ == "__main__":
    
    raw_file_path = "../data/raw/raw_reviews_0.csv"
    print("\n transformed reviews: \n")
    cleaned_file = process_reviews(raw_file_path)
    print(f"\n cleaned_file_path: {cleaned_file} \n")

    df_clean = pd.read_csv(cleaned_file)
    print(df_clean.info())
    print(df_clean.head())

    print("\n load reviews: \n")
    load_reviews(cleaned_file)
    df_full = pd.read_csv("../data/full/full_reviews.csv")
    print(df_full.info())
    print(df_full.head())


2025-03-26 18:55:54,679 - INFO - Loading raw data from: ../data/raw/raw_reviews_0.csv
2025-03-26 18:55:54,844 - INFO - Successfully loaded 50498 rows of data.
2025-03-26 18:55:54,844 - INFO - Standardizing date formats...



 transformed reviews: 



2025-03-26 18:55:55,188 - INFO - Date formats standardized successfully.
2025-03-26 18:55:55,189 - INFO - Extracting temporal features (year, month, day, hour)...
2025-03-26 18:55:55,223 - INFO - Temporal features extracted successfully.
2025-03-26 18:55:55,224 - INFO - Removing rows with missing values...
2025-03-26 18:55:55,238 - INFO - Removed 0 rows with missing values. Remaining rows: 50498
2025-03-26 18:55:55,239 - INFO - Removing rows with invalid ratings...
2025-03-26 18:55:55,243 - INFO - Removed 0 rows with invalid ratings. Remaining rows: 50498
2025-03-26 18:55:55,244 - INFO - Cleaning review text...
2025-03-26 18:55:59,564 - INFO - Review text cleaned successfully.
2025-03-26 18:55:59,564 - INFO - Removing short reviews...
2025-03-26 18:55:59,575 - INFO - Removed 113 short reviews. Remaining rows: 50385
2025-03-26 18:55:59,575 - INFO - Saving cleaned data to CSV...
2025-03-26 18:55:59,759 - INFO - Cleaned data saved to: ../data/cleaned/cleaned_reviews_20250326185559.csv



 cleaned_file_path: ../data/cleaned/cleaned_reviews_20250326185559.csv 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50385 entries, 0 to 50384
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               50385 non-null  object
 1   title            50385 non-null  object
 2   review           50385 non-null  object
 3   rating           50385 non-null  int64 
 4   reply            7828 non-null   object
 5   experienceDate   50385 non-null  object
 6   createdDateTime  50385 non-null  object
 7   publishedDate    50385 non-null  object
 8   year             50385 non-null  int64 
 9   month            50385 non-null  int64 
 10  month_name       50385 non-null  object
 11  day              50385 non-null  int64 
 12  day_name         50385 non-null  object
 13  hour             50385 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 5.4+ MB
None
                         id  \
0  64005329

2025-03-26 18:56:00,018 - INFO - Directories ensured: ../data/full, ../data/archive, ../parameters
2025-03-26 18:56:00,018 - INFO - No existing file found at ../data/full/full_reviews.csv, initializing empty DataFrame
2025-03-26 18:56:00,053 - INFO - Original records: 0
2025-03-26 18:56:00,053 - INFO - Updated records: 50385
2025-03-26 18:56:00,053 - INFO - New records added: 50385
2025-03-26 18:56:00,054 - INFO - Wrote '1' to ../parameters/new_reviews.txt
2025-03-26 18:56:00,055 - INFO - Percentage increase in data: 100.00%
2025-03-26 18:56:00,241 - INFO - Updated data saved to ../data/full/full_reviews.csv
2025-03-26 18:56:00,242 - INFO - Archived cleaned review file to: ../data/archive/cleaned_reviews_20250326185559.csv


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50385 entries, 0 to 50384
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               50385 non-null  object
 1   title            50385 non-null  object
 2   review           50385 non-null  object
 3   rating           50385 non-null  int64 
 4   reply            7828 non-null   object
 5   experienceDate   50385 non-null  object
 6   createdDateTime  50385 non-null  object
 7   publishedDate    50385 non-null  object
 8   year             50385 non-null  int64 
 9   month            50385 non-null  int64 
 10  month_name       50385 non-null  object
 11  day              50385 non-null  int64 
 12  day_name         50385 non-null  object
 13  hour             50385 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 5.4+ MB
None
                         id  \
0  640053299b64b1bdaf6661e9   
1  640046cda2e3a177e9e8b614   
2  6400387bd143b326fe43

In [25]:
from typing import Optional
import os
import pandas as pd
from datetime import datetime
import logging
import shutil

def load_reviews(cleaned_file) -> Optional[pd.DataFrame]:
    """
    Processes review data by merging new data with existing cleaned data.
    
    Args:
        cleaned_file (str): Path to the new review data file
        
    Returns:
        pd.DataFrame: Updated DataFrame after merging and deduplication
        None: If an error occurs during processing or if no new data is added
        
    Key Features:
        - Ensures output directory exists
        - Handles existing data loading with error checking
        - Updates data only if new reviews are added
        - Writes "1" to ../parameters/new_reviews.txt if new reviews exist, otherwise "0"
        - Logs percentage increase in data when new reviews are added
        - Saves updated data with timestamp backup only if necessary
        - Archives the cleaned review file after processing
    """

    try:
        # Load new review data
        df = pd.read_csv(cleaned_file)

        # Define paths
        base_dir = "../data"
        full_reviews_folder = os.path.join(base_dir, "full")
        archive_folder = os.path.join(base_dir, "archive")  # Archive folder for processed files
        full_reviews_file = "full_reviews.csv"
        full_reviews_path = os.path.join(full_reviews_folder, full_reviews_file)
        new_reviews_flag_path = os.path.join("../parameters", "new_reviews.txt")

        # Ensure directories exist
        os.makedirs(full_reviews_folder, exist_ok=True)
        os.makedirs(archive_folder, exist_ok=True)  # Create archive folder
        os.makedirs(os.path.dirname(new_reviews_flag_path), exist_ok=True)
        logging.info(f"Directories ensured: {full_reviews_folder}, {archive_folder}, {os.path.dirname(new_reviews_flag_path)}")

        # Load existing data with error handling
        df_full = pd.DataFrame()
        if os.path.isfile(full_reviews_path):
            try:
                df_full = pd.read_csv(full_reviews_path, low_memory=False)
                logging.info(f"Loaded existing data from {full_reviews_path}")
            except pd.errors.EmptyDataError:
                logging.warning(f"Empty CSV file found at {full_reviews_path}")
            except pd.errors.ParserError:
                logging.error(f"Parsing error in {full_reviews_path}")
        else:
            logging.info(f"No existing file found at {full_reviews_path}, initializing empty DataFrame")

        # Validate input DataFrame
        if df.empty:
            logging.warning("Input DataFrame is empty")
            return None

        # Merge and deduplicate
        initial_length = len(df_full)
        df_full_updated = pd.concat([df_full, df], ignore_index=True)
        df_full_updated = df_full_updated.drop_duplicates(subset=df.columns, keep='last')
        final_length = len(df_full_updated)

        # Log DataFrame sizes
        logging.info(f"Original records: {initial_length}")
        logging.info(f"Updated records: {final_length}")
        new_records_added = final_length - initial_length
        logging.info(f"New records added: {new_records_added}")

        # Check if there are new records
        has_new_reviews = final_length > initial_length
        flag_value = "1" if has_new_reviews else "0"
        with open(new_reviews_flag_path, "w") as f:
            f.write(flag_value)
        logging.info(f"Wrote '{flag_value}' to {new_reviews_flag_path}")

        # Exit early if no new reviews are added
        if not has_new_reviews:
            logging.info("No new reviews to process. Exiting without updates.")
            return df_full_updated

        # Calculate percentage increase
        percentage_increase = ((final_length - initial_length) / initial_length * 100) if initial_length > 0 else 0
        logging.info(f"Percentage increase in data: {percentage_increase:.2f}%")

        # Create backup before overwriting
        if os.path.isfile(full_reviews_path):
            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
            backup_path = os.path.join(full_reviews_folder, f"full_reviews_backup_{timestamp}.csv")
            df_full.to_csv(backup_path, index=False)
            logging.info(f"Backup created at {backup_path}")

        # Save updated data
        df_full_updated.to_csv(full_reviews_path, index=False)
        logging.info(f"Updated data saved to {full_reviews_path}")

        # Archive the cleaned review file
        try:
            archive_path = os.path.join(archive_folder, os.path.basename(cleaned_file))
            shutil.move(cleaned_file, archive_path)
            logging.info(f"Archived cleaned review file to: {archive_path}")
        except Exception as e:
            logging.error(f"Error archiving cleaned review file: {e}")

        return df_full_updated

    except Exception as e:
        logging.error(f"Error processing reviews: {str(e)}")
        return None

In [49]:
data = pd.read_csv("../data/full/full_reviews.csv")
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50385 entries, 0 to 50384
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               50385 non-null  object
 1   title            50385 non-null  object
 2   review           50385 non-null  object
 3   rating           50385 non-null  int64 
 4   reply            7828 non-null   object
 5   experienceDate   50385 non-null  object
 6   createdDateTime  50385 non-null  object
 7   publishedDate    50385 non-null  object
 8   year             50385 non-null  int64 
 9   month            50385 non-null  int64 
 10  month_name       50385 non-null  object
 11  day              50385 non-null  int64 
 12  day_name         50385 non-null  object
 13  hour             50385 non-null  int64 
dtypes: int64(5), object(9)
memory usage: 5.4+ MB


Unnamed: 0,id,title,review,rating,reply,experienceDate,createdDateTime,publishedDate,year,month,month_name,day,day_name,hour
0,640053299b64b1bdaf6661e9,Satisfait de mon choix avec backmarket,satisfait choix backmarket,5,,2023-02-28 23:00:00,2023-03-02 09:41:29,2023-03-02 09:41:29,2023,3,March,2,Thursday,9
1,640046cda2e3a177e9e8b614,"A fuir, aucun service client",éviter reçu article demande retour fois avant ...,1,,2023-03-01 00:00:00,2023-03-02 08:48:45,2023-03-02 08:48:45,2023,3,March,2,Thursday,8
2,6400387bd143b326fe43200d,"Parfait,rapide,bien emballé",parfait rapide bien emballé,5,,2023-02-28 23:00:00,2023-03-02 07:47:39,2023-03-02 07:47:39,2023,3,March,2,Thursday,7
3,640034a7d143b326fe431f53,Pentax,livraison très rapide,5,,2023-02-28 23:00:00,2023-03-02 07:31:19,2023-03-02 07:31:19,2023,3,March,2,Thursday,7
4,640030139b64b1bdaf665737,Mécontent car produit endommagéBien que le col...,bien colis arrivé globalement décu batterie ar...,1,,2023-02-27 23:00:00,2023-03-02 07:11:47,2023-03-02 07:11:47,2023,3,March,2,Thursday,7
