In [None]:
# Scratch notebook for Joaco to test MIND preprocessing steps. It will be removed before the final version.

In [1]:
import os
import sys
import numpy as np
import pandas as pd
from tqdm import tqdm
import pickle
from collections import Counter
from tempfile import TemporaryDirectory

from recommenders.datasets.mind import (download_mind,
                                     extract_mind,
                                     download_and_extract_glove,
                                     load_glove_matrix,
                                     word_tokenize
                                    )
from recommenders.datasets.download_utils import unzip_file
from recommenders.utils.notebook_utils import store_metadata

# Show which Python interpreter produced the outputs below.
print(f"System version: {sys.version}")

System version: 3.10.12 (main, Nov  6 2024, 20:22:13) [GCC 11.4.0]


In [2]:
# Which MIND release to download: "demo", "small" or "large".
mind_type = "demo"
# Word-embedding dimensionality; must be one of 50, 100, 200 or 300.
word_embedding_dim = 300

In [None]:
# Download the MIND archives into a throwaway directory and unpack each split
# into its own sub-folder of that directory.
tmpdir = TemporaryDirectory()
data_path = tmpdir.name

train_zip, valid_zip = download_mind(size=mind_type, dest_path=data_path)
for _zip_path, _split_name in ((train_zip, 'train'), (valid_zip, 'valid')):
    unzip_file(_zip_path, os.path.join(data_path, _split_name), clean_zip_file=False)

# Folder for derived artifacts, created next to the extracted splits.
output_path = os.path.join(data_path, 'utils')
os.makedirs(output_path, exist_ok=True)

100%|██████████| 17.0k/17.0k [00:27<00:00, 628KB/s]  
100%|██████████| 9.84k/9.84k [00:14<00:00, 660KB/s]  


In [9]:
import logging
import os
from logging.handlers import RotatingFileHandler
from typing import Dict, Optional

import pyspark.sql.functions as F
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col, explode, split, when, lit, rand
from pyspark.sql.window import Window

# Load training and validation data based on the selected data source.
def load_training_data(spark,
                       data_source = "recommenders",  # "db", "recommenders", or "csv"
                       **kwargs):
    """Return ``(training_data, validation_data)`` loaded from ``data_source``.

    Args:
        spark: Spark session handed to the underlying reader.
        data_source: One of ``"recommenders"``, ``"db"`` or ``"csv"``.
        **kwargs: Source-specific options.

            * ``"recommenders"``: ``train_path`` and ``valid_path`` (locations
              of the MIND ``behaviors.tsv`` files; default to
              ``./data/mind/train/behaviors.tsv`` and
              ``./data/mind/valid/behaviors.tsv``) and ``npratio``, which is
              forwarded to ``preprocess_behaviors_mind`` only when supplied.
            * ``"db"``: ``config`` and ``query``, forwarded to
              ``load_data_split`` (``None`` when omitted).
            * ``"csv"``: ``file_path``, the directory holding
              ``training_data.csv`` and ``validation_data.csv``
              (default ``./data/csv``).

    Returns:
        A ``(training_data, validation_data)`` tuple.

    Raises:
        ValueError: If ``data_source`` is not one of the supported values.
    """
    if data_source == "recommenders":
        # The paths used to be hard-coded; they stay as defaults but can now be
        # overridden, e.g. to point at a temporary download directory.
        train_path = kwargs.get("train_path", "./data/mind/train/behaviors.tsv")
        valid_path = kwargs.get("valid_path", "./data/mind/valid/behaviors.tsv")

        # Forward npratio only when the caller set it, so the default declared
        # by preprocess_behaviors_mind remains the single source of truth.
        sampling_options = {"npratio": kwargs["npratio"]} if "npratio" in kwargs else {}

        logger.info("Preprocessing MIND dataset...")

        training_data, validation_data = preprocess_behaviors_mind(
            spark=spark,
            train_path=train_path,
            valid_path=valid_path,
            **sampling_options
        )
        logger.info("MIND dataset preprocessed successfully.")

    elif data_source == "db":
        # Imported lazily so the other sources work without this project module.
        from data_management.data_utils import load_data_split
        config = kwargs.get("config")
        query = kwargs.get("query")
        training_data, validation_data = load_data_split(spark, config=config, query=query)

    elif data_source == "csv":
        file_path = kwargs.get("file_path", "./data/csv")
        logger.info(f"Loading training data from CSV: {file_path}/training_data.csv")
        training_data = spark.read.csv(f"{file_path}/training_data.csv", header=True)
        logger.info(f"Loading validation data from CSV: {file_path}/validation_data.csv")
        validation_data = spark.read.csv(f"{file_path}/validation_data.csv", header=True)

    else:
        raise ValueError(f"Unsupported data source: {data_source}")

    return training_data, validation_data

def get_logger(
    name: str,
    log_file: Optional[str] = None,
    level: int = logging.INFO,
    max_bytes: int = 10 * 1024 * 1024,
    backup_count: int = 5,
):
    """Return a named logger with a console handler and an optional rotating file handler.

    Handlers are attached only while the logger has none, so calling this
    again for the same ``name`` (e.g. when a notebook cell is re-run) does not
    duplicate log lines. As a consequence, ``log_file`` has no effect on a
    logger that already has handlers. The logger's level is set on every call.

    Args:
        name: Name passed to ``logging.getLogger``.
        log_file: Path of the log file. When falsy, only the console handler
            is created. Missing parent directories are created.
        level: Level applied to the logger and to each handler created here.
        max_bytes: ``maxBytes`` passed to ``RotatingFileHandler``.
        backup_count: ``backupCount`` passed to ``RotatingFileHandler``.

    Returns:
        The ``logging.Logger`` registered under ``name``.
    """
    logger = logging.getLogger(name)
    logger.setLevel(level)

    # Already configured by an earlier call: leave the handlers untouched.
    if logger.handlers:
        return logger

    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )

    console_handler = logging.StreamHandler()
    console_handler.setLevel(level)
    console_handler.setFormatter(formatter)
    logger.addHandler(console_handler)

    if log_file:
        # A bare file name has an empty dirname, and os.makedirs("") raises
        # FileNotFoundError, so only create the directory when there is one.
        log_dir = os.path.dirname(log_file)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        file_handler = RotatingFileHandler(
            log_file, maxBytes=max_bytes, backupCount=backup_count
        )
        file_handler.setLevel(level)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)

    return logger


# Shared logger for the data helpers in this cell; also logs to a rotating file.
logger = get_logger(name="DataUtils", log_file="logs/data_utils.log")

# Preprocesses the behaviors table for training Spark's ALS model
def preprocess_behaviors_mind(
    spark: SparkSession, 
    train_path: str, 
    valid_path: str, 
    npratio: int = 4,
    seed: Optional[int] = None
):
    """Turn MIND ``behaviors.tsv`` files into (userId, newsId, clicked) rows.

    Each impression list is split on spaces into single ``<newsId>-<label>``
    entries. Entries ending in ``-1`` become ``clicked = 1`` rows, all others
    ``clicked = 0``. Every clicked row is kept; non-clicked rows are randomly
    down-sampled per user to at most ``npratio`` rows per clicked row of that
    user. Users without any clicked row keep at most ``npratio`` non-clicked
    rows.

    Args:
        spark: Session used to read the TSV files.
        train_path: Path of the training ``behaviors.tsv``.
        valid_path: Path of the validation ``behaviors.tsv``.
        npratio: Maximum number of non-clicked rows kept per clicked row.
        seed: Optional seed passed to Spark's ``rand``. ``None`` (default)
            lets Spark choose one, so the sample differs between runs.
            NOTE(review): even with a seed, reproducibility presumably also
            depends on stable input partitioning -- confirm.

    Returns:
        ``(train_df, valid_df)``: two DataFrames with the columns ``userId``,
        ``newsId`` and ``clicked``.

    NOTE(review): the files are read without a schema, so ``userId`` and
    ``newsId`` are presumably strings; Spark ALS expects numeric ids --
    confirm that they are indexed before training.
    """
    logger.info(f"Starting to preprocess MIND dataset. Train: {train_path}, Valid: {valid_path}")

    behaviors_columns = ("impressionId", "userId", "timestamp", "click_history", "impressions")
    output_columns = ("userId", "newsId", "clicked")

    def read_behaviors(path):
        # The files have no header row, so the column names are assigned here.
        return spark.read.csv(path, sep="\t", header=False).toDF(*behaviors_columns)

    def process_behaviors(df):
        # One row per impression entry, e.g. "<newsId>-1" or "<newsId>-0".
        impressions_df = (
            df.withColumn("impression", explode(split(col("impressions"), " ")))
            # Extract clicked (1) or non-clicked (0) status
            .withColumn(
                "clicked",
                when(col("impression").endswith("-1"), lit(1)).otherwise(lit(0))
            )
            .withColumn("newsId", split(col("impression"), "-")[0])
            .select(*output_columns)
        )

        positive_samples = impressions_df.filter(col("clicked") == 1)
        negative_samples = impressions_df.filter(col("clicked") == 0) \
            .withColumn("rand", rand(seed))

        # Number of clicked rows per user; drives that user's negative quota.
        positives_per_user = positive_samples.groupBy("userId") \
            .agg(F.count(lit(1)).alias("n_pos"))

        # Select npratio negative samples per positive sample (addressing class
        # imbalance and making the matrix lighter). The cap used to be a flat
        # npratio per user regardless of how many positives the user had.
        window = Window.partitionBy("userId").orderBy("rand")
        # Users missing from positives_per_user have no clicks; they keep the
        # previous flat cap of npratio by treating n_pos as 1.
        negative_quota = lit(npratio) * F.coalesce(col("n_pos"), lit(1))
        negative_samples = (
            negative_samples
            .withColumn("rank", F.row_number().over(window))
            .join(positives_per_user, on="userId", how="left")
            .filter(col("rank") <= negative_quota)
            .select(*output_columns)
        )

        # Both sides share the same three columns; match them by name rather
        # than by position to stay robust to column reordering.
        return positive_samples.unionByName(negative_samples)

    train_df = process_behaviors(read_behaviors(train_path))
    valid_df = process_behaviors(read_behaviors(valid_path))

    logger.info("Preprocessing of MIND dataset completed.")
    return train_df, valid_df