In [1]:
import sys
import os

# Make the directory three levels above the current working directory importable.
# It is expected to be the project root that contains the `src` package.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), "../../.."))
# Guard against piling up duplicate entries when this cell is re-run in the same kernel.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [2]:
import zipfile

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

from src.utilities.data_utils import preprocess_behaviors_mind

In [3]:
# Directory holding the MIND-small zip archives. Set the MIND_DATA_DIR environment
# variable to point somewhere else; the fallback is the original hard-coded location.
DATA_DIR = os.environ.get("MIND_DATA_DIR", "/home/joaquin_l_calvo/Trento/Data_Mining")

train_path = os.path.join(DATA_DIR, "MINDsmall_train.zip")
validation_path = os.path.join(DATA_DIR, "MINDsmall_dev.zip")


In [4]:
# Create (or reuse) the Spark session: local master using all cores ("local[*]"),
# with the driver memory setting fixed at 8g.
spark = (
    SparkSession.builder
    .appName("ALS Hyperparameter Tuning")
    .master("local[*]")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

# Column layout used when reading the tab-separated input files:
# an integer impression id followed by four string columns, all nullable.
schema = StructType(
    [StructField("impression_id", IntegerType(), True)]
    + [
        StructField(column_name, StringType(), True)
        for column_name in ("user_id", "time", "history", "impressions")
    ]
)

your 131072x1 screen size is bogus. expect trouble
24/12/27 11:49:13 WARN Utils: Your hostname, DESKTOP-LQJ6T08 resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
24/12/27 11:49:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/12/27 11:49:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Helper function to extract a zip archive and load a TSV file from it
def extract_and_load_zip(file_path, schema):
    """Extract a zip archive (only once) and load a TSV file from it into Spark.

    The archive is unpacked into a directory named like the archive without its
    extension. If that directory already exists, extraction is skipped and its
    current contents are used as-is.

    When the directory holds several ``.tsv`` files, the alphabetically first
    one is loaded, so the choice no longer depends on directory listing order.

    Uses the module-level ``spark`` session.

    Args:
        file_path: Path to the ``.zip`` archive.
        schema: Spark schema applied when reading the file.

    Returns:
        The DataFrame produced by ``spark.read.csv`` for the selected file,
        read with a tab separator and no header row.

    Raises:
        FileNotFoundError: If the extracted directory contains no ``.tsv`` file.
    """
    # Extraction target: the archive path with its extension stripped.
    extracted_path = os.path.splitext(file_path)[0]
    if os.path.exists(extracted_path):
        print(f"Using already extracted data at {extracted_path}...")
    else:
        print(f"Extracting {file_path}...")
        with zipfile.ZipFile(file_path, 'r') as zip_ref:
            zip_ref.extractall(extracted_path)

    # os.listdir returns entries in arbitrary order, so sort to make the
    # selected file deterministic when more than one .tsv is present.
    tsv_files = sorted(
        os.path.join(extracted_path, entry)
        for entry in os.listdir(extracted_path)
        if entry.endswith('.tsv')
    )
    if not tsv_files:
        raise FileNotFoundError(f"No TSV files found in {extracted_path}")

    # Load the selected TSV into Spark
    print(f"Loading data from {tsv_files[0]}...")
    return spark.read.csv(tsv_files[0], schema=schema, sep="\t", header=False)

In [6]:
# Read the raw training split first, then the raw validation split,
# both with the same schema.
train_raw_df = extract_and_load_zip(file_path=train_path, schema=schema)
valid_raw_df = extract_and_load_zip(file_path=validation_path, schema=schema)

Using already extracted data at /home/joaquin_l_calvo/Trento/Data_Mining/MINDsmall_train...
Loading data from /home/joaquin_l_calvo/Trento/Data_Mining/MINDsmall_train/behaviors.tsv...
Using already extracted data at /home/joaquin_l_calvo/Trento/Data_Mining/MINDsmall_dev...
Loading data from /home/joaquin_l_calvo/Trento/Data_Mining/MINDsmall_dev/behaviors.tsv...


In [7]:
# Preprocess the raw train/validation frames into the datasets used for tuning.
# npratio is the negative sampling ratio handed to preprocess_behaviors_mind
# (presumably negatives kept per clicked item; confirm in src.utilities.data_utils).
npratio = 4
training_data, validation_data = preprocess_behaviors_mind(spark, train_raw_df, valid_raw_df, npratio)


2024-12-27 11:49:20,036 - DataUtils - INFO - Starting to preprocess MIND dataset.


root
 |-- impression_id: integer (nullable = true)
 |-- user_id: string (nullable = true)
 |-- time: string (nullable = true)
 |-- history: string (nullable = true)
 |-- impressions: string (nullable = true)

root
 |-- impression_id: integer (nullable = true)
 |-- user_id: string (nullable = true)
 |-- time: string (nullable = true)
 |-- history: string (nullable = true)
 |-- impressions: string (nullable = true)



2024-12-27 11:49:21,523 - DataUtils - INFO - Preprocessing of MIND dataset completed.


In [8]:
# Hyperparameter tuning setup
# Fixed seed so factor initialisation and fold assignment are repeatable across runs.
RANDOM_SEED = 42

# ALS is left in its default explicit-feedback mode (implicitPrefs is not enabled).
# coldStartStrategy="drop" removes predictions for users/items unseen during fitting.
als = ALS(userCol="userId",
          itemCol="newsId",
          ratingCol="clicked",
          coldStartStrategy="drop",
          maxIter=15,
          seed=RANDOM_SEED)

# Define the parameter grid without maxIter (kept fixed at 15 above).
# alpha is deliberately not swept: Spark only uses it in the implicit-feedback
# formulation (implicitPrefs=True). With the explicit-feedback estimator above,
# sweeping it refits every (rank, regParam) pair once per alpha value with
# identical settings. If implicit feedback is intended, set implicitPrefs=True
# on the estimator and add the alpha grid back.
param_grid = ParamGridBuilder() \
    .addGrid(als.rank, [10, 20, 30, 40]) \
    .addGrid(als.regParam, [0.01, 0.05, 0.1, 0.2]) \
    .build()

# Set up CrossValidator: 3 folds, RMSE against the "clicked" label,
# up to 4 models fitted in parallel.
cv = CrossValidator(
    estimator=als,
    estimatorParamMaps=param_grid,
    evaluator=RegressionEvaluator(metricName="rmse", labelCol="clicked", predictionCol="prediction"),
    numFolds=3,
    parallelism=4,
    seed=RANDOM_SEED
)

In [9]:
# Fit cross-validation model
cv_model = cv.fit(training_data)

# Best model refitted by CrossValidator
best_model = cv_model.bestModel

# Recover the winning hyperparameters through the public API instead of the
# private `_java_obj` handle: find the param map with the best average metric
# (direction taken from the evaluator) and overlay it on the base estimator,
# so parameters that were not part of the grid still report their configured value.
avg_metrics = cv_model.avgMetrics
pick_best = max if cv_model.getEvaluator().isLargerBetter() else min
best_index = pick_best(range(len(avg_metrics)), key=avg_metrics.__getitem__)
best_estimator = cv_model.getEstimator().copy(cv_model.getEstimatorParamMaps()[best_index])

best_rank = best_estimator.getRank()
best_reg_param = best_estimator.getRegParam()
best_alpha = best_estimator.getAlpha()

print(f"Best Hyperparameters:\nRank: {best_rank}, RegParam: {best_reg_param}, Alpha: {best_alpha}")

24/12/27 11:50:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
24/12/27 11:50:09 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
24/12/27 11:50:10 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.lapack.JNILAPACK
                                                                                ]]]]]6]]

Best Hyperparameters:
Rank: 40, RegParam: 0.1, Alpha: 1.0
