In [65]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, split, udf, regexp_replace, lit, from_unixtime
from pyspark.sql.types import ArrayType, StringType, StructType, StructField, IntegerType, StringType, MapType
from pyspark.sql.functions import split
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, MapType, StringType
from pyspark.sql.functions import explode


from pyspark.sql.functions import split, explode, regexp_extract, col, collect_list, udf, broadcast
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, DoubleType, FloatType
from pyspark.ml.linalg import VectorUDT, Vectors
import numpy as np
import os

# NEWS

In [34]:
from pyspark.sql import SparkSession

# Initialize a SparkSession
spark = SparkSession.builder \
    .appName("MIND Dataset Processing") \
    .getOrCreate()

24/12/12 16:03:49 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [35]:
# Define the path to the news.tsv file
news_path = "data/mind/MINDsmall_train/news.tsv"

# Define column names for the news.tsv file
news_columns = ["NewsID", "Category", "Subcategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"]

# Load the news.tsv file into a Spark DataFrame
news_df = spark.read.csv(
    news_path,
    sep="\t",
    schema="NewsID STRING, Category STRING, Subcategory STRING, Title STRING, Abstract STRING, URL STRING, TitleEntities STRING, AbstractEntities STRING",
    header=False
)

# Assign column names
news_df.show(n=2, truncate=True) 

+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|NewsID| Category|    Subcategory|               Title|            Abstract|                 URL|       TitleEntities|    AbstractEntities|
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|N55528|lifestyle|lifestyleroyals|The Brands Queen ...|Shop the notebook...|https://assets.ms...|[{"Label": "Princ...|                  []|
|N19639|   health|     weightloss|50 Worst Habits F...|These seemingly h...|https://assets.ms...|[{"Label": "Adipo...|[{"Label": "Adipo...|
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [36]:
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities']

In [37]:
#### PRE PROCESSING ###

In [38]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, length, size, udf
from pyspark.sql.types import StringType, ArrayType, IntegerType
from pyspark.ml.feature import Tokenizer, StopWordsRemover

import json

In [39]:
# Initialize Spark session
spark = SparkSession.builder.appName("PreprocessingPipeline").getOrCreate()

# Load your data (modify the path as necessary)
news_df = spark.read.csv("data/mind/MINDsmall_train/news.tsv", sep="\t", header=False, inferSchema=True)

# Assign column names
news_df = news_df.toDF("NewsID", "Category", "Subcategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities")

# Display initial rows
news_df.show(5, truncate=True)

24/12/12 16:03:50 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|NewsID| Category|    Subcategory|               Title|            Abstract|                 URL|       TitleEntities|    AbstractEntities|
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|N55528|lifestyle|lifestyleroyals|The Brands Queen ...|Shop the notebook...|https://assets.ms...|[{"Label": "Princ...|                  []|
|N19639|   health|     weightloss|50 Worst Habits F...|These seemingly h...|https://assets.ms...|[{"Label": "Adipo...|[{"Label": "Adipo...|
|N61837|     news|      newsworld|The Cost of Trump...|Lt. Ivan Molchane...|https://assets.ms...|                  []|[{"Label": "Ukrai...|
|N53526|   health|         voices|I Was An NBA Wife...|I felt like I was...|https://assets.ms...|                  []|[{"Label": "Natio...|
|N38324|   health|  

In [40]:
### MISSING VALUES ###

# Drop rows where Title or Abstract are missing
news_df = news_df.na.drop(subset=["Title", "Abstract"])

# Verify the results
print(f"Rows after dropping missing values: {news_df.count()}")


Rows after dropping missing values: 48616


In [41]:
### TEXT CLEANING ###

# Define a function to clean text (remove special characters and convert to lowercase)
def clean_text(text):
    if text:
        return text.lower().replace("\n", " ").replace("\t", " ")
    return None

# Register the UDF
clean_text_udf = udf(lambda x: clean_text(x), StringType())

# Apply text cleaning to Title and Abstract
news_df = news_df.withColumn("CleanTitle", clean_text_udf(col("Title")))
news_df = news_df.withColumn("CleanAbstract", clean_text_udf(col("Abstract")))

# Display cleaned text
news_df.select("CleanTitle", "CleanAbstract").show(5, truncate=True)

[Stage 7:>                                                          (0 + 1) / 1]

+--------------------+--------------------+
|          CleanTitle|       CleanAbstract|
+--------------------+--------------------+
|the brands queen ...|shop the notebook...|
|50 worst habits f...|these seemingly h...|
|the cost of trump...|lt. ivan molchane...|
|i was an nba wife...|i felt like i was...|
|how to get rid of...|they seem harmles...|
+--------------------+--------------------+
only showing top 5 rows



                                                                                

In [42]:
### TOKENIZATION ###

# Tokenize CleanTitle and CleanAbstract
tokenizer_title = Tokenizer(inputCol="CleanTitle", outputCol="TitleTokens")
tokenizer_abstract = Tokenizer(inputCol="CleanAbstract", outputCol="AbstractTokens")

news_df = tokenizer_title.transform(news_df)
news_df = tokenizer_abstract.transform(news_df)

# Display tokenized data
news_df.select("TitleTokens", "AbstractTokens").show(5, truncate=True)


+--------------------+--------------------+
|         TitleTokens|      AbstractTokens|
+--------------------+--------------------+
|[the, brands, que...|[shop, the, noteb...|
|[50, worst, habit...|[these, seemingly...|
|[the, cost, of, t...|[lt., ivan, molch...|
|[i, was, an, nba,...|[i, felt, like, i...|
|[how, to, get, ri...|[they, seem, harm...|
+--------------------+--------------------+
only showing top 5 rows



In [43]:
### STOPWORDS REMOVAL ###

# Remove stopwords from TitleTokens and AbstractTokens
stopword_remover_title = StopWordsRemover(inputCol="TitleTokens", outputCol="FilteredTitleTokens")
stopword_remover_abstract = StopWordsRemover(inputCol="AbstractTokens", outputCol="FilteredAbstractTokens")

news_df = stopword_remover_title.transform(news_df)
news_df = stopword_remover_abstract.transform(news_df)

# Display filtered tokens
news_df.select("FilteredTitleTokens", "FilteredAbstractTokens").show(5, truncate=True)


+--------------------+----------------------+
| FilteredTitleTokens|FilteredAbstractTokens|
+--------------------+----------------------+
|[brands, queen, e...|  [shop, notebooks,...|
|[50, worst, habit...|  [seemingly, harml...|
|[cost, trump's, a...|  [lt., ivan, molch...|
|[nba, wife., affe...|  [felt, like, frau...|
|[get, rid, skin, ...|  [seem, harmless,,...|
+--------------------+----------------------+
only showing top 5 rows



In [44]:
# Define a UDF to clean each token in the array
def clean_tokens(tokens):
    if tokens:
        return [token.replace(",", "") for token in tokens]  # Remove commas
    return tokens

# Register the UDF
clean_tokens_udf = udf(clean_tokens, ArrayType(StringType()))

# Apply the UDF to FilteredTitleTokens
news_df = news_df.withColumn("FilteredTitleTokens", clean_tokens_udf(col("FilteredTitleTokens")))
news_df = news_df.withColumn("FilteredAbstractTokens", clean_tokens_udf(col("FilteredAbstractTokens")))

In [45]:
news_df.select("Title", "FilteredTitleTokens").show(5, truncate=True)

+--------------------+--------------------+
|               Title| FilteredTitleTokens|
+--------------------+--------------------+
|The Brands Queen ...|[brands, queen, e...|
|50 Worst Habits F...|[50, worst, habit...|
|The Cost of Trump...|[cost, trump's, a...|
|I Was An NBA Wife...|[nba, wife., affe...|
|How to Get Rid of...|[get, rid, skin, ...|
+--------------------+--------------------+
only showing top 5 rows



In [46]:
news_df.select("Abstract", "FilteredAbstractTokens").show(2, truncate=True)

+--------------------+----------------------+
|            Abstract|FilteredAbstractTokens|
+--------------------+----------------------+
|Shop the notebook...|  [shop, notebooks,...|
|These seemingly h...|  [seemingly, harml...|
+--------------------+----------------------+
only showing top 2 rows



In [47]:
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities',
 'CleanTitle',
 'CleanAbstract',
 'TitleTokens',
 'AbstractTokens',
 'FilteredTitleTokens',
 'FilteredAbstractTokens']

In [48]:
news_df.select('NewsID', 'Category', 'FilteredTitleTokens', 'FilteredAbstractTokens').show(1)

+------+---------+--------------------+----------------------+
|NewsID| Category| FilteredTitleTokens|FilteredAbstractTokens|
+------+---------+--------------------+----------------------+
|N55528|lifestyle|[brands, queen, e...|  [shop, notebooks,...|
+------+---------+--------------------+----------------------+
only showing top 1 row



In [49]:
pip install spark-nlp==5.5.1

Note: you may need to restart the kernel to use updated packages.


In [50]:
### EMBEDDINGS ###

In [57]:
import sparknlp
from pyspark.sql import SparkSession
from sparknlp.base import DocumentAssembler, TokenAssembler
from sparknlp.annotator import BertEmbeddings
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.sql.functions import col, concat_ws, array_union, explode
from pyspark.sql.functions import concat, col

In [52]:
news_df.printSchema()

root
 |-- NewsID: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Subcategory: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- Abstract: string (nullable = true)
 |-- URL: string (nullable = true)
 |-- TitleEntities: string (nullable = true)
 |-- AbstractEntities: string (nullable = true)
 |-- CleanTitle: string (nullable = true)
 |-- CleanAbstract: string (nullable = true)
 |-- TitleTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- AbstractTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- FilteredTitleTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- FilteredAbstractTokens: array (nullable = true)
 |    |-- element: string (containsNull = true)



In [58]:
news_df = news_df.withColumn(
    "combined_tokens", concat(col("FilteredTitleTokens"), col("FilteredAbstractTokens"))
)


In [59]:
news_df.select('combined_tokens').show(1, truncate=True)

+--------------------+
|     combined_tokens|
+--------------------+
|[brands, queen, e...|
+--------------------+
only showing top 1 row



In [60]:
# Term frequency 

from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="combined_tokens", outputCol="raw_features")
cv_model = cv.fit(news_df)
news_df_tf = cv_model.transform(news_df)


                                                                                

In [61]:
from pyspark.ml.feature import IDF

idf = IDF(inputCol="raw_features", outputCol="tf_idf")
idf_model = idf.fit(news_df_tf)
news_df_tfidf = idf_model.transform(news_df_tf)

24/12/12 16:07:02 WARN DAGScheduler: Broadcasting large task binary with size 1236.8 KiB
24/12/12 16:07:09 WARN DAGScheduler: Broadcasting large task binary with size 1237.8 KiB
                                                                                

In [63]:
news_df_tfidf.select("NewsID", "combined_tokens", "tf_idf").show(truncate=True)

24/12/12 16:07:27 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB


+------+--------------------+--------------------+
|NewsID|     combined_tokens|              tf_idf|
+------+--------------------+--------------------+
|N55528|[brands, queen, e...|(109675,[309,978,...|
|N19639|[50, worst, habit...|(109675,[27,437,8...|
|N61837|[cost, trump's, a...|(109675,[63,176,3...|
|N53526|[nba, wife., affe...|(109675,[38,89,23...|
|N38324|[get, rid, skin, ...|(109675,[6,17,20,...|
| N2073|[nfl, able, fine,...|(109675,[87,94,17...|
|N49186|[orlando's, hotte...|(109675,[98,225,2...|
|N59295|[chile:, three, d...|(109675,[2,19,23,...|
|N24510|[best, ps5, games...|(109675,[13,28,33...|
|N39237|[report, weather-...|(109675,[52,65,93...|
| N9721|[50, foods, never...|(109675,[17,187,2...|
|N60905|[trying, make, ra...|(109675,[1,44,58,...|
|N39758|[25, biggest, gro...|(109675,[5,206,22...|
|N28361|[instagram, filte...|(109675,[142,181,...|
|N18680|[michigan, apple,...|(109675,[156,181,...|
|N55610|[kate, middleton'...|(109675,[33,35,41...|
|N35621|[stars, got, fire...|(1

# USERS

In [67]:
# Define the schema
behaviors_schema = StructType([
    StructField("ImpressionID", StringType(), True),
    StructField("UserID", StringType(), True),
    StructField("Time", StringType(), True),
    StructField("History", StringType(), True),
    StructField("Impressions", StringType(), True)
])

# Load the behaviors.tsv file
behaviors_df = spark.read.csv(
    "data/mind/MINDsmall_train/behaviors.tsv",
    sep="\t",
    schema=behaviors_schema,
    header=False
)

# Display the schema and a sample row
# behaviors_df.printSchema()
behaviors_df.show(1, truncate=False)

+------------+------+---------------------+--------------------------------------------------------------+-----------------+
|ImpressionID|UserID|Time                 |History                                                       |Impressions      |
+------------+------+---------------------+--------------------------------------------------------------+-----------------+
|1           |U13740|11/11/2019 9:05:58 AM|N55189 N42782 N34694 N45794 N18445 N63302 N10414 N19347 N31801|N55689-1 N35729-0|
+------------+------+---------------------+--------------------------------------------------------------+-----------------+
only showing top 1 row



In [68]:
# Split History into an array
behaviors_df = behaviors_df.withColumn("HistoryList", split(col("History"), " "))
behaviors_df = behaviors_df.drop("History")  # Drop original History column if not needed

# Verify the transformation
behaviors_df.select("ImpressionID", "UserID", "HistoryList").show(1, truncate=False)

+------------+------+------------------------------------------------------------------------+
|ImpressionID|UserID|HistoryList                                                             |
+------------+------+------------------------------------------------------------------------+
|1           |U13740|[N55189, N42782, N34694, N45794, N18445, N63302, N10414, N19347, N31801]|
+------------+------+------------------------------------------------------------------------+
only showing top 1 row



In [69]:
# Split Impressions into an array
behaviors_df = behaviors_df.withColumn("ImpressionsList", split(col("Impressions"), " "))
behaviors_df = behaviors_df.drop("Impressions")  # Drop original Impressions column if not needed

# Verify the transformation
behaviors_df.select("ImpressionID", "ImpressionsList").show(1, truncate=False)

+------------+--------------------+
|ImpressionID|ImpressionsList     |
+------------+--------------------+
|1           |[N55689-1, N35729-0]|
+------------+--------------------+
only showing top 1 row



In [70]:
# Explode ImpressionsList
impressions_exploded = behaviors_df.select(
    "ImpressionID",
    "UserID",
    "Time",
    "HistoryList",
    explode("ImpressionsList").alias("ImpressionItem")
)

# Extract CandidateNewsID and ClickLabel using regex
impressions_exploded = impressions_exploded \
    .withColumn("CandidateNewsID", regexp_extract(col("ImpressionItem"), r"^(N\d+)-\d+$", 1)) \
    .withColumn("ClickLabel", regexp_extract(col("ImpressionItem"), r"^N\d+-(\d+)$", 1).cast("integer")) \
    .drop("ImpressionItem")

# Verify the transformation
impressions_exploded.select("ImpressionID", "UserID", "CandidateNewsID", "ClickLabel").show(5, truncate=False)

+------------+------+---------------+----------+
|ImpressionID|UserID|CandidateNewsID|ClickLabel|
+------------+------+---------------+----------+
|1           |U13740|N55689         |1         |
|1           |U13740|N35729         |0         |
|2           |U91836|N20678         |0         |
|2           |U91836|N39317         |0         |
|2           |U91836|N58114         |0         |
+------------+------+---------------+----------+
only showing top 5 rows



In [92]:
# Join impressions with news_features_df on CandidateNewsID
impressions_with_features = impressions_exploded.join(
    news_df_tfidf,
    impressions_exploded.CandidateNewsID == news_df_tfidf.NewsID,
    how="left"
).drop(news_df_tfidf.NewsID)  # Drop duplicate NewsID column if present

In [93]:
impressions_with_features.show(3, truncate=True)

24/12/12 16:18:37 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
24/12/12 16:18:42 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB


+------------+------+--------------------+--------------------+---------------+----------+------------+------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+--------------------+
|ImpressionID|UserID|                Time|         HistoryList|CandidateNewsID|ClickLabel|    Category|       Subcategory|               Title|            Abstract|                 URL|       TitleEntities|    AbstractEntities|          CleanTitle|       CleanAbstract|         TitleTokens|      AbstractTokens| FilteredTitleTokens|FilteredAbstractTokens|     combined_tokens|        raw_features|              tf_idf|
+------------+------+--------------------+--------------------+---------------+----------+------------+------------------+--------------------+-------------------

                                                                                

In [94]:
# Filter records where ClickLabel == 1
clicked_news_df = impressions_with_features.filter(col("ClickLabel") == 1)

# Verify the filtered DataFrame
clicked_news_df.show(5, truncate=True)

24/12/12 16:18:48 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
24/12/12 16:18:53 WARN DAGScheduler: Broadcasting large task binary with size 2.9 MiB
                                                                                

+------------+------+--------------------+--------------------+---------------+----------+------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+--------------------+
|ImpressionID|UserID|                Time|         HistoryList|CandidateNewsID|ClickLabel|    Category|     Subcategory|               Title|            Abstract|                 URL|       TitleEntities|    AbstractEntities|          CleanTitle|       CleanAbstract|         TitleTokens|      AbstractTokens| FilteredTitleTokens|FilteredAbstractTokens|     combined_tokens|        raw_features|              tf_idf|
+------------+------+--------------------+--------------------+---------------+----------+------------+----------------+--------------------+--------------------+----

In [117]:
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType
from pyspark.ml.linalg import VectorUDT, DenseVector, SparseVector

# Define a function to compute cosine similarity without numpy
def cosine_similarity(u_vec, i_vec):
    if u_vec is None or i_vec is None:
        return 0.0
    
    # Ensure vectors are of type DenseVector or SparseVector
    if not isinstance(u_vec, (DenseVector, SparseVector)) or not isinstance(i_vec, (DenseVector, SparseVector)):
        return 0.0
    
    # Convert vectors to lists
    u_arr = u_vec.toArray().tolist()
    i_arr = i_vec.toArray().tolist()
    
    # Compute dot product
    dot_product = sum(uv * iv for uv, iv in zip(u_arr, i_arr))
    
    # Compute norms
    norm_u = sum(uv ** 2 for uv in u_arr) ** 0.5
    norm_i = sum(iv ** 2 for iv in i_arr) ** 0.5
    
    if norm_u == 0 or norm_i == 0:
        return 0.0
    
    return dot_product / (norm_u * norm_i)

# Register the UDF
cosine_udf = udf(cosine_similarity, FloatType())

In [118]:
from pyspark.sql.functions import udf
from pyspark.ml.linalg import VectorUDT, Vectors

# Define a function to compute the average of vectors without numpy
def average_vectors(vectors):
    if not vectors:
        return Vectors.dense([])
    
    # Convert all vectors to lists
    list_of_vectors = [v.toArray().tolist() for v in vectors if v is not None]
    
    if not list_of_vectors:
        return Vectors.dense([])
    
    # Sum each component
    summed_vector = [sum(components) for components in zip(*list_of_vectors)]
    
    # Compute average
    count = len(list_of_vectors)
    averaged_vector = [x / count for x in summed_vector]
    
    return Vectors.dense(averaged_vector)

# Register the UDF
avg_vector_udf = udf(average_vectors, VectorUDT())

In [121]:
from pyspark.ml.feature import Normalizer

# Create the Normalizer object
normalizer = Normalizer(inputCol="tf_idf", outputCol="ItemNormalized", p=2.0)

# Apply the normalizer to the DataFrame
news_df_normalized = normalizer.transform(news_df_tfidf)


In [122]:
from pyspark.sql.functions import col

# Join impressions with normalized news to get candidate embeddings
impressions_with_normalized_features = impressions_exploded.join(
    news_df_normalized.select("NewsID", "ItemNormalized"),
    impressions_exploded.CandidateNewsID == news_df_normalized.NewsID,
    how="left"
).drop(news_df_normalized.NewsID)

# Join with normalized user embeddings
user_candidate_df = impressions_with_normalized_features.join(
    user_embeddings_normalized.select("UserID", "UserNormalized"),
    on="UserID",
    how="inner"
)