In [1]:
from pyspark.sql import SparkSession

# Initialize a SparkSession
spark = SparkSession.builder \
    .appName("MIND Dataset Processing") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/09 16:42:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Define the path to the news.tsv file
news_path = "data/mind/MINDsmall_train/news.tsv"

# Define column names for the news.tsv file
news_columns = ["NewsID", "Category", "Subcategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"]

# Load the news.tsv file into a Spark DataFrame
news_df = spark.read.csv(
    news_path,
    sep="\t",
    schema="NewsID STRING, Category STRING, Subcategory STRING, Title STRING, Abstract STRING, URL STRING, TitleEntities STRING, AbstractEntities STRING",
    header=False
)

# Assign column names
news_df.show(n=2, truncate=True) 

+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|NewsID| Category|    Subcategory|               Title|            Abstract|                 URL|       TitleEntities|    AbstractEntities|
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|N55528|lifestyle|lifestyleroyals|The Brands Queen ...|Shop the notebook...|https://assets.ms...|[{"Label": "Princ...|                  []|
|N19639|   health|     weightloss|50 Worst Habits F...|These seemingly h...|https://assets.ms...|[{"Label": "Adipo...|[{"Label": "Adipo...|
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 2 rows



In [3]:
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities']

In [4]:
#### PRE PROCESSING ###

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, length, size, udf
from pyspark.sql.types import StringType, ArrayType, IntegerType
from pyspark.ml.feature import Tokenizer, StopWordsRemover

import json

In [6]:
# Initialize Spark session
spark = SparkSession.builder.appName("PreprocessingPipeline").getOrCreate()

# Load your data (modify the path as necessary)
news_df = spark.read.csv("data/mind/MINDsmall_train/news.tsv", sep="\t", header=False, inferSchema=True)

# Assign column names
news_df = news_df.toDF("NewsID", "Category", "Subcategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities")

# Display initial rows
news_df.show(5, truncate=True)

24/12/09 16:42:17 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|NewsID| Category|    Subcategory|               Title|            Abstract|                 URL|       TitleEntities|    AbstractEntities|
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|N55528|lifestyle|lifestyleroyals|The Brands Queen ...|Shop the notebook...|https://assets.ms...|[{"Label": "Princ...|                  []|
|N19639|   health|     weightloss|50 Worst Habits F...|These seemingly h...|https://assets.ms...|[{"Label": "Adipo...|[{"Label": "Adipo...|
|N61837|     news|      newsworld|The Cost of Trump...|Lt. Ivan Molchane...|https://assets.ms...|                  []|[{"Label": "Ukrai...|
|N53526|   health|         voices|I Was 

In [7]:
### MISSING VALUES ###

# Drop rows where Title or Abstract are missing
news_df = news_df.na.drop(subset=["Title", "Abstract"])

# Verify the results
print(f"Rows after dropping missing values: {news_df.count()}")


Rows after dropping missing values: 48616


In [8]:
### TEXT CLEANING ###

# Define a function to clean text (remove special characters and convert to lowercase)
def clean_text(text):
    if text:
        return text.lower().replace("\n", " ").replace("\t", " ")
    return None

# Register the UDF
clean_text_udf = udf(lambda x: clean_text(x), StringType())

# Apply text cleaning to Title and Abstract
news_df = news_df.withColumn("CleanTitle", clean_text_udf(col("Title")))
news_df = news_df.withColumn("CleanAbstract", clean_text_udf(col("Abstract")))

# Display cleaned text
news_df.select("CleanTitle", "CleanAbstract").show(5, truncate=True)

+--------------------+--------------------+
|          CleanTitle|       CleanAbstract|
+--------------------+--------------------+
|the brands queen ...|shop the notebook...|
|50 worst habits f...|these seemingly h...|
|the cost of trump...|lt. ivan molchane...|
|i was an nba wife...|i felt like i was...|
|how to get rid of...|they seem harmles...|
+--------------------+--------------------+
only showing top 5 rows



In [9]:
### TOKENIZATION ###

# Tokenize CleanTitle and CleanAbstract
tokenizer_title = Tokenizer(inputCol="CleanTitle", outputCol="TitleTokens")
tokenizer_abstract = Tokenizer(inputCol="CleanAbstract", outputCol="AbstractTokens")

news_df = tokenizer_title.transform(news_df)
news_df = tokenizer_abstract.transform(news_df)

# Display tokenized data
news_df.select("TitleTokens", "AbstractTokens").show(5, truncate=True)


+--------------------+--------------------+
|         TitleTokens|      AbstractTokens|
+--------------------+--------------------+
|[the, brands, que...|[shop, the, noteb...|
|[50, worst, habit...|[these, seemingly...|
|[the, cost, of, t...|[lt., ivan, molch...|
|[i, was, an, nba,...|[i, felt, like, i...|
|[how, to, get, ri...|[they, seem, harm...|
+--------------------+--------------------+
only showing top 5 rows



In [10]:
### STOPWORDS REMOVAL ###

# Remove stopwords from TitleTokens and AbstractTokens
stopword_remover_title = StopWordsRemover(inputCol="TitleTokens", outputCol="FilteredTitleTokens")
stopword_remover_abstract = StopWordsRemover(inputCol="AbstractTokens", outputCol="FilteredAbstractTokens")

news_df = stopword_remover_title.transform(news_df)
news_df = stopword_remover_abstract.transform(news_df)

# Display filtered tokens
news_df.select("FilteredTitleTokens", "FilteredAbstractTokens").show(5, truncate=True)


+--------------------+----------------------+
| FilteredTitleTokens|FilteredAbstractTokens|
+--------------------+----------------------+
|[brands, queen, e...|  [shop, notebooks,...|
|[50, worst, habit...|  [seemingly, harml...|
|[cost, trump's, a...|  [lt., ivan, molch...|
|[nba, wife., affe...|  [felt, like, frau...|
|[get, rid, skin, ...|  [seem, harmless,,...|
+--------------------+----------------------+
only showing top 5 rows



In [11]:
# Define a UDF to clean each token in the array
def clean_tokens(tokens):
    if tokens:
        return [token.replace(",", "") for token in tokens]  # Remove commas
    return tokens

# Register the UDF
clean_tokens_udf = udf(clean_tokens, ArrayType(StringType()))

# Apply the UDF to FilteredTitleTokens
news_df = news_df.withColumn("FilteredTitleTokens", clean_tokens_udf(col("FilteredTitleTokens")))
news_df = news_df.withColumn("FilteredAbstractTokens", clean_tokens_udf(col("FilteredAbstractTokens")))

In [12]:
news_df.select("Title", "FilteredTitleTokens").show(5, truncate=True)

+--------------------+--------------------+
|               Title| FilteredTitleTokens|
+--------------------+--------------------+
|The Brands Queen ...|[brands, queen, e...|
|50 Worst Habits F...|[50, worst, habit...|
|The Cost of Trump...|[cost, trump's, a...|
|I Was An NBA Wife...|[nba, wife., affe...|
|How to Get Rid of...|[get, rid, skin, ...|
+--------------------+--------------------+
only showing top 5 rows



In [13]:
news_df.select("Abstract", "FilteredAbstractTokens").show(2, truncate=True)

+--------------------+----------------------+
|            Abstract|FilteredAbstractTokens|
+--------------------+----------------------+
|Shop the notebook...|  [shop, notebooks,...|
|These seemingly h...|  [seemingly, harml...|
+--------------------+----------------------+
only showing top 2 rows



In [14]:
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities',
 'CleanTitle',
 'CleanAbstract',
 'TitleTokens',
 'AbstractTokens',
 'FilteredTitleTokens',
 'FilteredAbstractTokens']

In [15]:
news_df.show(1, truncate=True)

+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+
|NewsID| Category|    Subcategory|               Title|            Abstract|                 URL|       TitleEntities|AbstractEntities|          CleanTitle|       CleanAbstract|         TitleTokens|      AbstractTokens| FilteredTitleTokens|FilteredAbstractTokens|
+------+---------+---------------+--------------------+--------------------+--------------------+--------------------+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+
|N55528|lifestyle|lifestyleroyals|The Brands Queen ...|Shop the notebook...|https://assets.ms...|[{"Label": "Princ...|              []|the brands queen ...|shop the notebook...|[the, brands, que...|[shop, the

In [16]:
### EMBEDDINGS ###

In [17]:
from pyspark.sql.functions import concat_ws
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import split
from pyspark.ml.feature import IDF

In [18]:
# combine tokens 
news_df = news_df.withColumn("CombinedTokens", concat_ws(" ", "FilteredTitleTokens", "FilteredAbstractTokens"))

# Show the updated DataFrame
news_df.select("CombinedTokens").show(5, truncate=True)

+--------------------+
|      CombinedTokens|
+--------------------+
|brands queen eliz...|
|50 worst habits b...|
|cost trump's aid ...|
|nba wife. affecte...|
|get rid skin tags...|
+--------------------+
only showing top 5 rows



In [19]:
# Convert CombinedTokens (string) into an array of tokens
news_df = news_df.withColumn("CombinedWords", split(news_df["CombinedTokens"], " "))


In [20]:
# Compute term frequency (TF) using CountVectorizer
cv = CountVectorizer(inputCol="CombinedWords", outputCol="RawFeatures", vocabSize=10000, minDF=2)
cv_model = cv.fit(news_df)
news_df = cv_model.transform(news_df)

# Show the resulting term frequency vector
news_df.select("NewsID", "RawFeatures").show(5, truncate=True)

                                                                                

+------+--------------------+
|NewsID|         RawFeatures|
+------+--------------------+
|N55528|(10000,[312,978,1...|
|N19639|(10000,[27,438,85...|
|N61837|(10000,[63,176,35...|
|N53526|(10000,[38,89,230...|
|N38324|(10000,[6,17,20,1...|
+------+--------------------+
only showing top 5 rows



In [21]:
idf = IDF(inputCol="RawFeatures", outputCol="TFIDFeatures")
idf_model = idf.fit(news_df)
news_df = idf_model.transform(news_df)

# Show the resulting DataFrame with TF-IDF features
news_df.select("NewsID", "RawFeatures").show(5, truncate=True)

                                                                                

+------+--------------------+
|NewsID|         RawFeatures|
+------+--------------------+
|N55528|(10000,[312,978,1...|
|N19639|(10000,[27,438,85...|
|N61837|(10000,[63,176,35...|
|N53526|(10000,[38,89,230...|
|N38324|(10000,[6,17,20,1...|
+------+--------------------+
only showing top 5 rows



In [22]:
### BERT EMBEDDING ###

In [23]:
### EXPORT PROCESSED DATA ###

In [24]:
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities',
 'CleanTitle',
 'CleanAbstract',
 'TitleTokens',
 'AbstractTokens',
 'FilteredTitleTokens',
 'FilteredAbstractTokens',
 'CombinedTokens',
 'CombinedWords',
 'RawFeatures',
 'TFIDFeatures']

In [25]:
## NB: example (10000, [310, 978, 1119], [0.5, 1.2, 0.8])
#This means:
#The vocabulary size is 10,000.
#Words at indices 310, 978, and 1119 in the vocabulary are present in the document.
#Their respective TF-IDF scores are 0.5, 1.2, and 0.8.


In [26]:
# Save DataFrame to CSV
news_df.write.parquet("news", mode="overwrite")

[Stage 22:>                                                         (0 + 8) / 8]

24/12/09 16:42:30 WARN MemoryManager: Total allocation exceeds 95.00% (1,020,054,720 bytes) of heap memory
Scaling row group sizes to 95.00% for 8 writers


                                                                                

In [27]:
# Path to the saved Parquet file
parquet_path = "news"

# Read the Parquet file into a DataFrame
tfidf_df = spark.read.parquet(parquet_path)

# Show the schema of the loaded DataFrame
tfidf_df.show(1, truncate = True)


+------+--------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------+--------------------+--------------------+--------------------+
|NewsID|Category|        Subcategory|               Title|            Abstract|                 URL|       TitleEntities|    AbstractEntities|          CleanTitle|       CleanAbstract|         TitleTokens|      AbstractTokens| FilteredTitleTokens|FilteredAbstractTokens|      CombinedTokens|       CombinedWords|         RawFeatures|        TFIDFeatures|
+------+--------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------------------+--------------------