In [1]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session that the rest of the notebook works with
spark = (
    SparkSession.builder
    .appName("MIND Dataset Processing")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/12/07 14:44:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
24/12/07 14:44:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [2]:
# Path to the news.tsv file (read below as tab-separated, without a header row)
news_path = "data/mind/MINDsmall_train/news.tsv"

# Column names for the news.tsv file, in file order
news_columns = ["NewsID", "Category", "Subcategory", "Title", "Abstract", "URL", "TitleEntities", "AbstractEntities"]

# Build the DDL schema from the column list so the names are declared in one
# place only; every column is read as a string.
news_schema = ", ".join(f"{column} STRING" for column in news_columns)

# Load the news.tsv file into a Spark DataFrame using the explicit schema
news_df = spark.read.csv(
    news_path,
    sep="\t",
    schema=news_schema,
    header=False,
)

# Preview the first two rows without truncating long text fields
news_df.show(n=2, truncate=False)

[Stage 0:>                                                          (0 + 1) / 1]

+------+---------+---------------+----------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------+---------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+
|NewsID|Category |Subcategory    |Title                                                                 |Abstract  

                                                                                

In [3]:
# List the column names of the freshly loaded DataFrame
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities']

In [4]:
### PREPROCESSING ###

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower, regexp_replace, length, size, udf
from pyspark.sql.types import StringType, ArrayType, IntegerType
from pyspark.ml.feature import Tokenizer, StopWordsRemover

import json

In [6]:
# Get the Spark session. If a session is already running, getOrCreate() returns
# it instead of creating a new one.
spark = SparkSession.builder.appName("PreprocessingPipeline").getOrCreate()

# Load the data (modify the path as necessary).
# An explicit all-string schema is used instead of inferSchema=True: it avoids
# the extra pass over the file that type inference needs, keeps the column
# types independent of the data, and names the columns in the same step.
news_df = spark.read.csv(
    "data/mind/MINDsmall_train/news.tsv",
    sep="\t",
    header=False,
    schema="NewsID STRING, Category STRING, Subcategory STRING, Title STRING, Abstract STRING, URL STRING, TitleEntities STRING, AbstractEntities STRING",
)

# Display initial rows
news_df.show(5, truncate=False)

24/12/07 14:44:48 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


                                                                                

+------+---------+---------------+----------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [7]:
### MISSING VALUES ###

# Keep only the rows that have both a Title and an Abstract
news_df = news_df.dropna(subset=["Title", "Abstract"])

# Report how many rows are left after the filter
remaining_rows = news_df.count()
print(f"Rows after dropping missing values: {remaining_rows}")


Rows after dropping missing values: 48616


In [8]:
### TEXT CLEANING ###

# Text normalizer: lowercases and flattens newlines/tabs into spaces
def clean_text(text):
    """Return ``text`` lowercased with newlines and tabs replaced by spaces.

    Punctuation and other special characters are left untouched.
    Returns None when ``text`` is None or an empty string.
    """
    if not text:
        return None
    lowered = text.lower()
    without_newlines = lowered.replace("\n", " ")
    return without_newlines.replace("\t", " ")

# Wrap clean_text as a Spark UDF that returns a string
clean_text_udf = udf(clean_text, StringType())

# Add cleaned copies of Title and Abstract as new columns
news_df = (
    news_df
    .withColumn("CleanTitle", clean_text_udf(col("Title")))
    .withColumn("CleanAbstract", clean_text_udf(col("Abstract")))
)

# Preview the cleaned text
news_df.select("CleanTitle", "CleanAbstract").show(5, truncate=False)

[Stage 7:>                                                          (0 + 1) / 1]

+----------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CleanTitle                                                            |CleanAbstract                                                                                                                                                                                       |
+----------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|the brands queen elizabeth, prince charles, and prince philip swear by|shop the notebooks, jackets, and more that the royals can't live without.                                             

                                                                                

In [9]:
### TOKENIZATION ###

# One Tokenizer per cleaned text column
tokenizer_title = Tokenizer(
    inputCol="CleanTitle",
    outputCol="TitleTokens",
)
tokenizer_abstract = Tokenizer(
    inputCol="CleanAbstract",
    outputCol="AbstractTokens",
)

# Tokenize the title first, then the abstract
news_df = tokenizer_abstract.transform(tokenizer_title.transform(news_df))

# Preview the token arrays
news_df.select("TitleTokens", "AbstractTokens").show(5, truncate=False)


+----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|TitleTokens                                                                       |AbstractTokens                                                                                                                                                                                                                           |
+----------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[the, brands, queen, elizabeth,, prince, c

In [10]:
### STOPWORDS REMOVAL ###

# One StopWordsRemover per token column
stopword_remover_title = StopWordsRemover(
    inputCol="TitleTokens",
    outputCol="FilteredTitleTokens",
)
stopword_remover_abstract = StopWordsRemover(
    inputCol="AbstractTokens",
    outputCol="FilteredAbstractTokens",
)

# Filter the title tokens first, then the abstract tokens
news_df = stopword_remover_abstract.transform(
    stopword_remover_title.transform(news_df)
)

# Preview the filtered token arrays
news_df.select("FilteredTitleTokens", "FilteredAbstractTokens").show(5, truncate=False)


+--------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|FilteredTitleTokens                                                 |FilteredAbstractTokens                                                                                                                                                |
+--------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[brands, queen, elizabeth,, prince, charles,, prince, philip, swear]|[shop, notebooks,, jackets,, royals, live, without.]                                                                                                                  |
|[50, worst, habits, belly, fat]                

In [11]:
# Token cleaner used as a UDF on the filtered token arrays
def clean_tokens(tokens):
    """Strip commas from every token and drop tokens that end up empty.

    A token made only of commas (for example a stand-alone ",") would
    otherwise turn into an empty string and stay in the array.

    Args:
        tokens: list of string tokens, or None.

    Returns:
        A new list of comma-free, non-empty tokens. A None or empty input
        is returned unchanged.
    """
    if not tokens:
        return tokens
    stripped = (token.replace(",", "") for token in tokens)  # Remove commas
    return [token for token in stripped if token]  # Discard empty leftovers

# Wrap clean_tokens as a Spark UDF that returns an array of strings
clean_tokens_udf = udf(clean_tokens, ArrayType(StringType()))

# Overwrite both filtered token columns with their cleaned versions
for tokens_col in ("FilteredTitleTokens", "FilteredAbstractTokens"):
    news_df = news_df.withColumn(tokens_col, clean_tokens_udf(col(tokens_col)))

In [12]:
# Compare the original titles with their cleaned, stopword-filtered tokens
news_df.select("Title", "FilteredTitleTokens").show(5, truncate=False)

+----------------------------------------------------------------------+------------------------------------------------------------------+
|Title                                                                 |FilteredTitleTokens                                               |
+----------------------------------------------------------------------+------------------------------------------------------------------+
|The Brands Queen Elizabeth, Prince Charles, and Prince Philip Swear By|[brands, queen, elizabeth, prince, charles, prince, philip, swear]|
|50 Worst Habits For Belly Fat                                         |[50, worst, habits, belly, fat]                                   |
|The Cost of Trump's Aid Freeze in the Trenches of Ukraine's War       |[cost, trump's, aid, freeze, trenches, ukraine's, war]            |
|I Was An NBA Wife. Here's How It Affected My Mental Health.           |[nba, wife., affected, mental, health.]                           |
|How to Get Rid of S

In [13]:
# Compare the original abstracts with their cleaned, stopword-filtered tokens
news_df.select("Abstract", "FilteredAbstractTokens").show(2, truncate=False)

+--------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------+
|Abstract                                                                                                            |FilteredAbstractTokens                                                                      |
+--------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------+
|Shop the notebooks, jackets, and more that the royals can't live without.                                           |[shop, notebooks, jackets, royals, live, without.]                                          |
|These seemingly harmless habits are holding you back and keeping you from shedding that unwanted belly fat for good.|[seemingly, harmless, habits, hold

In [14]:
# List the columns accumulated so far by the preprocessing steps
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities',
 'CleanTitle',
 'CleanAbstract',
 'TitleTokens',
 'AbstractTokens',
 'FilteredTitleTokens',
 'FilteredAbstractTokens']

In [15]:
### EMBEDDINGS ###

In [16]:
from pyspark.sql.functions import concat_ws
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.functions import split
from pyspark.ml.feature import IDF

In [17]:
# Join the title and abstract tokens into one space-separated string per row
combined_tokens = concat_ws(
    " ",
    col("FilteredTitleTokens"),
    col("FilteredAbstractTokens"),
)
news_df = news_df.withColumn("CombinedTokens", combined_tokens)

# Preview the combined text
news_df.select("CombinedTokens").show(5, truncate=False)

+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|CombinedTokens                                                                                                                                                                                |
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|brands queen elizabeth prince charles prince philip swear shop notebooks jackets royals live without.                                                                                         |
|50 worst habits belly fat seemingly harmless habits holding back keeping shedding unwanted belly fat good.                                                                                    |
|cost trump's aid freeze trenches u

In [18]:
# Split the space-separated CombinedTokens string back into an array of words
combined_words = split(col("CombinedTokens"), " ")
news_df = news_df.withColumn("CombinedWords", combined_words)


In [19]:
# Term counts via CountVectorizer: vocabulary capped at 10000 terms, and
# minDF=2 requires a term to occur in at least two documents.
cv = CountVectorizer(
    inputCol="CombinedWords",
    outputCol="RawFeatures",
    vocabSize=10000,
    minDF=2,
)

# Learn the vocabulary, then add the sparse count vector to every row
cv_model = cv.fit(news_df)
news_df = cv_model.transform(news_df)

# Preview the term-count vectors
raw_features_preview = news_df.select("NewsID", "RawFeatures")
raw_features_preview.show(5, truncate=False)

                                                                                

+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|NewsID|RawFeatures                                                                                                                                                         |
+------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|N55528|(10000,[310,978,1119,1182,1431,1891,3093,3350,7504,7995],[1.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0])                                                                 |
|N19639|(10000,[27,437,856,1610,1773,3287,3891,4806,4875,5879],[1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,1.0])                                                                   |
|N61837|(10000,[63,176,357,358,397,531,546,966,1683,2588,2761,2803,3638,3674,4368,5554,5723,7958],[1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0

In [20]:
# Fit inverse-document-frequency weights on the raw term counts and rescale
# them into TF-IDF features. The output column is spelled "TFIDFFeatures"
# (the previous "TFIDFeatures" did not match the column selected below).
idf = IDF(inputCol="RawFeatures", outputCol="TFIDFFeatures")
idf_model = idf.fit(news_df)
news_df = idf_model.transform(news_df)

# Show the resulting DataFrame with TF-IDF features
news_df.select("NewsID", "TFIDFFeatures").show(5, truncate=False)

                                                                                

+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|NewsID|TFIDFFeatures                                                                                                                                                                                                                                                                                                                                                                                                                 |
+------+------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:
# List all columns, including the term-count and TF-IDF feature columns
news_df.columns

['NewsID',
 'Category',
 'Subcategory',
 'Title',
 'Abstract',
 'URL',
 'TitleEntities',
 'AbstractEntities',
 'CleanTitle',
 'CleanAbstract',
 'TitleTokens',
 'AbstractTokens',
 'FilteredTitleTokens',
 'FilteredAbstractTokens',
 'CombinedTokens',
 'CombinedWords',
 'RawFeatures',
 'TFIDFFeatures']

In [None]:
# NB: how to read a sparse vector such as (10000, [310, 978, 1119], [0.5, 1.2, 0.8]):
# - 10000 is the vector length, i.e. the vocabulary size.
# - [310, 978, 1119] are the vocabulary indices of the words present in the document.
# - [0.5, 1.2, 0.8] are the TF-IDF scores of those words, in the same order.


In [28]:
# Path to the saved Parquet file
# NOTE(review): no cell visible in this notebook writes this file, and the
# execution counter jumps to 28 here — presumably the cell that saved it was
# removed; confirm the file exists before running from a fresh kernel.
parquet_path = "news_tfidf.parquet"

# Read the Parquet file into a DataFrame
tfidf_df = spark.read.parquet(parquet_path)

# Display the first rows of the loaded DataFrame (show() defaults to 20 rows)
tfidf_df.show()


+------+--------------------+
|NewsID|       TFIDFFeatures|
+------+--------------------+
| N5727|(10000,[1,9,87,90...|
|N25908|(10000,[165,280,4...|
| N2490|(10000,[1,218,262...|
|  N192|(10000,[71,2006,2...|
| N1298|(10000,[3,26,42,4...|
|N57313|(10000,[36,61,109...|
|N36185|(10000,[6,21,74,8...|
|N33743|(10000,[2,13,16,3...|
|N58255|(10000,[0,10,14,3...|
|N44291|(10000,[0,4,15,84...|
|N38233|(10000,[5,7,11,69...|
| N1970|(10000,[4,26,30,3...|
|N41692|(10000,[4,12,31,1...|
|N31209|(10000,[4,34,59,1...|
|N60452|(10000,[154,159,2...|
|N22043|(10000,[36,119,25...|
|N30368|(10000,[45,815,12...|
| N4233|(10000,[11,90,121...|
|N51387|(10000,[111,865,1...|
|N22126|(10000,[4,18,34,3...|
+------+--------------------+
only showing top 20 rows

