# Exploration

In [2]:
import os

# Resolve the bundled Spark and winutils folders relative to the notebook's working directory.
spark_home = os.path.abspath(os.path.join(os.getcwd(), "spark-3.5.5-bin-hadoop3"))
hadoop_home = os.path.abspath(os.path.join(os.getcwd(), "winutils"))
print(f"I am using the following SPARK_HOME: {spark_home}")
if os.name == 'nt':
    # On Windows, point HADOOP_HOME at the winutils folder and expose its bin directory on PATH.
    os.environ["HADOOP_HOME"] = hadoop_home
    print(f"Windows detected: set HADOOP_HOME to: {os.environ['HADOOP_HOME']}")
    hadoop_bin = os.path.join(hadoop_home, "bin")
    # Only prepend once so that re-running this cell does not keep growing PATH.
    if hadoop_bin not in os.environ["PATH"].split(os.pathsep):
        os.environ["PATH"] = f"{hadoop_bin}{os.pathsep}{os.environ['PATH']}"
    print(f"  Also added Hadoop bin directory to PATH: {hadoop_bin}")

import findspark

# findspark must run before PySpark is imported: it sets SPARK_HOME and puts the
# Spark distribution's Python packages on sys.path.
findspark.init(spark_home)

import pyspark
from pyspark.streaming import StreamingContext

# getOrCreate() reuses an already running context, so this cell can be re-run
# without hitting "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()
spark = pyspark.sql.SparkSession.builder.getOrCreate()

I am using the following SPARK_HOME: C:\Users\kian3\Spark 2\spark-3.5.5-bin-hadoop3
Windows detected: set HADOOP_HOME to: C:\Users\kian3\Spark 2\winutils
  Also added Hadoop bin directory to PATH: C:\Users\kian3\Spark 2\winutils\bin


In [3]:
import threading

# Helper thread that keeps the Spark StreamingContext from blocking Jupyter
        
class StreamingThread(threading.Thread):
    """Thread wrapper that drives a streaming context in the background.

    The wrapped object must provide ``start()``, ``awaitTermination()`` and
    ``stop(stopSparkContext=..., stopGraceFully=...)``.
    """

    def __init__(self, ssc):
        """Create the thread and remember the streaming context ``ssc``."""
        threading.Thread.__init__(self)
        self.ssc = ssc

    def run(self):
        """Thread body: start the context, then block until it terminates."""
        context = self.ssc
        context.start()
        context.awaitTermination()

    def stop(self):
        """Print a notice and ask the context to stop.

        The context is stopped with ``stopSparkContext=False`` and
        ``stopGraceFully=True``.
        """
        print('----- Stopping... this may take a few seconds -----')
        self.ssc.stop(stopSparkContext=False, stopGraceFully=True)

In [None]:
# Code to make dataset
from pyspark.sql import Row, DataFrame
from pyspark.sql.functions import udf, struct, array, col, lit
from pyspark.sql.types import StringType

main_folder = r"C:\Users\kian3\Spark 2\notebooks"

# Collect every Spark part file inside any 'saved_data-*' folder below main_folder,
# so that multiple JSON instances per folder are included. Checksum (.crc) files are skipped.
json_files = []
for root, _, files in os.walk(main_folder):
    if os.path.basename(root).startswith("saved_data-"):
        json_files.extend(
            os.path.join(root, file)
            for file in files
            if file.startswith("part-") and not file.endswith(".crc")
        )

if json_files:
    df = spark.read.json(json_files, multiLine=False)
else:
    # Spark cannot infer a schema from an empty list, so createDataFrame([], schema=None)
    # raises instead of returning an empty DataFrame. Supply the schema explicitly.
    # Column names follow the article table shown in this notebook's output;
    # the string types are an assumption - TODO confirm against the saved JSON.
    empty_articles_schema = (
        "aid string, categories string, main_category string, "
        "published string, summary string, title string"
    )
    print(f"No 'saved_data-*' part files found under {main_folder}; using an empty DataFrame.")
    df = spark.createDataFrame([], schema=empty_articles_schema)

In [None]:

# Load the locally stored Excel file with pandas and turn it into a Spark DataFrame.
# NOTE: this rebinds `df`, replacing any DataFrame assigned to that name earlier.
import pandas as pd

pdf = pd.read_excel(io="./data/output_test.xlsx")
df = spark.createDataFrame(data=pdf)

In [5]:
# Drop rows that are exact duplicates, preview the result and report the remaining row count.
df = df.dropDuplicates()
df.show()
article_total = df.count()
print(f"Number of articles: {article_total}")

+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|                 aid|          categories|     main_category|           published|             summary|               title|
+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|http://arxiv.org/...|         cs.CL,cs.LG|             cs.CL|2025-04-21T17:33:23Z|Scaling test-time...|Evaluating Judges...|
|http://arxiv.org/...|         astro-ph.GA|       astro-ph.GA|2025-04-02T08:10:05Z|In this work an a...|Multiscale explor...|
|http://arxiv.org/...|               cs.IR|             cs.IR|2025-04-03T17:55:12Z|Intermittent rene...|An Assessment of ...|
|http://arxiv.org/...|               cs.NI|             cs.NI|2025-04-03T14:34:38Z|Medium access in ...|Medium Access for...|
|http://arxiv.org/...|cond-mat.stat-mec...|cond-mat.stat-mech|2025-04-21T17:27:41Z|We investigate th...|Tracer dynamic

In [6]:
# Quick structural overview of the DataFrame.
# show() prints the table itself and returns None, so it must not be wrapped in print()
# (that only emitted a stray "None" after the table).
df.show(10)
print(df.columns)
df.printSchema()
print(df.dtypes)
# Last expression of the cell: the row count is displayed as the cell result.
df.count()


+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|                 aid|          categories|     main_category|           published|             summary|               title|
+--------------------+--------------------+------------------+--------------------+--------------------+--------------------+
|http://arxiv.org/...|         cs.CL,cs.LG|             cs.CL|2025-04-21T17:33:23Z|Scaling test-time...|Evaluating Judges...|
|http://arxiv.org/...|         astro-ph.GA|       astro-ph.GA|2025-04-02T08:10:05Z|In this work an a...|Multiscale explor...|
|http://arxiv.org/...|               cs.IR|             cs.IR|2025-04-03T17:55:12Z|Intermittent rene...|An Assessment of ...|
|http://arxiv.org/...|               cs.NI|             cs.NI|2025-04-03T14:34:38Z|Medium access in ...|Medium Access for...|
|http://arxiv.org/...|cond-mat.stat-mec...|cond-mat.stat-mech|2025-04-21T17:27:41Z|We investigate th...|Tracer dynamic

12355

In [None]:
# NOTE: importing `sum` from pyspark.sql.functions rebinds the name `sum` in the
# notebook namespace, so the Python built-in is shadowed from this cell onwards.
from pyspark.sql.functions import col, sum, when


def _null_count_expr(column_name):
    """Return an aggregate that counts NULL values in ``column_name``.

    Each row contributes 1 when the column is NULL and 0 otherwise; the sum is
    aliased as ``<column_name>_null_count``.
    """
    null_flag = when(col(column_name).isNull(), 1).otherwise(0)
    return sum(null_flag).alias(column_name + "_null_count")


# Result (see the output below): no column contains missing values.
print("Number of missings per column:")

null_counts_agg = list(map(_null_count_expr, df.columns))
null_counts_df = df.agg(*null_counts_agg)
null_counts_df.show()

Number of missings per column:
+--------------+---------------------+------------------------+--------------------+------------------+----------------+
|aid_null_count|categories_null_count|main_category_null_count|published_null_count|summary_null_count|title_null_count|
+--------------+---------------------+------------------------+--------------------+------------------+----------------+
|             0|                    0|                       0|                   0|                 0|               0|
+--------------+---------------------+------------------------+--------------------+------------------+----------------+



In [None]:
from pyspark.sql.functions import desc

# Frequency table over the full 'categories' string, most frequent first.
categories_frequency = df.groupBy("categories").count().orderBy(desc("count"))
print("\nCount per 'categories' (sorted by frequency):")
categories_frequency.show(150)

# Same frequency table, but over 'main_category'.
main_category_frequency = df.groupBy("main_category").count().orderBy(desc("count"))
print("\nCount per 'main_category' (sorted by frequency):")
main_category_frequency.show(150)


Count per 'categories' (sorted by frequency):
+--------------------+-----+
|          categories|count|
+--------------------+-----+
|               cs.CV|  861|
|            quant-ph|  318|
|               cs.CL|  283|
|               cs.LG|  263|
|             math.AP|  213|
|         cs.CV,cs.AI|  182|
|               cs.RO|  168|
|             eess.SP|  152|
|       eess.SY,cs.SY|  148|
|         cs.LG,cs.AI|  146|
|   cond-mat.mtrl-sci|  142|
|             math.OC|  138|
|         cs.CL,cs.AI|  137|
|               cs.AI|  129|
|       math.NA,cs.NA|  124|
|              hep-ph|  120|
|             math.CO|  112|
|               cs.CR|  109|
|         astro-ph.GA|  105|
|      physics.optics|  103|
|               cs.HC|  102|
|               cs.SE|  100|
|         astro-ph.HE|   96|
|               gr-qc|   93|
|             math.PR|   86|
|       cs.IT,math.IT|   85|
|             math.NT|   84|
|             stat.ME|   82|
|   cond-mat.mes-hall|   82|
|              hep-th|   

In [9]:
# Narrow the data down to the two text fields and the main category.
df_summary_title_spark = df.select(["title", "summary", "main_category"])
df_summary_title_spark.show(n=10)


+--------------------+--------------------+------------------+
|               title|             summary|     main_category|
+--------------------+--------------------+------------------+
|Evaluating Judges...|Scaling test-time...|             cs.CL|
|Multiscale explor...|In this work an a...|       astro-ph.GA|
|An Assessment of ...|Intermittent rene...|             cs.IR|
|Medium Access for...|Medium access in ...|             cs.NI|
|Tracer dynamics i...|We investigate th...|cond-mat.stat-mech|
|Prompt Optimizati...|We study how to u...|             cs.LG|
|Detection of Deut...|Deuteration of hy...|       astro-ph.GA|
|From Easy to Hard...|Differentially pr...|             cs.CR|
|The restoration o...|In the present wo...|           math.HO|
|Quantitative Stra...|This paper presen...|             cs.GT|
+--------------------+--------------------+------------------+
only showing top 10 rows



In [10]:
from pyspark.sql.functions import concat, lower, lit

# One text field per article: title, the " [SUMMARY] " marker, then the summary.
# lower() is applied to the whole concatenation, so the marker is lower-cased as well.
combined_text = lower(concat(col('title'), lit(" [SUMMARY] "), col('summary')))
df_with_combined_spark = df_summary_title_spark.withColumn('combined', combined_text)
df_with_combined_spark.select('combined').show(n=10, truncate=False)


+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [11]:
from pyspark.sql.functions import when

# Only keep needed columns
df_selected_spark = df_with_combined_spark.select('combined', 'main_category')
df_selected_spark.show(10, truncate=False)

# Preparation for multiclass main category classification:
# derive a coarse group 'main_cat_main' from the prefix of 'main_category'.
main_category_col = col('main_category')

# Every prefix below is mapped to the "physics" group; the conditions are OR-ed
# together from left to right.
physics_prefixes = (
    "astro-", "cond-mat", "gr-", "hep-", "math-",
    "nlin.", "nucl-", "physics.", "quant-",
)
is_physics = main_category_col.startswith(physics_prefixes[0])
for physics_prefix in physics_prefixes[1:]:
    is_physics = is_physics | main_category_col.startswith(physics_prefix)

# Ordered (condition, label) rules; the first matching rule wins.
group_rules = [
    (main_category_col.startswith("cs."), "cs"),
    (main_category_col.startswith("econ."), "econ"),
    (main_category_col.startswith("eess."), "eess"),
    (main_category_col.startswith("math."), "math"),
    (is_physics, "physics"),
    (main_category_col.startswith("q-bio."), "q-bio"),
    (main_category_col.startswith("q-fin."), "q-fin"),
    (main_category_col.startswith("stat."), "stat"),
]
first_condition, first_label = group_rules[0]
main_cat_main_expr = when(first_condition, lit(first_label))
for rule_condition, rule_label in group_rules[1:]:
    main_cat_main_expr = main_cat_main_expr.when(rule_condition, lit(rule_label))
# Anything that matches none of the rules ends up in "other".
main_cat_main_expr = main_cat_main_expr.otherwise(lit("other"))

df_with_mapped_cat_spark = df_selected_spark.withColumn('main_cat_main', main_cat_main_expr)

print("\nFirst 10 rows of df with 'main_cat_main' kolom:")
df_with_mapped_cat_spark.show(10, truncate=False)
# Class distribution of the new coarse label, most frequent first.
main_cat_main_counts = df_with_mapped_cat_spark.groupBy('main_cat_main').count()
main_cat_main_counts.orderBy(desc('count')).show(truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [12]:
# Strip every character that is not an ASCII letter or whitespace (punctuation,
# digits, symbols), then collapse each run of whitespace into a single space.
from pyspark.sql.functions import regexp_replace

letters_only = regexp_replace(col("combined"), "[^a-zA-Z\\s]", "")
single_spaced = regexp_replace(letters_only, "\\s+", " ")
df_with_mapped_cat_spark = df_with_mapped_cat_spark.withColumn("combined", single_spaced)

In [13]:
# Preview the cleaned text next to both category columns, without truncating cell contents.
df_with_mapped_cat_spark.show(n=20, truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [14]:
# Convert the cleaned Spark DataFrame into a pandas DataFrame bound to `newdf`.
newdf = df_with_mapped_cat_spark.toPandas()