In [1]:
# spark setup

import os
os.environ["JAVA_HOME"] = r"C:\Program Files\Java\jdk-11.0.2"
os.environ["SPARK_HOME"] = r"C:\Program Files\Spark\spark-3.5.5-bin-hadoop3"

In [2]:
# findspark helps locate the environment variables
import findspark
findspark.init()

# `agnews` Dataset

In [12]:
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/agnews.csv -O

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
 12 29.3M   12 3759k    0     0  5229k      0  0:00:05 --:--:--  0:00:05 5236k
100 29.3M  100 29.3M    0     0  20.8M      0  0:00:01  0:00:01 --:--:-- 20.8M


# Pipelining with PySpark MLlib

In [3]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline # pipeline to transform data


In [4]:
spark = (SparkSession.builder
         .master("local[*]")
         .appName("AG news")
         .getOrCreate()
        )
sc = spark.sparkContext

In [13]:
# load dataset
df = spark.read.csv("agnews.csv", inferSchema=True, header=True)

In [14]:
df.show(20)

+-----------+--------------------+--------------------+
|Class Index|               Title|         Description|
+-----------+--------------------+--------------------+
|          3|Wall St. Bears Cl...|Reuters - Short-s...|
|          3|Carlyle Looks Tow...|Reuters - Private...|
|          3|Oil and Economy C...|Reuters - Soaring...|
|          3|Iraq Halts Oil Ex...|Reuters - Authori...|
|          3|Oil prices soar t...|AFP - Tearaway wo...|
|          3|Stocks End Up, Bu...|Reuters - Stocks ...|
|          3|Money Funds Fell ...|AP - Assets of th...|
|          3|Fed minutes show ...|USATODAY.com - Re...|
|          3|Safety Net (Forbe...|"Forbes.com - Aft...|
|          3|Wall St. Bears Cl...| NEW YORK (Reuter...|
|          3|Oil and Economy C...| NEW YORK (Reuter...|
|          3|No Need for OPEC ...| TEHRAN (Reuters)...|
|          3|Non-OPEC Nations ...| JAKARTA (Reuters...|
|          3|Google IPO Auctio...| WASHINGTON/NEW Y...|
|          3|Dollar Falls Broa...| NEW YORK (Reu

# Arrange columns

In [15]:
from pyspark.sql.functions import concat_ws, col # to concatinate cols

# Renaming 'Class Index' col to 'label'
df = df.withColumnRenamed('Class Index', 'label')

# Add a new column 'text' by concatinating 'Title' and 'Description'
df = df.withColumn("text", concat_ws(" ", "Title", 'Description'))

# Remove old text columns
df = df.select('label', 'text')

# Shows top 10 rows
df.show(10)


+-----+--------------------+
|label|                text|
+-----+--------------------+
|    3|Wall St. Bears Cl...|
|    3|Carlyle Looks Tow...|
|    3|Oil and Economy C...|
|    3|Iraq Halts Oil Ex...|
|    3|Oil prices soar t...|
|    3|Stocks End Up, Bu...|
|    3|Money Funds Fell ...|
|    3|Fed minutes show ...|
|    3|Safety Net (Forbe...|
|    3|Wall St. Bears Cl...|
+-----+--------------------+
only showing top 10 rows



# Tokenize

In [None]:
from pyspark.ml.feature import RegexTokenizer # tokenizer

# convert sentences to list of words
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

# applies tokenizer to df
df = tokenizer.transform(df)

df.select(['label','text', 'words']).show(5)


# Stopwords

In [None]:
from pyspark.ml.feature import StopWordsRemover

# remove stopwords
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# adds a column 'filtered' to df without stopwords
df = stopwords_remover.transform(df)

df.select(['label','text', 'words', 'filtered']).show(5)

# Term frequency, Inverse document frequency

In [None]:
from pyspark.ml.feature import HashingTF

# Calculate term frequency in each article
hashing_tf = HashingTF(inputCol="filtered",
                       outputCol="raw_features",
                       numFeatures=16384)

# adds raw tf features to df
featurized_data = hashing_tf.transform(df)

featurized_data.show(5)

In [None]:
from pyspark.ml.feature import IDF
# inverse document frequency
idf = IDF(inputCol="raw_features", outputCol="features")

idf_vectorizer = idf.fit(featurized_data)

# converting text to vectors
rescaled_data = idf_vectorizer.transform(featurized_data)

rescaled_data.show(5)

In [None]:
rescaled_data.select('raw_features').show(1, truncate=False)
rescaled_data.select('features').show(1, truncate=False)

# Training a multinomial logistic regression

In [None]:
(train, test) = rescaled_data.randomSplit([0.75, 0.25])
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

In [None]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(featuresCol='features',
                        labelCol='label',
                        family="multinomial",
                        regParam=0.3,
                        elasticNetParam=0,
                        maxIter=20)

lrModel = lr.fit(train)

# Prediction and evaluation

In [None]:
# predict on test data
predictions = lrModel.transform(test)

In [None]:
predictions.show(3)

In [None]:
predictions.select("text", 'probability').show()

In [None]:
predictions.select("text", 'probability', 'prediction', 'label').show()

In [None]:
# accuracy flag
predictions = predictions.withColumn('correctFlag', (col('label') == col('prediction')))

In [None]:
from pyspark.sql.functions import avg
from pyspark.sql.types import FloatType
# accuracy
predictions.select(avg(col('correctFlag').cast(FloatType())).alias('accuracy')).show()

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
# labels = ["World", "Sports", "Business","Science"]

# take only the predictions
preds_and_labels = predictions.select(['prediction','label']) \
                              .withColumn('label', col('label') \
                              .cast(FloatType())) \
                              .orderBy('prediction')

# generate confusion matrix counts
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

In [None]:
# confusion matrix
metrics.confusionMatrix().toArray()

# Pipelining, from start to finish

In [None]:
# load dataset
df = spark.read.csv("agnews.csv", inferSchema=True, header=True)

def arrangeColumns(df):
  # Renaming 'Class Index' col to 'label'
  df = df.withColumnRenamed('Class Index', 'label')

  # Add a new column 'text' by joining 'Title' and 'Description'
  df = df.withColumn("text", concat_ws(" ", "Title", 'Description'))

  # Select new text feature and labels
  df = df.select('label', 'text')
  return df

df = arrangeColumns(df)

# tokenizer
tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")

# stopwords
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# term frequency
hashing_tf = HashingTF(inputCol="filtered",
                       outputCol="raw_features",
                       numFeatures=16384)

# Inverse Document Frequency
idf = IDF(inputCol="raw_features", outputCol="features")

# model
lr = LogisticRegression(featuresCol='features',
                        labelCol='label',
                        family="multinomial",
                        regParam=0.3,
                        elasticNetParam=0,
                        maxIter=20)


In [None]:
# Put everything in pipeline
pipeline = Pipeline(stages=[tokenizer,
                            stopwords_remover,
                            hashing_tf,
                            idf,
                            lr])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df)

# transform add train
dataset = pipelineFit.transform(df)

In [None]:
# accuracy
dataset = dataset.withColumn('correctFlag', (col('label') == col('prediction')))
dataset.select(avg(col('correctFlag').cast(FloatType())).alias('accuracy')).show()