# (EX) News article processing (with ML pipeline)

In [None]:
# spark setup, only if you need to specify your paths

import os
# Machine-specific Windows install locations for the JDK and Spark;
# edit these paths (or skip this cell) to match your own environment.
os.environ.update({
    "JAVA_HOME": r"C:\Program Files\Java\jdk-11.0.2",
    "SPARK_HOME": r"C:\Program Files\Spark\spark-3.5.5-bin-hadoop3",
})

In [None]:
# findspark.init() locates the Spark installation (e.g. via SPARK_HOME) and
# makes the `pyspark` package importable in this kernel.
# NOTE(review): exact lookup order is defined by findspark — see its README.
import findspark
findspark.init()

# `agnews` Dataset

In [1]:
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/agnews.csv -O

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0 29.3M    0  222k    0     0   137k      0  0:03:38  0:00:01  0:03:37  137k
 10 29.3M   10 3146k    0     0  1208k      0  0:00:24  0:00:02  0:00:22 1208k
 19 29.3M   19 5975k    0     0  1655k      0  0:00:18  0:00:03  0:00:15 1655k
 26 29.3M   26 7938k    0     0  1722k      0  0:00:17  0:00:04  0:00:13 1722k
 32 29.3M   32 9712k    0     0  1729k      0  0:00:17  0:00:05  0:00:12 2305k
 35 29.3M   35 10.3M    0     0  1608k      0  0:00:18  0:00:06  0:00:12 2085k
 43 29.3M   43 12.6M    0     0  1699k      0  0:00:17  0:00:07  0:00:10 1955k
 49 29.3M   49 14.5M    0     0  1728k      0  0:00:17  0:00:08  0:00:09 1780k
 55 29.3M   55 16.4M    0     0  1750k      0  0:00

# Pipelining with PySpark MLlib

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline # pipeline to transform data


In [None]:
# Get (or create) a Spark session named "AG news" with the local[*] master.
spark = SparkSession.builder.master("local[*]").appName("AG news").getOrCreate()

# Handle to the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
# Read agnews.csv from the working directory: the first row is treated as the
# header and Spark is asked to infer each column's type.
df = spark.read.options(header=True, inferSchema=True).csv("agnews.csv")

In [None]:
# Preview the first 20 rows of the raw data.
df.show(n=20)

# Arrange columns

In [None]:
from pyspark.sql.functions import concat_ws, col # concat_ws concatenates columns; col references a column by name

# Reshape the raw frame in one chain:
#   1. rename 'Class Index' to 'label'
#   2. build 'text' by joining Title and Description with a single space
#   3. keep only the two columns used downstream
df = (
    df.withColumnRenamed('Class Index', 'label')
      .withColumn('text', concat_ws(" ", "Title", "Description"))
      .select('label', 'text')
)

df.show(10)

# Tokenize

In [None]:
from pyspark.ml.feature import RegexTokenizer # tokenizer

# Turn each article's 'text' into a list of tokens in a new 'words' column,
# using the regex \W (any non-word character) as the tokenizer pattern.
tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol='text',
    outputCol='words',
)

df = tokenizer.transform(df)

# Compare the original text with its token list for a few rows.
df.select('label', 'text', 'words').show(5)

# Stopwords

In [None]:
from pyspark.ml.feature import StopWordsRemover

# Drop stop words from 'words' (using StopWordsRemover's default word list)
# and store the remaining tokens in 'filtered'.
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

df = stopwords_remover.transform(df)

# Show the tokens before and after stop-word removal.
df.select('label', 'text', 'words', 'filtered').show(5)

# Term frequency, Inverse document frequency

In [None]:
from pyspark.ml.feature import HashingTF

# Term frequency per article (row): hash each filtered token into one of
# 16384 buckets and count occurrences, producing 'raw_features'.
hashing_tf = HashingTF(
    numFeatures=16384,
    inputCol="filtered",
    outputCol="raw_features",
)

featurized_data = hashing_tf.transform(df)

# Inspect the resulting term-frequency vectors.
featurized_data.select('raw_features').show(5)

In [None]:
from pyspark.ml.feature import IDF

# Inverse document frequency: learn IDF weights from the term-frequency
# vectors, then rescale 'raw_features' into 'features'.
idf = IDF(
    outputCol="features",
    inputCol="raw_features",
)

# fit() computes the weights from the data; the fitted model applies them.
idf_vectorizer = idf.fit(featurized_data)
rescaled_data = idf_vectorizer.transform(featurized_data)

rescaled_data.show(5)

In [None]:
# Print two untruncated rows of the term-frequency vectors, then of the
# IDF-rescaled vectors, so they can be compared.
for column in ('raw_features', 'features'):
    rescaled_data.select(column).show(2, truncate=False)

# Training a multinomial logistic regression

In [None]:
# Seeded 75% / 25% random split into training and test sets.
train, test = rescaled_data.randomSplit(weights=[0.75, 0.25], seed=42)

# Number of rows that landed in the training split.
train.count()

In [None]:
# Print the column names and types of the featurized DataFrame.
rescaled_data.printSchema()

In [None]:
from pyspark.ml.classification import LogisticRegression

# Multinomial logistic regression on the 'features' vectors.
# regParam sets the regularization strength; elasticNetParam=0 selects a
# pure L2 penalty (per the Spark ML documentation).
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    family='multinomial',
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)

# Train on the training split only.
lrModel = lr.fit(train)

# Prediction and evaluation

In [None]:
# Score the held-out test split with the fitted model; the result carries the
# model's output columns (the 'probability' and 'prediction' columns are
# inspected in the following cells).
predictions = lrModel.transform(test)

In [None]:
# Show the class-probability vector and the predicted class for five test rows, untruncated.
predictions.select('probability', 'prediction').show(5, truncate=False)

In [None]:
# Boolean column: True where the predicted class equals the true label.
predictions = predictions.withColumn(
    'correctFlag',
    col('label') == col('prediction'),
)

In [None]:
from pyspark.sql.functions import avg
from pyspark.sql.types import FloatType

# Accuracy = mean of correctFlag after casting the boolean to a float
# (the output column was previously misspelled 'acurracy').
predictions.select(
    avg(col('correctFlag').cast(FloatType())).alias('accuracy')
).show()

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
# Class names kept for reference (not used by the code below):
# labels = ["World", "Sports", "Business","Science"]

# Keep only the (prediction, label) pair for each test row, cast the label
# to float, and sort by predicted class.
preds_and_labels = (
    predictions
    .select('prediction', 'label')
    .withColumn('label', col('label').cast(FloatType()))
    .orderBy('prediction')
)

preds_and_labels.show(5)

In [None]:
# Build the RDD-based metrics object used for the confusion matrix:
# each Row is converted to a plain (prediction, label) tuple first.
metrics = MulticlassMetrics(
    predictionAndLabels=preds_and_labels.rdd.map(tuple),
)

In [None]:
# Display the confusion matrix as a dense array.
metrics.confusionMatrix().toArray()

# Pipelining, from start to finish

In [None]:
# Start over from the raw CSV so the pipeline section does not depend on the
# step-by-step transformations applied to `df` earlier in the notebook.
df = spark.read.options(header=True, inferSchema=True).csv("agnews.csv")

def arrangeColumns(df):
  """Reduce the raw news DataFrame to the two columns used for modelling.

  Renames 'Class Index' to 'label', builds 'text' by joining 'Title' and
  'Description' with a single space, and returns a DataFrame holding only
  'label' and 'text'.
  """
  return (
      df.withColumnRenamed('Class Index', 'label')
        .withColumn("text", concat_ws(" ", "Title", 'Description'))
        .select('label', 'text')
  )

# Prepare the (label, text) frame that the pipeline will consume.
df = arrangeColumns(df)

# The stages below are only configured here; nothing is fitted or applied yet.

# 1. tokenizer: 'text' -> 'words', using the regex \W as the pattern
tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="text",
    outputCol="words",
)

# 2. stop-word removal: 'words' -> 'filtered'
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# 3. term frequency: 'filtered' -> 'raw_features' (16384 hash buckets)
hashing_tf = HashingTF(
    numFeatures=16384,
    inputCol="filtered",
    outputCol="raw_features",
)

# 4. inverse document frequency: 'raw_features' -> 'features'
idf = IDF(
    outputCol="features",
    inputCol="raw_features",
)

# 5. model: multinomial logistic regression on 'features'
lr = LogisticRegression(
    labelCol='label',
    featuresCol='features',
    family="multinomial",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)



In [None]:
# Chain every stage, in order, into a single Pipeline estimator.
pipeline = Pipeline(
    stages=[tokenizer, stopwords_remover, hashing_tf, idf, lr],
)

# Fit all stages on `df`.
# NOTE(review): `df` is the full dataset here, with no train/test split, so
# the outputs below are in-sample; consider splitting before fit().
pipelineFit = pipeline.fit(df)

# Apply the fitted pipeline to the same documents it was fitted on.
dataset = pipelineFit.transform(df)