# (EX) News article processing (with ML pipeline)

# `agnews` Dataset

In [None]:
!curl https://raw.githubusercontent.com/mosesyhc/de300-2025sp-class/refs/heads/main/agnews.csv -O

# Pipelining with PySpark MLlib

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline # pipeline to transform data


In [None]:
# Start (or reuse) a Spark session named "AG news" on the local[*] master.
spark = SparkSession.builder.master("local[*]").appName("AG news").getOrCreate()

# Keep a handle on the session's underlying SparkContext.
sc = spark.sparkContext

In [None]:
# load dataset
# TODO(exercise): the assignment below is intentionally left blank; complete it
# by reading agnews.csv into a Spark DataFrame (e.g. with spark.read.csv).
df = 

# Arrange columns

In [None]:
from pyspark.sql.functions import concat_ws, col # to concatenate cols

# renaming 'Class Index' col to 'label'

# concatenating texts

# preview the first 10 rows of the DataFrame
df.show(10)

# Tokenize

In [None]:
from pyspark.ml.feature import RegexTokenizer # tokenizer

# convert sentences to list of words

# Stopwords

In [None]:
from pyspark.ml.feature import StopWordsRemover

# remove stopwords

# Term frequency, Inverse document frequency

In [None]:
from pyspark.ml.feature import HashingTF

# calculate term frequency in each article (row)


In [None]:
from pyspark.ml.feature import IDF

# inverse document frequency

In [None]:
# Print the first two rows of the 'raw_features' and 'features' columns
# without truncating the cell contents.
# NOTE(review): `rescaled_data` is not defined in the visible cells; presumably
# it is the output of the IDF step left blank above — confirm.
rescaled_data.select('raw_features').show(2, truncate=False)
rescaled_data.select('features').show(2, truncate=False)

# Training a multinomial logistic regression

In [None]:
# split data into training and testing


In [None]:
from pyspark.ml.classification import LogisticRegression


# Prediction and evaluation

In [None]:
# predict on test data
# NOTE(review): `lrModel` and `test` are not defined in the visible cells;
# presumably they come from the (blank) split/training cells above — confirm.
predictions = lrModel.transform(test)

In [None]:
from pyspark.sql.functions import avg
from pyspark.sql.types import FloatType

# accuracy calculation

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
# labels = ["World", "Sports", "Business","Science"]

# take only the predictions
# TODO(exercise): the assignment below is intentionally left blank.
# NOTE(review): presumably this should hold (prediction, label) pairs taken
# from `predictions` in the form MulticlassMetrics accepts — confirm against
# the PySpark documentation.
preds_and_labels = 


In [None]:
# confusion matrix
# TODO(exercise): the assignment below is intentionally left blank.
# NOTE(review): presumably a MulticlassMetrics object built from
# `preds_and_labels`, from which the confusion matrix is read — confirm.
metrics = 

# Pipelining, from start to finish

In [None]:
# Read agnews.csv, treating the first row as the header and letting Spark
# infer the column types.
df = spark.read.csv("agnews.csv", header=True, inferSchema=True)

def arrangeColumns(df):
    """Return a ('label', 'text') projection of the raw news DataFrame.

    The 'Class Index' column is renamed to 'label', and the 'Title' and
    'Description' columns are joined with a single space into a new 'text'
    column. Only 'label' and 'text' are kept in the result.
    """
    # Rename the class column so downstream stages can refer to 'label'.
    labelled = df.withColumnRenamed('Class Index', 'label')

    # Build the combined text feature from the title and the description.
    with_text = labelled.withColumn(
        "text", concat_ws(" ", "Title", 'Description')
    )

    # Drop everything except the label and the new text feature.
    return with_text.select('label', 'text')

# Reduce the raw frame to the 'label' and 'text' columns used by the stages.
df = arrangeColumns(df)

# Stage 1 - tokenizer: 'text' -> 'words', using the regex \W (non-word
# characters) as the pattern.
tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="text",
    outputCol="words",
)

# Stage 2 - stopword removal: 'words' -> 'filtered'.
stopwords_remover = StopWordsRemover(
    inputCol="words",
    outputCol="filtered",
)

# Stage 3 - term frequency: 'filtered' -> 'raw_features', hashed into
# 16384 features.
hashing_tf = HashingTF(
    numFeatures=16384,
    inputCol="filtered",
    outputCol="raw_features",
)

# Stage 4 - inverse document frequency: 'raw_features' -> 'features'.
idf = IDF(
    inputCol="raw_features",
    outputCol="features",
)

# Stage 5 - multinomial logistic regression on 'features' against 'label',
# limited to 20 iterations, with regParam=0.3 and elasticNetParam=0.
lr = LogisticRegression(
    family="multinomial",
    featuresCol="features",
    labelCol="label",
    maxIter=20,
    regParam=0.3,
    elasticNetParam=0,
)



In [None]:
# Chain the four feature stages and the classifier into a single Pipeline.
pipeline = Pipeline(
    stages=[tokenizer, stopwords_remover, hashing_tf, idf, lr],
)

# Fit all stages on `df`. Note that this is the full DataFrame loaded in the
# previous cell; no train/test split is applied in this section.
pipelineFit = pipeline.fit(df)

# Apply the fitted pipeline back to the same `df`.
dataset = pipelineFit.transform(df)