In [22]:

from pyspark.ml import Pipeline # pipeline to transform data
from pyspark.sql import SparkSession # to initiate spark
from pyspark.sql.types import FloatType
from pyspark.ml.feature import RegexTokenizer # tokenizer
from pyspark.ml.feature import HashingTF, IDF # vectorizer
from pyspark.ml.feature import StopWordsRemover # to remove stop words
from pyspark.sql.functions import concat_ws, col # to concatinate cols
from pyspark.ml.classification import LogisticRegression # ml model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # to evaluate the model
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import DoubleType

In [23]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("news-classification") \
    .getOrCreate()


spark

In [49]:
df = spark.read.csv("data/news-dataset.csv", inferSchema=True, header=True)
df.show(5)

+-----------+--------------------+--------------------+
|Class Index|               Title|         Description|
+-----------+--------------------+--------------------+
|          3|Wall St. Bears Cl...|Reuters - Short-s...|
|          3|Carlyle Looks Tow...|Reuters - Private...|
|          3|Oil and Economy C...|Reuters - Soaring...|
|          3|Iraq Halts Oil Ex...|Reuters - Authori...|
|          3|Oil prices soar t...|AFP - Tearaway wo...|
+-----------+--------------------+--------------------+
only showing top 5 rows



In [50]:
#show 5 first rows with null values
df.filter(df['Class Index'].isNull()).count()

0

In [51]:
# Renaming 'Class Index' col to 'label'
df = df.withColumnRenamed('Class Index', 'label')

# Add a new column 'Text' by concatinating 'Title' and 'Description'
df = df.withColumn("Text", concat_ws(" ", "Title", 'Description'))

# Remove old text columns
df = df.select('label', 'Text')

In [27]:
df.first()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [56]:
# convert sentences to list of words
tokenizer = RegexTokenizer(inputCol="Text", outputCol="words", pattern="\\W")

# adds a column 'words' to df after tokenization
df = tokenizer.transform(df)

df.select(['label','Text', 'words']).show(5)

+-----+--------------------+--------------------+
|label|                Text|               words|
+-----+--------------------+--------------------+
|    3|Wall St. Bears Cl...|[wall, st, bears,...|
|    3|Carlyle Looks Tow...|[carlyle, looks, ...|
|    3|Oil and Economy C...|[oil, and, econom...|
|    3|Iraq Halts Oil Ex...|[iraq, halts, oil...|
|    3|Oil prices soar t...|[oil, prices, soa...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [57]:
# to remove stop words like is, the, in, etc.
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")

# adds a column 'filtered' to df without stopwords
df = stopwords_remover.transform(df)

df.select(['label','Text', 'words', 'filtered']).first()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", words=['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], filtered=['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 'dwindling', 'band', 'ultra', 'cynics', 'seeing', 'green'])

In [58]:
# Calculate term frequency in each article
hashing_tf = HashingTF(inputCol="filtered",
                       outputCol="raw_features", 
                       numFeatures=10000)

# adds raw tf features to df
featurized_data = hashing_tf.transform(df)

In [59]:
featurized_data.first()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", words=['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], filtered=['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 'dwindling', 'band', 'ultra', 'cynics', 'seeing', 'green'], raw_features=SparseVector(10000, {551: 1.0, 662: 1.0, 1262: 1.0, 1449: 1.0, 1889: 1.0, 1948: 1.0, 2503: 1.0, 2826: 1.0, 3038: 1.0, 3684: 1.0, 4443: 1.0, 6404: 2.0, 8318: 1.0, 8430: 1.0, 8450: 2.0, 9430: 1.0}))

In [61]:
# Inverse document frequency
idf = IDF(inputCol="raw_features", outputCol="features")

idf_vectorizer = idf.fit(featurized_data)

# converting text to vectors
rescaled_data = idf_vectorizer.transform(featurized_data)

# top 20 rows
rescaled_data.select("label",'Text', 'words', 'filtered', "features").first()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", words=['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], filtered=['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 'dwindling', 'band', 'ultra', 'cynics', 'seeing', 'green'], features=SparseVector(10000, {551: 5.0673, 662: 6.0128, 1262: 4.8019, 1449: 5.9636, 1889: 6.9275, 1948: 4.9649, 2503: 7.6128, 2826: 5.9398, 3038: 4.6635, 3684: 4.3958, 4443: 5.7126, 6404: 9.1681, 8318: 8.034, 8430: 3.394, 8450: 4.6006, 9430: 6.4834}))

In [67]:
rescaled_data.filter(rescaled_data['label'].isNull()).count()

0

In [94]:
rescaled_data = rescaled_data.coalesce(1)
(train, test) = rescaled_data.randomSplit([0.80, 0.20])

In [95]:
train.filter(train['label'].isNull()).count()

0

In [96]:
train_with_nulls = train.filter(train["label"].isNull())
train_with_nulls.show()

+-----+----+-----+--------+------------+--------+
|label|Text|words|filtered|raw_features|features|
+-----+----+-----+--------+------------+--------+
+-----+----+-----+--------+------------+--------+



In [72]:
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 103720
Test Dataset Count: 25813


In [97]:
train = train.withColumn("label", train["label"].cast(DoubleType()))
test = test.withColumn("label", test["label"].cast(DoubleType()))

In [99]:
train.filter(train['label'].isNull()).count()
test.filter(test['label'].isNull()).count()

975

In [100]:
#remove nulls from train
train = train.na.drop()
test = test.na.drop()
train.show(5)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|                Text|               words|            filtered|        raw_features|            features|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  1.0| #36;350,000 Siph...|[36, 350, 000, si...|[36, 350, 000, si...|(10000,[157,524,5...|(10000,[157,524,5...|
|  1.0| #36;71M Judgment...|[36, 71m, judgmen...|[36, 71m, judgmen...|(10000,[531,633,1...|(10000,[531,633,1...|
|  1.0| #39;70,000 Darfu...|[39, 70, 000, dar...|[39, 70, 000, dar...|(10000,[55,130,49...|(10000,[55,130,49...|
|  1.0| #39;9-11 helper ...|[39, 9, 11, helpe...|[39, 9, 11, helpe...|(10000,[132,626,6...|(10000,[132,626,6...|
|  1.0| #39;9/11 #39; PL...|[39, 9, 11, 39, p...|[39, 9, 11, 39, p...|(10000,[516,648,9...|(10000,[516,648,9...|
+-----+--------------------+--------------------+--------------------+--------------------+-----

In [103]:
lr = LogisticRegression(featuresCol='features',
                        labelCol='label',
                        family="multinomial",
                        regParam=0.3,
                        elasticNetParam=0,
                        maxIter=50)

lrModel = lr.fit(train)

# get predictions for test set
predictions = lrModel.transform(test)

# show top 20 predictions
predictions.select("Text", 'probability','prediction', 'label').show()

+--------------------+--------------------+----------+-----+
|                Text|         probability|prediction|label|
+--------------------+--------------------+----------+-----+
| #39;6-Way Talks ...|[8.73073472745514...|       1.0|  1.0|
| #39;Arms photo r...|[1.07704484254278...|       1.0|  1.0|
| #39;Batman #39; ...|[9.90718118605804...|       1.0|  1.0|
| #39;Batman #39; ...|[6.06380393164773...|       1.0|  1.0|
| #39;Batman dad i...|[1.11546818626372...|       1.0|  1.0|
| #39;Bin Laden #3...|[5.36456157147562...|       1.0|  1.0|
| #39;Black Widows...|[4.68598376297795...|       1.0|  1.0|
| #39;I only wish ...|[9.03125373304269...|       1.0|  1.0|
| #39;Jackal #39; ...|[1.02234890493050...|       1.0|  1.0|
| #39;Mutilated bo...|[4.23915322208111...|       1.0|  1.0|
| #39;Passion #39;...|[1.06412856826629...|       1.0|  1.0|
| #39;Piano Teache...|[5.08437873723869...|       4.0|  1.0|
| #39;Pirate #39; ...|[7.91468659029856...|       1.0|  1.0|
| #39;Ransom deman...|[4

In [104]:
#show unique value of label column of train
train.select("label").distinct().show()

+-----+
|label|
+-----+
|  1.0|
|  2.0|
|  3.0|
|  4.0|
|  5.0|
|  6.0|
|  7.0|
|  8.0|
+-----+

