In [1]:

from pyspark.ml import Pipeline # pipeline to transform data
from pyspark.sql import SparkSession # to initiate spark
from pyspark.sql.types import FloatType
from pyspark.ml.feature import RegexTokenizer # tokenizer
from pyspark.ml.feature import HashingTF, IDF # vectorizer
from pyspark.ml.feature import StopWordsRemover # to remove stop words
from pyspark.sql.functions import concat_ws, col # to concatinate cols
from pyspark.ml.classification import LogisticRegression # ml model
from pyspark.ml.evaluation import MulticlassClassificationEvaluator # to evaluate the model
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import DoubleType

In [2]:
# Start (or reuse) a Spark session running in local mode on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("news-classification")
    .getOrCreate()
)

# Bare expression so the notebook renders the session summary.
spark

In [3]:
# Load the news CSV, using the first line as the header and letting Spark infer column types.
# NOTE(review): later cells show 'Class Index' comes back as a string and yields
# nulls once cast to double, which suggests some rows are not parsed as intended —
# confirm the quote/escape/multi-line settings against the file.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("data/news-dataset.csv")
)
df.show(n=5)

+-----------+--------------------+--------------------+
|Class Index|               Title|         Description|
+-----------+--------------------+--------------------+
|          3|Wall St. Bears Cl...|Reuters - Short-s...|
|          3|Carlyle Looks Tow...|Reuters - Private...|
|          3|Oil and Economy C...|Reuters - Soaring...|
|          3|Iraq Halts Oil Ex...|Reuters - Authori...|
|          3|Oil prices soar t...|AFP - Tearaway wo...|
+-----------+--------------------+--------------------+
only showing top 5 rows



In [50]:
# Count the rows whose 'Class Index' value is missing.
df.where(col('Class Index').isNull()).count()

0

In [4]:
# Reshape the frame into the two columns the model needs:
#   label - the renamed 'Class Index' column
#   Text  - 'Title' and 'Description' joined with a single space
# The original text columns are dropped by the final select.
df = (
    df.withColumnRenamed('Class Index', 'label')
      .withColumn("Text", concat_ws(" ", "Title", 'Description'))
      .select('label', 'Text')
)

In [27]:
# Peek at the first row after the column clean-up.
df.head()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

In [5]:
# Break each article into a list of words, splitting on non-word characters.
tokenizer = RegexTokenizer(pattern="\\W", inputCol="Text", outputCol="words")

# Append the token list to df as the new 'words' column.
df = tokenizer.transform(df)

df.select('label', 'Text', 'words').show(n=5)

+-----+--------------------+--------------------+
|label|                Text|               words|
+-----+--------------------+--------------------+
|    3|Wall St. Bears Cl...|[wall, st, bears,...|
|    3|Carlyle Looks Tow...|[carlyle, looks, ...|
|    3|Oil and Economy C...|[oil, and, econom...|
|    3|Iraq Halts Oil Ex...|[iraq, halts, oil...|
|    3|Oil prices soar t...|[oil, prices, soa...|
+-----+--------------------+--------------------+
only showing top 5 rows



In [6]:
# Filter common stop words (e.g. "is", "the", "in") out of the token lists.
stopwords_remover = StopWordsRemover(outputCol="filtered", inputCol="words")

# Append the cleaned token list to df as the new 'filtered' column.
df = stopwords_remover.transform(df)

df.select('label', 'Text', 'words', 'filtered').head()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", words=['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], filtered=['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 'dwindling', 'band', 'ultra', 'cynics', 'seeing', 'green'])

In [7]:
# Term frequency: hash every filtered token into a 10,000-dimensional vector
# and count occurrences per article.
hashing_tf = HashingTF(numFeatures=10000,
                       inputCol="filtered",
                       outputCol="raw_features")

# The result goes into a new frame that carries the extra 'raw_features' column.
featurized_data = hashing_tf.transform(df)

In [8]:
# Inspect the first row, including its sparse term-frequency vector.
featurized_data.head()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", words=['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], filtered=['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 'dwindling', 'band', 'ultra', 'cynics', 'seeing', 'green'], raw_features=SparseVector(10000, {551: 1.0, 662: 1.0, 1262: 1.0, 1449: 1.0, 1889: 1.0, 1948: 1.0, 2503: 1.0, 2826: 1.0, 3038: 1.0, 3684: 1.0, 4443: 1.0, 6404: 2.0, 8318: 1.0, 8430: 1.0, 8450: 2.0, 9430: 1.0}))

In [9]:
# Inverse document frequency: rescales the raw term counts into 'features'.
idf = IDF(outputCol="features", inputCol="raw_features")

# Learn the document frequencies from the term-frequency vectors.
# NOTE(review): this fit uses the full dataset, before the train/test split made
# in a later cell — consider fitting on the training split only.
idf_vectorizer = idf.fit(featurized_data)

# Turn the raw term frequencies into TF-IDF feature vectors.
rescaled_data = idf_vectorizer.transform(featurized_data)

# Inspect the first row only.
rescaled_data.select("label", 'Text', 'words', 'filtered', "features").head()

Row(label='3', Text="Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.", words=['wall', 'st', 'bears', 'claw', 'back', 'into', 'the', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 's', 'dwindling', 'band', 'of', 'ultra', 'cynics', 'are', 'seeing', 'green', 'again'], filtered=['wall', 'st', 'bears', 'claw', 'back', 'black', 'reuters', 'reuters', 'short', 'sellers', 'wall', 'street', 'dwindling', 'band', 'ultra', 'cynics', 'seeing', 'green'], features=SparseVector(10000, {551: 5.0673, 662: 6.0128, 1262: 4.8019, 1449: 5.9636, 1889: 6.9275, 1948: 4.9649, 2503: 7.6128, 2826: 5.9398, 3038: 4.6635, 3684: 4.3958, 4443: 5.7126, 6404: 9.1681, 8318: 8.034, 8430: 3.394, 8450: 4.6006, 9430: 6.4834}))

In [11]:
# Count the rows of the TF-IDF frame that have no label.
rescaled_data.where(col('label').isNull()).count()

0

In [12]:
# Collapse the data to a single partition before splitting.
# NOTE(review): presumably done to keep the row layout stable between runs; it
# also means downstream stages work on one partition — confirm this is intended.
rescaled_data = rescaled_data.coalesce(1)

# 80/20 train/test split. The fixed seed makes the split, and therefore the
# trained model and the reported accuracy, repeatable on a full re-run.
(train, test) = rescaled_data.randomSplit([0.80, 0.20], seed=42)

In [13]:
# List any training rows whose label is missing.
train_with_nulls = train.where(col("label").isNull())
train_with_nulls.show()

+-----+----+-----+--------+------------+--------+
|label|Text|words|filtered|raw_features|features|
+-----+----+-----+--------+------------+--------+
+-----+----+-----+--------+------------+--------+



In [72]:
# Report how many rows landed in each split.
print(f"Training Dataset Count: {train.count()}")
print(f"Test Dataset Count: {test.count()}")

Training Dataset Count: 103720
Test Dataset Count: 25813


In [14]:
# Convert the label column to double in both splits.
# The null count in the following cell shows that some label values do not
# survive this cast and end up as null.
train, test = (
    split.withColumn("label", col("label").cast(DoubleType()))
    for split in (train, test)
)

In [15]:
# Count the training rows whose label became null after the cast to double.
train.where(col('label').isNull()).count()

3881

In [18]:
# Discard every row that holds a null in any column, in both splits.
train, test = train.dropna(), test.dropna()
train.show(n=5)

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|label|                Text|               words|            filtered|        raw_features|            features|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+
|  1.0| #36;350,000 Siph...|[36, 350, 000, si...|[36, 350, 000, si...|(10000,[157,524,5...|(10000,[157,524,5...|
|  1.0| #36;71M Judgment...|[36, 71m, judgmen...|[36, 71m, judgmen...|(10000,[531,633,1...|(10000,[531,633,1...|
|  1.0| #39;6-Way Talks ...|[39, 6, way, talk...|[39, 6, way, talk...|(10000,[376,446,5...|(10000,[376,446,5...|
|  1.0| #39;70,000 Darfu...|[39, 70, 000, dar...|[39, 70, 000, dar...|(10000,[55,130,49...|(10000,[55,130,49...|
|  1.0| #39;9-11 helper ...|[39, 9, 11, helpe...|[39, 9, 11, helpe...|(10000,[132,626,6...|(10000,[132,626,6...|
+-----+--------------------+--------------------+--------------------+--------------------+-----

In [20]:
# Confirm that no null labels remain in the training split.
train.where(col('label').isNull()).count()

0

In [21]:
# Multinomial logistic regression on the TF-IDF features.
# elasticNetParam=0 selects a pure L2 penalty, weighted by regParam.
lr = LogisticRegression(
    maxIter=50,
    regParam=0.3,
    elasticNetParam=0,
    family="multinomial",
    labelCol='label',
    featuresCol='features',
)

# Fit the classifier on the training split.
lrModel = lr.fit(train)

In [22]:
# Score the held-out test split with the fitted model; the resulting frame is
# what the following cells read 'probability' and 'prediction' from.
predictions = lrModel.transform(test)

In [23]:
# Display the first 10 test rows next to their class probabilities,
# predicted class and true label.
predictions.select(["Text", "probability", "prediction", "label"]).show(n=10)

+--------------------+--------------------+----------+-----+
|                Text|         probability|prediction|label|
+--------------------+--------------------+----------+-----+
| #39;Batman #39; ...|[9.94715584818671...|       1.0|  1.0|
| #39;Black boxes ...|[1.00056389661963...|       1.0|  1.0|
| #39;Chemical Ali...|[5.78458034240974...|       1.0|  1.0|
| #39;Deserter #39...|[6.27428663807804...|       1.0|  1.0|
| #39;Ethnic viole...|[6.23124782322524...|       1.0|  1.0|
| #39;Hundreds #39...|[3.14086818099292...|       1.0|  1.0|
| #39;Hurriyat get...|[1.15737679612712...|       4.0|  1.0|
| #39;IAEA may hav...|[6.00777986000207...|       1.0|  1.0|
| #39;Militant #39...|[2.30651369034430...|       1.0|  1.0|
| #39;Miracle #39;...|[1.23534646179662...|       1.0|  1.0|
+--------------------+--------------------+----------+-----+
only showing top 10 rows



In [28]:
# Accuracy: share of test rows whose predicted class equals the true label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy", labelCol="label", predictionCol="prediction")
# Scale the metric to a percentage before printing it with two decimals.
accuracy = 100 * evaluator.evaluate(predictions)
print(f"Accuracy: {round(accuracy, 2)}%")

Accuracy: 88.28%


In [None]:
def process_text(news):
    """Turn one raw news string into the feature vector the classifier expects.

    The text goes through the same steps as the training data: regex
    tokenisation, stop-word removal, hashed term frequencies and IDF rescaling.
    The stop-word remover, the hashing stage and, most importantly, the fitted
    IDF model are the notebook-level objects created in the training cells.

    The IDF model must not be refitted here: fitted on a single document, every
    term appears in 1 of 1 documents, so each weight is log(2/2) = 0 and the
    resulting feature vector is all zeros.

    Args:
        news: The news text (title and/or description) as a single string.

    Returns:
        A one-row DataFrame with the columns 'news', 'words', 'filtered',
        'raw_features' and 'features'.
    """
    df = spark.createDataFrame([(news,)], schema=["news"])

    # Same tokenisation rule as in training; only the input column differs.
    news_tokenizer = RegexTokenizer(inputCol="news", outputCol="words", pattern="\\W")
    tokens = news_tokenizer.transform(df)

    # Reuse the training-time stages so preprocessing cannot drift from training.
    filtered = stopwords_remover.transform(tokens)
    featurized_data = hashing_tf.transform(filtered)

    # Apply the corpus-level IDF weights learned during training.
    return idf_vectorizer.transform(featurized_data)

def predict(news):
    """Classify a single news text with the trained model.

    Args:
        news: The news text as a single string.

    Returns:
        The DataFrame produced by process_text(news) after lrModel.transform
        has been applied to it.
    """
    return lrModel.transform(process_text(news))

# Classify one example article.
# The result is stored under its own name so the test-set `predictions` frame
# used by the evaluation cells is not overwritten.
news_prediction = predict("Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.")

# The single-row frame holds the raw text in 'news' and has no 'label' column,
# so only the input text and the model outputs are shown.
news_prediction.select("news", 'probability', 'prediction').show()


In [None]:
# Scratch check: build a one-row, one-column frame and display it.
columns = ["news"]
string_data = [("Hello World",)]
df1 = spark.createDataFrame(data=string_data, schema=columns)
df1.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Display name for each class index.
# NOTE(review): assumes the AG News convention (1=World, 2=Sports, 3=Business,
# 4=Sci/Tech) — confirm against the dataset's documentation. Any class index
# not listed here is shown as its numeric value.
class_names = {1.0: "World", 2.0: "Sports", 3.0: "Business", 4.0: "Sci/Tech"}

# Score the test split here instead of reading the global `predictions`, which
# other cells in this notebook reassign.
# MulticlassMetrics expects (prediction, label) pairs of floats, so both columns
# are cast explicitly.
# NOTE(review): the ordering by prediction is kept from the original cell, whose
# author reported it as required — confirm whether it is actually needed.
preds_and_labels = lrModel.transform(test) \
                          .select(col('prediction').cast(DoubleType()).alias('prediction'),
                                  col('label').cast(DoubleType()).alias('label')) \
                          .orderBy('prediction')

preds_and_labels_rdd = preds_and_labels.rdd.map(lambda row: (row['prediction'], row['label']))

# generate metrics
metrics = MulticlassMetrics(preds_and_labels_rdd)
confusion = metrics.confusionMatrix().toArray()

# Tick labels are derived from the classes actually present, in ascending order
# of class label, which is the order the confusion matrix uses.
class_ids = sorted(row['label'] for row in preds_and_labels.select('label').distinct().collect())
labels = [class_names.get(class_id, str(class_id)) for class_id in class_ids]
assert confusion.shape == (len(labels), len(labels)), \
    "tick labels do not match the confusion matrix size"

# plot confusion matrix (rows: actual classes, columns: predicted classes)
fig, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(confusion,
            cmap='viridis',
            annot=True, fmt='0',
            cbar=False,
            xticklabels=labels,
            yticklabels=labels,
            ax=ax)
ax.set(xlabel="Predicted class", ylabel="Actual class",
       title="Confusion matrix (test split)")
plt.show()

In [None]:
# Scratch check with a tiny hand-made (prediction, label) frame.
# NOTE(review): this reassigns the notebook-level `predictions` variable.
data = [(0.0, 0.0), (1.0, 1.0), (2.0, 2.0), (1.0, 0.0)]
predictions = spark.createDataFrame(data, schema=["prediction", "label"])

# Convert to an RDD of (prediction, label) float pairs.
preds_and_labels_rdd = predictions.rdd.map(
    lambda row: tuple(float(row[name]) for name in ("prediction", "label"))
)

# Trigger an action to check that the RDD can be evaluated.
print(preds_and_labels_rdd.take(5))


In [None]:

# Scratch check: build a small two-column frame and display it.
# NOTE(review): this reassigns the notebook-level `df` (the news dataset), so
# earlier cells must be re-run from the load step after executing this one.
data = [(1, 'a'), (2, 'b'), (3, 'c')]
df = spark.createDataFrame(data, ["id", "value"])
# show() prints the table itself and returns None, so it is not wrapped in
# print(), which would add a stray "None" line to the output.
df.show()
