### Spark-NLP

#### The Data

The UCI ML News Aggregator dataset contains headlines and categories for over 400k news articles. The task is to build a classification model that assigns each news headline to one of the pre-defined categories.
The data can be downloaded from https://www.kaggle.com/uciml/news-aggregator-dataset

In [4]:
from pyspark.ml import Pipeline 
from pyspark.ml.feature import CountVectorizer,StringIndexer, RegexTokenizer,StopWordsRemover
from pyspark.sql.functions import col, udf,regexp_replace,isnull
from pyspark.sql.types import StringType,IntegerType
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [7]:
# Location of the news-aggregator CSV file that is loaded in the next cell.
# NOTE(review): the name says "s3" but the value is a /mnt/ path — presumably
# an S3 bucket mounted on the cluster; confirm.
s3_bucket_path = "/mnt/lp-dataset/news-aggregator/uci-news-aggregator.csv"

In [8]:
# Load the CSV into a DataFrame: the first row supplies the column names and
# Spark infers the column types. The two options are real booleans rather
# than the strings 'True', which only worked through string coercion.
news_data = spark.read.csv(s3_bucket_path, header=True, inferSchema=True)

In [9]:
# Print a preview of the loaded DataFrame.
news_data.show()

In [10]:
# Number of rows in the loaded dataset.
news_data.count()

There are 422937 news articles

Select the columns needed for analysis

In [13]:
# Keep only the headline text and its category label.
title_category = news_data.select(["TITLE", "CATEGORY"])

In [14]:
# Print a preview of the two selected columns.
title_category.show()

Let's check null values in TITLE and CATEGORY columns

In [16]:
def null_value_count(df):
    """Return the columns of ``df`` that contain nulls, with their null counts.

    One Spark count job is run per column of ``df``.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A list of ``(column_name, null_count)`` tuples, in ``df.columns``
        order, containing only the columns with at least one null value.
        The list is empty when no column contains nulls.
    """
    null_columns_counts = []
    for column_name in df.columns:
        null_rows = df.where(col(column_name).isNull()).count()
        if null_rows > 0:
            null_columns_counts.append((column_name, null_rows))
    return null_columns_counts

In [17]:
# (column name, null count) pairs for the columns of title_category that
# contain at least one null.
null_columns_count_list = null_value_count(title_category)


In [18]:
# Show the per-column null counts as a small table. The schema is spelled out
# explicitly because Spark cannot infer column types from an empty list, which
# is what null_value_count returns when no column contains nulls.
spark.createDataFrame(
    null_columns_count_list,
    "Column_With_Null_Value: string, Null_Values_Count: long",
).show()

There are 389 null titles and 516 null categories

Drop the null values

In [21]:
# Discard every row that has a null in any of its columns.
title_category = title_category.na.drop()

In [22]:
# Number of rows left after the null rows were dropped.
title_category.count()

In [23]:
# Preview the cleaned rows without truncating long headlines.
title_category.show(truncate=False)

In [24]:
# Number of distinct category labels. The column is referenced by its exact
# name, CATEGORY, so the lookup does not rely on case-insensitive resolution.
title_category.select("CATEGORY").distinct().count()

Top 20 news categories

In [26]:
# Headline count per category, most frequent first. The column is referenced
# by its exact name, CATEGORY, so the lookup does not rely on
# case-insensitive resolution.
title_category.groupBy("CATEGORY").count().orderBy(col("count").desc()).show(truncate=False)

Top 20 news titles

In [28]:
# Count how often each headline occurs and list the most repeated ones first.
(
    title_category
    .groupBy("TITLE")
    .count()
    .orderBy("count", ascending=False)
    .show(truncate=False)
)

#### Let's clean the dataset

Removing numbers from titles

In [31]:
# Add an "only_str" column holding the headline with every run of digits
# removed. The pattern is a raw string so that "\d" is passed to the regex
# engine as written instead of being an invalid Python escape sequence.
title_category = title_category.withColumn(
    "only_str", regexp_replace(col("TITLE"), r"\d+", "")
)

In [32]:
# Compare the original headlines with their digit-free versions.
title_category.select("TITLE","only_str").show(truncate=False)

Splitting text into words

In [34]:
# Tokenize the digit-free headline ("only_str") into a "words" column, using
# the non-word-character pattern \W.
regex_tokenizer = RegexTokenizer(
    pattern="\\W",
    inputCol="only_str",
    outputCol="words",
)
raw_words = regex_tokenizer.transform(title_category)

In [35]:
# Print a preview of the tokenized rows.
raw_words.show()

Removing stop words

In [37]:
# Filter the remover's default stop-word list out of "words"; the surviving
# tokens are written to a new "filtered" column.
remover = StopWordsRemover(
    outputCol="filtered",
    inputCol="words",
)
words_df = remover.transform(raw_words)

In [38]:
# Compare the token lists before and after stop-word removal.
words_df.select("words","filtered").show(truncate=False)

Let's encode the CATEGORY column as a column of category indices

In [40]:
# Fit a StringIndexer on the CATEGORY labels and add the resulting numeric
# index as "categoryIndex".
indexer = StringIndexer(
    outputCol="categoryIndex",
    inputCol="CATEGORY",
)
feature_data = (
    indexer
    .fit(words_df)
    .transform(words_df)
)

In [41]:
# Show each category label next to the index assigned to it.
feature_data.select("CATEGORY","categoryIndex").show()

Converting text into vectors of token counts.

In [43]:
# Turn each row's filtered token list into a vector of token counts.
# The vectorizer is fitted on and applied to feature_data, the DataFrame that
# carries both the "filtered" tokens and the "categoryIndex" label used by
# the model below.
cv = CountVectorizer(inputCol="filtered", outputCol="features")
model = cv.fit(feature_data)
countVectorizer_feateures = model.transform(feature_data)

Partition Training & Test sets

In [45]:
# 80/20 train/test split; the fixed seed keeps the split reproducible.
trainingData, testData = countVectorizer_feateures.randomSplit(
    [0.8, 0.2],
    seed=11,
)

#### Model Training and Evaluation

In [47]:
# Train a multinomial Naive Bayes classifier on the token-count features and
# score the held-out test set.
nb = NaiveBayes(
    featuresCol="features",
    labelCol="categoryIndex",
    modelType="multinomial",
)
nbModel = nb.fit(trainingData)
nb_predictions = nbModel.transform(testData)

In [48]:
# Show five test rows: predicted index, true index and the feature vector.
nb_predictions.select("prediction", "categoryIndex", "features").show(5)

In [49]:
# Measure accuracy on the test predictions and report it together with the
# complementary error rate.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="categoryIndex",
    predictionCol="prediction",
)
nb_accuracy = evaluator.evaluate(nb_predictions)
print("Accuracy of NaiveBayes is = %g" % nb_accuracy)
print("Test Error of NaiveBayes = %g " % (1.0 - nb_accuracy))