### This notebook performs text cleaning and preprocessing and runs two models to predict whether an article is reliable (real) or unreliable (fake).

In [1]:
%%bash
apt-get update -qq
apt-get install -y openjdk-8-jdk-headless -qq > /dev/null
wget -q "https://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop2.7.tgz" > /dev/null
tar -xvf spark-3.1.1-bin-hadoop2.7.tgz > /dev/null

pip install pyspark findspark --quiet
pip install sparknlp --quiet

     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 310.8/310.8 MB 3.5 MB/s eta 0:00:00
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 489.4/489.4 kB 26.7 MB/s eta 0:00:00


In [2]:
# imports
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import ArrayType, StringType
from pyspark.sql.functions import array, col, udf, split
from pyspark.ml import Pipeline
from sparknlp.annotator import Lemmatizer, LemmatizerModel, Tokenizer, StopWordsCleaner, Normalizer
from sparknlp.base import DocumentAssembler, Finisher

from pyspark.ml.classification import LogisticRegression, GBTClassifier
import pyspark.sql.functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import pyspark.ml.evaluation as evals
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# setup Spark
import os
import sys

os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

# Find Spark so that we can access session within our notebook
import findspark
findspark.init()

# Start SparkSession on all available cores
from pyspark.sql import SparkSession
# spark = SparkSession.builder.master("local[*]").getOrCreate()
spark = SparkSession.builder \
    .master("local[*]")\
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:4.3.2")\
    .getOrCreate()

In [5]:
# read in data
data = spark.read.csv('train1.csv',
                      header='true',
                      inferSchema='true',
                      multiLine=True)

In [6]:
print('Total Columns: %d' % len(data.dtypes))
print('Total Rows: %d' % data.count())
data.printSchema()

Total Columns: 5
Total Rows: 6288
root
 |-- id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- author: string (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



In [7]:
data.show()

+---+--------------------+--------------------+--------------------+-----+
| id|               title|              author|                text|label|
+---+--------------------+--------------------+--------------------+-----+
|  0|House Dem Aide: W...|       Darrell Lucus|"House Dem Aide: ...|    1|
|  1|FLYNN: Hillary Cl...|     Daniel J. Flynn|Ever get the feel...|    0|
|  2|Why the Truth Mig...|  Consortiumnews.com|Why the Truth Mig...|    1|
|  3|15 Civilians Kill...|     Jessica Purkiss|Videos 15 Civilia...|    1|
|  4|Iranian woman jai...|      Howard Portnoy|Print An Iranian ...|    1|
|  5|Jackie Mason: Hol...|     Daniel Nussbaum|In these trying t...|    0|
|  6|Life: Life Of Lux...|                null|Ever wonder how B...|    1|
|  7|Benoît Hamon Wins...|     Alissa J. Rubin|PARIS  —   France...|    0|
|  8|Excerpts From a D...|                null|Donald J. Trump i...|    0|
|  9|A Back-Channel Pl...|Megan Twohey and ...|A week before Mic...|    0|
| 10|Obama’s Organizin...

### Check Class Balance

In [8]:
labels = (data.groupBy('label')
             .count()
        )
labels.show()

+-----+-----+
|label|count|
+-----+-----+
| null|    1|
|    1| 3386|
|    0| 3357|
+-----+-----+



### Text Preprocessing

In [9]:
def doc_assembler(inputCol):
    '''Spark NLP document assembler'''
    
    return DocumentAssembler().setInputCol(inputCol)


def tokenizer(inputCol, outputCol):
    '''Tokenize text for input to the lemmatizer'''
    
    tokenizer = (Tokenizer()
        .setInputCols([inputCol])
        .setOutputCol(outputCol))
    return tokenizer


def stopwords(inputCol, outputCol):
    '''Remove stopwords'''

    stopwords = StopWordsCleaner.pretrained("stopwords_en", "en") \
        .setInputCols([inputCol]) \
        .setOutputCol(outputCol)
    return stopwords


def normalizer(inputCol, outputCol):
    '''Remove unnecessary characters and make tokens lowercase'''
    
    normalizer = (Normalizer() 
        .setInputCols([inputCol])
        .setOutputCol(outputCol)
        .setLowercase(True))
    return normalizer
          
    
def lemmatizer(inputCol, outputCol):
    '''
    Retrieve root words out of the input tokens
    using a pretrained lemmatizer
    '''
    
    lemmatizer = (LemmatizerModel.pretrained(name="lemma_antbnc", lang="en")
        .setInputCols([inputCol])
        .setOutputCol(outputCol))
    return lemmatizer


def finisher(finishedCol):
    '''Finisher transform for Spark NLP pipeline'''
    
    finisher = (Finisher()
        .setInputCols([finishedCol])
        .setIncludeMetadata(False))
    return finisher


def run_sparknlp_pipeline(df):
    '''
    Create a SparkNLP pipeline that takes the input df to produce a final output
    column storing each document as a sequence of lemmas (root words)
    '''
   
    nlpPipeline = Pipeline(stages=[
        doc_assembler("text"),
        tokenizer("document", "token"),
        stopwords('token', 'token_s'),
        normalizer('token_s', 'cleaned_tokens'),
        lemmatizer("cleaned_tokens", "lemma"),
        finisher("lemma")
    ])
    df1 = nlpPipeline.fit(df).transform(df).withColumnRenamed('finished_lemma', 'allTokens')

    return df1

In [19]:
data = data.dropna()
sampled_data = data.sample(fraction=0.3)

nlpPipelineDF = run_sparknlp_pipeline(sampled_data)

stopwords_en download started this may take some time.
Approximate size to download 2.9 KB
[OK!]
lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [20]:
nlpPipelineDF.show()

+---+--------------------+--------------------+--------------------+-----+--------------------+
| id|               title|              author|                text|label|           allTokens|
+---+--------------------+--------------------+--------------------+-----+--------------------+
|  2|Why the Truth Mig...|  Consortiumnews.com|Why the Truth Mig...|    1|[truth, fire, oct...|
|  3|15 Civilians Kill...|     Jessica Purkiss|Videos 15 Civilia...|    1|[video, civilian,...|
|  5|Jackie Mason: Hol...|     Daniel Nussbaum|In these trying t...|    0|[time, jackie, ma...|
|  7|Benoît Hamon Wins...|     Alissa J. Rubin|PARIS  —   France...|    0|[paris, france, c...|
| 13|US Officials See ...|          Jason Ditz|Clinton Campaign ...|    1|[clinton, campaig...|
| 15|In Major League S...|       Jack Williams|Guillermo Barros ...|    0|[guillermo, barro...|
| 18|FBI Closes In On ...|             The Doc|FBI Closes In On ...|    1|[fbi, close, hill...|
| 19|Chuck Todd: ’Buzz...|           Jef

In [21]:
print('Total Rows: %d' % nlpPipelineDF.count())
nlpPipelineDF.persist()

Total Rows: 5468


DataFrame[id: int, title: string, author: string, text: string, label: int, allTokens: array<string>]

### ML Prediction Pipeline

In [22]:
def count_vec(inputCol, outputCol, params):
    '''
    Convert a collection of text documents to vectors of token counts
    '''
    cv = CountVectorizer(
        inputCol=inputCol,
        outputCol=outputCol,
        vocabSize=params['vocabsize'],
        minDF=params['minDF'],
        maxDF=params['maxDF'],
        minTF=1.0
    )
    return cv


def ml_pipeline(df, params):
    '''
    Create a Spark ML pipeline and transform the input NLP-transformed DataFrame 
    to produce an features for an ML model
    '''

    mlPipeline = Pipeline(
        stages=[
            count_vec("allTokens", "features", params)
        ]
    )
    
    final_df = mlPipeline.fit(df).transform(df)


    return final_df

In [23]:
ml_params = dict(vocabsize = 7000,
    minDF = 0.02,
    maxDF = 0.8
 )
ml_params

{'vocabsize': 7000, 'minDF': 0.02, 'maxDF': 0.8}

In [24]:
final_df = ml_pipeline(nlpPipelineDF, ml_params)

In [25]:
final_df.count()

5468

### Prepare for modeling

In [26]:
features = ['features']
assembler = VectorAssembler(inputCols = features, outputCol = 'final_features', handleInvalid='skip')
mlPipelineDF = assembler.transform(final_df)
model_df = mlPipelineDF.select(['final_features', 'label'])

### Logistic Regression

In [27]:
# train-test split & hyperparameter tuning

train, test = model_df.randomSplit([0.8, 0.2])
lr = LogisticRegression(featuresCol='final_features', labelCol='label')

grid = ParamGridBuilder().addGrid(lr.regParam, np.arange(0,
    .1, .01)).addGrid(lr.elasticNetParam, [0, 1.0]).build()
evaluator = BinaryClassificationEvaluator(labelCol='label', metricName='areaUnderROC')
cv = CrossValidator(estimator=lr, estimatorParamMaps=grid, evaluator=evaluator,
     numFolds=5)

# run model & evaluate
cvModel = cv.fit(train)

lr_pred = cvModel.transform(test)

In [28]:
print("Test AUC: ", evaluator.evaluate(lr_pred))

Test AUC:  0.9732311189506937


In [29]:
# Compute accuracy
lr_pred = lr_pred.withColumn('accuracy', (lr_pred.label == lr_pred.prediction).cast('float'))
lr_pred.select(F.mean('accuracy')).show()

+------------------+
|     avg(accuracy)|
+------------------+
|0.9216757741347905|
+------------------+



### Tree-based Model (Gradient Boosted tree)

In [30]:
gbt = GBTClassifier(featuresCol='final_features', labelCol='label')

tree_grid = ParamGridBuilder()\
  .addGrid(gbt.maxDepth, [2, 5])\
  .addGrid(gbt.maxIter, [10, 100])\
  .build()

evaluator = BinaryClassificationEvaluator(labelCol='label', metricName='areaUnderROC')

cv = CrossValidator(estimator=gbt, estimatorParamMaps=tree_grid, evaluator=evaluator,
     numFolds=5)

# run model
cvModel = cv.fit(train)

tree_pred = cvModel.transform(test)

In [31]:
# evaluate model

print("Test AUC: ", evaluator.evaluate(tree_pred))

# Compute accuracy
tree_pred = tree_pred.withColumn('accuracy', (tree_pred.label == tree_pred.prediction).cast('float'))
tree_pred.select(F.mean('accuracy')).show()

Test AUC:  0.9724048148235575
+-----------------+
|    avg(accuracy)|
+-----------------+
|0.907103825136612|
+-----------------+

