In [1]:
from pyspark.sql.types import StructType, StructField, StringType, LongType
jsonSchema = StructType([
    StructField('label', StringType(), True),
    StructField('tweet_id', LongType(), True),
    StructField('tweet_text', StringType(), True)
])

#replace the file path
df=spark.read.format("json").schema(jsonSchema).load("/Users/Pavel/Documents/KULeuven/Courses/AdvancedAnalyticsinBigDataWorld/spark/data/*")

In [2]:
df.count()

61008

In [3]:
(train, test) = df.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 42674
Test Dataset Count: 18334


In [4]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasOutputCols, Param, Params
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import lit # for the dummy _transform
import pyspark.sql.functions as f
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import ltrim
from pyspark.ml import Pipeline, Transformer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd

class RegexReplacerWritable(
    Transformer, DefaultParamsReadable, DefaultParamsWritable,
):
    #value = Param(
    #   Params._dummy(),
    #   "value",
    #   "value to fill",
    #)

    @keyword_only
    def __init__(self):
        super(RegexReplacerWritable, self).__init__()
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @keyword_only
    def setParams(self):
        """
        setParams(self, outputCols=None, value=0.0)
        Sets params for this RegexReplacerWritable.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, df: DataFrame) -> DataFrame:
        #Converting all letters to lowercase
        df = df.withColumn("tweet_text",f.lower(f.col("tweet_text")))
        #removing punctuations, numbers, http and spaces
        df = df.withColumn("tweet_text",f.regexp_replace(f.col("tweet_text"),'([^ a-zA-Z\'])',''))
        df = df.withColumn("tweet_text",f.regexp_replace(f.col("tweet_text"),'http.*?\\b',' '))
        df = df.withColumn("tweet_text",f.ltrim(f.regexp_replace(f.col("tweet_text"),'[\r\n\t\f\v ]+', ' ')))
        return df

In [5]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasOutputCols, Param, Params
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import lit # for the dummy _transform
import pyspark.sql.functions as f
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import ltrim
from pyspark.ml import Pipeline, Transformer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd



class UDLemmatization(
    Transformer, DefaultParamsReadable, DefaultParamsWritable,
):
    @keyword_only
    def __init__(self):
        super(UDLemmatization, self).__init__()
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @keyword_only
    def setParams(self):
        """
        setParams(self, outputCols=None, value=0.0)
        Sets params for this RegexReplacerWritable.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, df: DataFrame) -> DataFrame:
        
        lemmatizer = WordNetLemmatizer() 
        pandas_df = df.select("*").toPandas()
        pandas_df['lemmatized'] = pandas_df['words'].apply(
                    lambda lst:[lemmatizer.lemmatize(word) for word in lst])
        pandas_df['lemmatized']
        df = spark.createDataFrame(pandas_df)
        return df
    
    

m = __import__("__main__"); 
setattr(m, 'UDLemmatization', UDLemmatization)

In [6]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasOutputCols, Param, Params
from pyspark.ml.util import DefaultParamsReadable, DefaultParamsWritable
from pyspark.sql.functions import lit # for the dummy _transform
import pyspark.sql.functions as f
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import ltrim
from pyspark.ml import Pipeline, Transformer
from pyspark.sql import DataFrame
from typing import Iterable
import pandas as pd



class UDShortWordsRemover(
    Transformer, DefaultParamsReadable, DefaultParamsWritable,
):
    @keyword_only
    def __init__(self):
        super(UDShortWordsRemover, self).__init__()
        kwargs = self._input_kwargs
        self._set(**kwargs)

    @keyword_only
    def setParams(self):
        """
        setParams(self, outputCols=None, value=0.0)
        Sets params for this RegexReplacerWritable.
        """
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def _transform(self, df: DataFrame) -> DataFrame:
        
        df = df.withColumn("filtered2", f.expr("filter(filtered, x -> not(length(x) < 3))")).where(f.size(f.col("filtered2")) > 0).drop("filtered")
        return df

## Pipeline preparation

In [7]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer, StringIndexer, IDF, HashingTF, IndexToString
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
import nltk
from nltk.stem import WordNetLemmatizer



# 1. Regex Filter replacer
regexrep = RegexReplacerWritable()

# 2. Tokenizer - splitting words 
tokenizer = Tokenizer(inputCol="tweet_text", outputCol="words")

# 3. Lemmatizer user defined
lemmatizerUD = UDLemmatization()

# 4. Stop Words Remover
stopwordList = ["u","ur", "amp", "q"] 
stopwordList.extend(StopWordsRemover().getStopWords())
remover = StopWordsRemover(inputCol="words", outputCol="filtered" ,stopWords=stopwordList)

# 5. Short Words len < 3 user defined remover 
shortWordsremover = UDShortWordsRemover()

# 6. Count Vectorizer
cv = CountVectorizer(inputCol="filtered2", outputCol="features")

# 7. IDF
idf = IDF(inputCol = "features", outputCol = "tf_idf_features")

# 8. String Indexer
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "labelIndex")

# 9. Logistic Regression
lr = LogisticRegression(labelCol = "labelIndex", featuresCol = "tf_idf_features", maxIter=20, regParam=0.3, elasticNetParam=0)

# 10. Index to String, for now labels only, not prediction - TODO
converter = IndexToString(inputCol="labelIndex", outputCol="labelOriginal")

#create the pipeline
pipeline = Pipeline(stages=[regexrep, tokenizer, lemmatizerUD, remover, shortWordsremover, cv, idf, label_stringIdx, lr, converter])

In [None]:
pipelineFit = pipeline.fit(df)

In [9]:
pipelineFitTrain = pipeline.fit(train)

In [10]:
prediction = pipelineFitTrain.transform(test)

In [11]:
pipelineFitTrain.write().overwrite().save('lr_model')

## Evaluation

In [12]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Select (prediction, true label) and compute accuracy
evaluator = MulticlassClassificationEvaluator(
    labelCol="labelIndex", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(prediction)
print(accuracy)
print("Test Error = %g " % (1.0 - accuracy))

0.6465093411996067
Test Error = 0.353491 


In [15]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# Create both evaluators
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction")

# Make predicitons
predictionAndTarget = prediction.select("prediction", "labelIndex")

# Get metrics
acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})
auc = evaluatorMulti.evaluate(predictionAndTarget)

In [16]:
print("ACC: %s" % acc)
print("F1 score: %s" % f1)
print("Weighted Precision: %s" % weightedPrecision)
print("Weighted Recall: %s" % weightedRecall)
print("AUC: %s" % auc)

ACC: 0.6465093411996067
F1 score: 0.6467342949460959
Weighted Precision: 0.6731761558628765
Weighted Recall: 0.6465093411996066
AUC: 0.6467342949460959


In [17]:
trainingSummary = pipelineFitTrain.stages[8].summary

print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

False positive rate by label:
label 0: 0.05860863540452871
label 1: 0.028397693079237712
label 2: 0.008277703604806409
label 3: 0.0061695800794819775
label 4: 0.00062940914216779
label 5: 2.4047710657945365e-05
True positive rate by label:
label 0: 0.9652822543512294
label 1: 0.9135225999252895
label 2: 0.9415066801392163
label 3: 0.8971187207723639
label 4: 0.8868556125864763
label 5: 0.7276264591439688
Precision by label:
label 0: 0.8492262821032164
label 1: 0.9152320359281437
label 2: 0.967801500288517
label 3: 0.964013616469444
label 4: 0.9939969984992496
label 5: 0.9986648865153538
Recall by label:
label 0: 0.9652822543512294
label 1: 0.9135225999252895
label 2: 0.9415066801392163
label 3: 0.8971187207723639
label 4: 0.8868556125864763
label 5: 0.7276264591439688
F-measure by label:
label 0: 0.9035427980346521
label 1: 0.9143765189755094
label 2: 0.9544730252674709
label 3: 0.9293639631192374
label 4: 0.937374690411605
label 5: 0.8418683173888576
Accuracy: 0.9227212991645546
FPR: 