In [1]:
from pyspark.sql.types import StructType, StructField, StringType, LongType
jsonSchema = StructType([
    StructField('label', StringType(), True),
    StructField('tweet_id', LongType(), True),
    StructField('tweet_text', StringType(), True)
])

#replace the file path
df=spark.read.format("json").schema(jsonSchema).load("/Users/Pavel/Documents/KULeuven/Courses/AdvancedAnalyticsinBigDataWorld/spark/data/*")

## Text preprocessing

In [2]:
import pyspark.sql.functions as f
from pyspark.sql.functions import regexp_replace
from pyspark.sql.functions import ltrim

#Converting all letters to lowercase
df = df.withColumn("tweet_text",f.lower(f.col("tweet_text")))

#removing punctuations, numbers, http and spaces
df =df.withColumn("tweet_text",f.regexp_replace(f.col("tweet_text"),'([^ a-zA-Z\'])',''))
df = df.withColumn("tweet_text",f.regexp_replace(f.col("tweet_text"),'http.*?\\b',' '))
df = df.withColumn("tweet_text",f.ltrim(f.regexp_replace(f.col("tweet_text"),'[\r\n\t\f\v ]+', ' ')))



In [3]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover

#Splitting words
tokenizer = Tokenizer(inputCol="tweet_text", outputCol="words")
df = tokenizer.transform(df)

In [4]:
#Lemmatization
import nltk
#nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer() 

In [5]:
pandas_df = df.select("*").toPandas()
pandas_df['lemmatized'] = pandas_df['words'].apply(lambda lst:[lemmatizer.lemmatize(word) for word in lst])

In [6]:
df = spark.createDataFrame(pandas_df)

In [7]:
#Removing stop words
stopwordList = ["u","ur", "amp"] 
stopwordList.extend(StopWordsRemover().getStopWords())
remover = StopWordsRemover(inputCol="lemmatized", outputCol="filtered" ,stopWords=stopwordList)
df = remover.transform(df)

In [8]:
#to get rid of words like v, q, wa
df = df.withColumn("filtered2", f.expr("filter(filtered, x -> not(length(x) < 3))")).where(f.size(f.col("filtered2")) > 0).drop("filtered")

## Bag of words

In [9]:
#Vectorizing and IDF

from pyspark.ml.feature import CountVectorizer, StringIndexer, IDF, HashingTF
from pyspark.ml import Pipeline


cv = CountVectorizer(inputCol="filtered2", outputCol="tf_features")
idf = IDF(inputCol = "tf_features", outputCol = "tf_idf_features")
label_stringIdx = StringIndexer(inputCol = "label", outputCol = "labelIndex")

pipeline = Pipeline(stages=[cv, idf, label_stringIdx])

pipelineFit = pipeline.fit(df)
df = pipelineFit.transform(df)

## Train and test split

In [10]:
(train, test) = df.randomSplit([0.7, 0.3], seed = 100)
print("Training Dataset Count: " + str(train.count()))
print("Test Dataset Count: " + str(test.count()))

Training Dataset Count: 42585
Test Dataset Count: 18328


## Decision Tree Classifier

In [11]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(labelCol="labelIndex", featuresCol="tf_idf_features")
dtmodel = dt.fit(train)
predictions = dtmodel.transform(test)

In [17]:
predictions.select("prediction", "labelIndex", "label").show()

+----------+----------+------+
|prediction|labelIndex| label|
+----------+----------+------+
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       0.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       3.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       3.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
|       1.0|       3.0|#biden|
+----------+----------+------+
only showing top 20 rows



In [13]:
import pyspark.sql.functions as col
predictions.groupBy("prediction").count().orderBy(f.col("count").desc()).show()

+----------+-----+
|prediction|count|
+----------+-----+
|       1.0|14720|
|       0.0| 1751|
|       4.0|  745|
|       2.0|  740|
|       3.0|  372|
+----------+-----+



### Evaluation

In [18]:
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Create both evaluators
evaluatorMulti = MulticlassClassificationEvaluator(labelCol="labelIndex", predictionCol="prediction")

# Make predicitons
predictionAndTarget = predictions.select("prediction", "labelIndex")

# Get metrics
acc = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "accuracy"})
f1 = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "f1"})
weightedPrecision = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedPrecision"})
weightedRecall = evaluatorMulti.evaluate(predictionAndTarget, {evaluatorMulti.metricName: "weightedRecall"})
auc = evaluatorMulti.evaluate(predictionAndTarget)

In [19]:
print("ACC: %s" % acc)
print("F1 score: %s" % f1)
print("Weighted Precision: %s" % weightedPrecision)
print("Weighted Recall: %s" % weightedRecall)
print("AUC: %s" % auc)

ACC: 0.38111086861632476
F1 score: 0.34551427520771544
Weighted Precision: 0.6506317035439997
Weighted Recall: 0.3811108686163248
AUC: 0.34551427520771544
