### The type of model (e.g., logistic regression)
* We used LogisticRegression for our first attempt.
### Best hyperparameters used
* We did not tune LogisticRegresssion hyperparameters. 
* As we test different models we will tune the hyperparameters.
### Size of the saved model
* The final size of the model is 732K.
### Performance metrics
* Accuracy with MulticlassMetrics is 81.9%

|  | Predicted Yes | Predicted No |
| --- | --- | --- |
| Actual Yes | 317572 | 78047 |
| Actual No | 73909 | 370115 |


In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,lit
from pyspark.sql import functions as F
from pyspark.mllib.stat import Statistics

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.feature import *  
from pyspark.ml.feature import Tokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.classification import LogisticRegression

from utils import Tools
tools = Tools('mhk9c')
spark = tools.spark

sc = spark.sparkContext

import os
import glob

# Create features
import emoji
import re
import datetime

In [2]:
df = tools.load_data("russian-troll-tweets-enriched")
df.printSchema()

Done loading from /project/ds5559/team1_sp22/data//russian-troll-tweets-enriched.
root
 |-- external_author_id: string (nullable = true)
 |-- author: string (nullable = true)
 |-- content: string (nullable = true)
 |-- region: string (nullable = true)
 |-- language: string (nullable = true)
 |-- publish_date: string (nullable = true)
 |-- harvested_date: string (nullable = true)
 |-- following: integer (nullable = true)
 |-- followers: integer (nullable = true)
 |-- updates: integer (nullable = true)
 |-- post_type: string (nullable = true)
 |-- account_type: string (nullable = true)
 |-- retweet: integer (nullable = true)
 |-- account_category: string (nullable = true)
 |-- new_june_2018: integer (nullable = true)
 |-- alt_external_id: string (nullable = true)
 |-- tweet_id: string (nullable = true)
 |-- article_url: string (nullable = true)
 |-- tco1_step1: string (nullable = true)
 |-- tco2_step1: string (nullable = true)
 |-- tco3_step1: string (nullable = true)
 |-- curated_conten

In [3]:
# split into test and train
training, test = df.randomSplit([0.6, 0.4], seed=314)

In [4]:
test.createOrReplaceTempView("test")
sqlDF = spark.sql("SELECT tco1_step1_domain, count(tco1_step1_domain) FROM test GROUP BY tco1_step1_domain ORDER BY COUNT(tco1_step1_domain) DESC")
# sqlDF = spark.sql("SELECT count(content) FROM test")
sqlDF.show()

+--------------------+------------------------+
|   tco1_step1_domain|count(tco1_step1_domain)|
+--------------------+------------------------+
|         twitter.com|                  366161|
|                  NA|                  300003|
|              bit.ly|                   40029|
|              ift.tt|                   15574|
|              zpr.io|                    6629|
|               ow.ly|                    4507|
|               fb.me|                    4398|
|  EXERCISEWORKOUT.PW|                    4262|
|    EXERCISEQUOTE.PW|                    3900|
|            youtu.be|                    3798|
|         youtube.com|                    3789|
|       instagram.com|                    3279|
|hedgeaccordingly.com|                    2460|
|             vine.co|                    2268|
|               ln.is|                    2001|
|              goo.gl|                    1767|
|              dld.bz|                    1661|
|           1063.mobi|                  

In [5]:
# Check splits
training_count = training.count()
training_trolls = training.filter(training['label']==1).count()

test_count = test.count()
test_trolls = test.filter(test['label']==1).count()

print(f'Train set count : {training.count()}, ratio of trolls : {training_trolls/training_count}')
print(f'Test set count : {test.count()}, ratio of trolls : {test_trolls/test_count}')

Train set count : 1256405, ratio of trolls : 0.534912707287857
Test set count : 839643, ratio of trolls : 0.533753035516285


In [6]:
# features that we can use without any processing.
feats = ['url_count', 'char_count', 'word_count', 'emoji_count']
# Some thoughts:
# url_hosts and handles are both arrays of texts. It feels like we should be able to do some kind of multi-category one-hot encoding
# A quick fix is to explode these columns and one-hot encode each one.
# 

In [7]:
tok_content = Tokenizer(inputCol="content", outputCol="words")
remover_content = StopWordsRemover(inputCol="words", outputCol="words_filtered")
htf_content = HashingTF(inputCol="words_filtered", outputCol="content_htf", numFeatures=200)  

tok_emoji_text = Tokenizer(inputCol="emoji_text", outputCol="emoji_text_words")
remover_emoji_text = StopWordsRemover(inputCol="emoji_text_words", outputCol="emoji_text_words_filtered")
htf_emoji_text = HashingTF(inputCol="emoji_text_words_filtered", outputCol="emoji_text_htf", numFeatures=200)  

# Instead of using the pre filtered domains, we probably want to use our own, derived from the content.
# An example of how to do this is here : https://sparkbyexamples.com/spark/spark-dataframe-withcolumn/

stringIndexer_d1 = StringIndexer(inputCol="tco1_step1_domain", outputCol="d1_Index", handleInvalid='keep')
ohe_d1 = OneHotEncoder(inputCol="d1_Index", outputCol="d1_vec") 

stringIndexer_d2 = StringIndexer(inputCol="tco2_step1_domain", outputCol="d2_Index", handleInvalid='keep')
ohe_d2 = OneHotEncoder(inputCol="d2_Index", outputCol="d2_vec") 

stringIndexer_d3 = StringIndexer(inputCol="tco3_step1_domain", outputCol="d3_Index", handleInvalid='keep')
ohe_d3 = OneHotEncoder(inputCol="d3_Index", outputCol="d3_vec") 

va = VectorAssembler(inputCols=["content_htf","emoji_text_htf", "d1_vec", "d2_vec", "d3_vec", "url_count", "char_count", "word_count", "emoji_count" ], outputCol="features")  
lr = LogisticRegression(labelCol='label', featuresCol='features', maxIter=10, regParam=0.01)

# Fit the pipeline
pipeline = Pipeline(stages=[
                            tok_content
                            ,remover_content
                            ,htf_content
                            ,tok_emoji_text
                            ,remover_emoji_text
                            ,htf_emoji_text
                            ,stringIndexer_d1
                            ,ohe_d1
                            ,stringIndexer_d2
                            ,ohe_d2
                            ,stringIndexer_d3
                            ,ohe_d3
                            ,va
                            ,lr])
# model = pipeline.fit(training)
model = pipeline.fit(test)


In [None]:
model.write().overwrite().save(f"{tools.data_path}LogisticRegression")

### Load Models that we've fitted already

In [None]:
model = PipelineModel.load(f"{tools.data_path}LogisticRegression")

In [None]:
predictions = model.transform(test)

In [None]:
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import FloatType

In [None]:
predictionsRdd = predictions.select("prediction","label").rdd
predictionsRdd = predictionsRdd.map(lambda p: (float(p.label), (float(p.prediction))))
predictionsRdd.take(5)

In [None]:
metrics = MulticlassMetrics(predictionsRdd)

In [None]:
print(f'Accuracy with MulticlassMetrics is {metrics.accuracy}')
print(metrics.confusionMatrix().toArray())