In [0]:
# load libraries

from pyspark.sql.types import *
from pyspark.sql import functions as F
# from transformers import pipeline
# import torch
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF, Tokenizer, NGram, ChiSqSelector, VectorAssembler, CountVectorizer
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
# from bertopic import BERTopic

In [0]:
# read the clean dataset from my s3 bucket for faster compute
### ::: ### mount_s3_bucket(ACCESS_KEY, SECRET_ACCESS_KEY, 'bigdatapro/Project/clean_dataset/', 'clean_data')

re_schema = StructType([
    StructField('tweet', StringType(), True),
    StructField('followers', IntegerType(), True),
    StructField('location', StringType(), True),
    StructField('date_time', TimestampType(), True),
    StructField('label', IntegerType(), True)])

re_elonmusk = (spark.read
       .option("header", "false")
       .option("delimiter", "\t")
       .schema(re_schema)
       .csv("/mnt/clean_data"))

display(re_elonmusk)

tweet,followers,location,date_time,label
keep in mind trump banned on twitter is an integral element of the j narrative that trump fomented insurrect,9016.0,,2022-11-21T18:11:58Z,0
rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,469.0,,2022-11-21T18:11:58Z,1
rt mattgertz elon musk interacting with sycophantic right wing influencers this weekend a thread,180.0,"Hawaii, USA",2022-11-21T18:11:59Z,1
rt elizableu i d like to make something else clear i don t work for twitter elon musk any government political party group etc i ru,195.0,Merica,2022-11-21T18:11:59Z,1
rt disclosetv just in elon musk has reinstated rep marjorie taylor greene s r ga personal twitter account,915.0,"North Coast, Ohio USA",2022-11-21T18:11:59Z,1
rt behizytweets breaking elon musk just reinstated marjorie taylor greene s account,203.0,,2022-11-21T18:11:59Z,1
rt w_terrence elon musk should purchase the rights to the the view on abc i would love to see the look on their faces and watch them,3684.0,America,2022-11-21T18:11:59Z,2
rt dashdobrofsky cnn s jake tapper asked hakeem jeffries what his reaction was to donald trump being reinstated on twitter by elon musk,772.0,,2022-11-21T18:12:00Z,1
rt jtanews elon musk bantered with kanye west and trolled the anti defamation league this weekend,8820.0,"Traverse City, MI",2022-11-21T18:12:00Z,1
rt tradutordobr jair bolsonaro elon musk here they call me a myth i don t know why but you really are the myth of our freedom,1598.0,America do Sul,2022-11-21T18:12:00Z,1


In [0]:
re_elonmusk.cache()
re_elonmusk.count()

328562

In [0]:
# pipeline for model 1: Logistic Regression

# perform train test split
X_train, y_test = re_elonmusk.randomSplit([0.8, 0.2], seed=20240531)

# create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5)
ngram = NGram(n=2, inputCol="filtered", outputCol="2gram")
ngram_hashingtf = HashingTF(inputCol="2gram", outputCol="2gram_tf", numFeatures=20000)
ngram_idf = IDF(inputCol='2gram_tf', outputCol="2gram_idf", minDocFreq=5) 

# assemble all text features
assembler = VectorAssembler(inputCols=["1gram_idf", "2gram_tf"], outputCol="rawFeatures")

# Chi-square variable selection
selector = ChiSqSelector(numTopFeatures=2**14,featuresCol='rawFeatures', outputCol="features")

# regression model estimator
lr = LogisticRegression(maxIter=100)

# build the pipeline
pipeline = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, ngram, ngram_hashingtf, ngram_idf, assembler, selector, lr])

# pipeline model fitting
pipeline_model = pipeline.fit(X_train)
y_pred = pipeline_model.transform(y_test)

evaluator = MulticlassClassificationEvaluator()
accuracy = y_pred.filter(y_pred.label == y_pred.prediction).count() / float(y_test.count())
roc_auc = evaluator.evaluate(y_pred)

print("Accuracy Score: {0:.4f}".format(accuracy))
print("ROC-AUC: {0:.4f}".format(roc_auc))

Accuracy Score: 0.9284
ROC-AUC: 0.9285


In [0]:
# pipeline for model 2: Naive Bayes

# perform train test split
X_train, y_test = re_elonmusk.randomSplit([0.8, 0.2], seed=20240531)

# create transformers for the ML pipeline
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
stopword_remover = StopWordsRemover(inputCol="tokens", outputCol="filtered")
cv = CountVectorizer(vocabSize=2**16, inputCol="filtered", outputCol='cv')
idf = IDF(inputCol='cv', outputCol="1gram_idf", minDocFreq=5)
ngram = NGram(n=2, inputCol="filtered", outputCol="2gram")
ngram_hashingtf = HashingTF(inputCol="2gram", outputCol="2gram_tf", numFeatures=20000)
ngram_idf = IDF(inputCol='2gram_tf', outputCol="2gram_idf", minDocFreq=5) 

# assemble all text features
assembler = VectorAssembler(inputCols=["1gram_idf", "2gram_tf"], outputCol="rawFeatures")

# Chi-square variable selection
selector = ChiSqSelector(numTopFeatures=2**14,featuresCol='rawFeatures', outputCol="features")

# NaiveBayes model estimator
nb = NaiveBayes(modelType='multinomial')

# build the pipeline
pipeline_1 = Pipeline(stages=[tokenizer, stopword_remover, cv, idf, ngram, ngram_hashingtf, ngram_idf, assembler, selector, nb])

# pipeline model fitting
nb_pipeline = pipeline_1.fit(X_train)
y_pred = nb_pipeline.transform(y_test)

evaluator_1 = MulticlassClassificationEvaluator()
nb_accuracy = y_pred.filter(y_pred.label == y_pred.prediction).count() / float(y_test.count())
nb_roc_auc = evaluator_1.evaluate(y_pred)

print("NB Accuracy Score: {0:.4f}".format(nb_accuracy))
print("NB ROC-AUC: {0:.4f}".format(nb_roc_auc))

NB Accuracy Score: 0.8887
NB ROC-AUC: 0.8905
