In [8]:
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, NGram, CountVectorizer, IDF
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

spark = SparkSession.builder \
    .appName("SequenceIndexerExample") \
    .master("local[*]") \
    .config("spark.driver.memory", "16g") \
    .config("spark.executor.memory", "16g") \
    .config("spark.executor.cores", "4") \
    .config("spark.task.cpus", "2") \
    .config("spark.sql.shuffle.partitions", "8") \
    .getOrCreate()
spark.sparkContext.setLogLevel("ERROR")


In [10]:
df = spark.read.parquet("dat/sa.parquet")
df.head(10)

[Row(tweet='$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT', sentiment=2, url='https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment'),
 Row(tweet='$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https://t.co/yGjpT2ReD3', sentiment=2, url='https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment'),
 Row(tweet='$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https://t.co/KN1g4AWFIb', sentiment=2, url='https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment'),
 Row(tweet='$ESS: BTIG Research cuts to Neutral https://t.co/MCyfTsXc2N', sentiment=2, url='https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment'),
 Row(tweet='$FNKO - Funko slides after Piper Jaffray PT cut https://t.co/z37IJmCQzB', sentiment=2, url='https://huggingface.co/datasets/zeroshot/twitter-financial-news-sentiment'),
 Row(tweet='$FTI - TechnipFMC downgraded at Ber

In [11]:
unique_sentiments = df.select("sentiment").distinct()
unique_sentiments.show()

+---------+
|sentiment|
+---------+
|        0|
|        1|
|        2|
+---------+



In [12]:
tokenizer = Tokenizer(inputCol="tweet", outputCol="tokens")
ngrams = NGram(n=2, inputCol="tokens", outputCol="bigrams")

vectorizer = CountVectorizer(inputCol="bigrams", outputCol="x", minDF=5)
idf = IDF(inputCol="x", outputCol="features")
logreg = LogisticRegression(featuresCol="features", labelCol="sentiment")

pipeline = Pipeline(stages=[tokenizer, ngrams, vectorizer, idf, logreg])

In [13]:
model = pipeline.fit(df)
ef = model.transform(df)

ef.select("sentiment", "prediction").show()

evaluator = MulticlassClassificationEvaluator(labelCol="sentiment")
f1 = evaluator.evaluate(ef)
print(f"F1 score = {f1}")

spark.stop()

+---------+----------+
|sentiment|prediction|
+---------+----------+
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       1.0|
|        2|       1.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       2.0|
|        2|       0.0|
|        2|       2.0|
|        2|       2.0|
|        2|       1.0|
|        2|       2.0|
+---------+----------+
only showing top 20 rows

F1 score = 0.8753736511902163
