In [15]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql import SparkSession
from pyspark.ml.functions import vector_to_array
from pyspark.sql import functions as F
import argparse

In [37]:
table = 'reddit_table'
model_path = '/user/bda_reddit_pw/models/sentiment_model'
result_path = '/user/bda_reddit_pw/models/results/sentiment_results'
day_shift = 4

spark = SparkSession.builder \
    .master('local[1]') \
    .appName('Sentiment Prediction') \
    .enableHiveSupport() \
    .getOrCreate()

spark.sparkContext.setLogLevel('ERROR')

In [17]:
loaded_model = PipelineModel.load(model_path)

[Stage 19:>                                                         (0 + 1) / 1]                                                                                

In [18]:
test_df = spark.sql(f'SELECT * FROM {table}')

In [19]:
test_df = test_df.withColumnRenamed('selftext', 'text')

In [20]:
test_df.show()

+--------------------+------------+--------------+----+-----+---+----+
|                text| created_utc|cryptocurrency|year|month|day|hour|
+--------------------+------------+--------------+----+-----+---+----+
|disclaimer long b...|1.66458483E9|       cardano|2022|   10|  1|   0|
|bob send address ...|1.66458573E9|       bitcoin|2022|   10|  1|   0|
|back get much res...|1.66458944E9|      dogecoin|2022|   10|  1|   1|
|glaring lack lite...| 1.6645897E9|       bitcoin|2022|   10|  1|   2|
|sitting wallet bl...| 1.6645952E9|       bitcoin|2022|   10|  1|   3|
|huge advocate thi...|1.66459558E9|      ethereum|2022|   10|  1|   3|
|know time chain w...|1.66459661E9|       bitcoin|2022|   10|  1|   3|
|discord invite li...|1.66459686E9|           xrp|2022|   10|  1|   4|
|      safe left know|1.66459994E9|       bitcoin|2022|   10|  1|   4|
|please utilize st...|1.66460083E9|       bitcoin|2022|   10|  1|   5|
|see lot people ha...|1.66460147E9|       bitcoin|2022|   10|  1|   5|
|hello

In [61]:
prediction_raw = loaded_model.transform(test_df)
prediction = (prediction_raw.withColumn("xs", vector_to_array("probability"))).select(['prediction'] + [F.col("xs")[1]] + ['created_utc'] + ['cryptocurrency'])
prediction = prediction.withColumnRenamed('xs[1]', 'probability')
prediction = prediction.withColumn('created_utc', F.date_format(prediction.created_utc.cast('timestamp'), "yyyy-MM-dd HH:00:00"))
prediction.show()

[Stage 121:>                                                        (0 + 1) / 1]

+----------+--------------------+-------------------+--------------+
|prediction|         probability|        created_utc|cryptocurrency|
+----------+--------------------+-------------------+--------------+
|       1.0|   0.999265107599642|2022-10-01 00:00:00|       cardano|
|       1.0|  0.5713289798314397|2022-10-01 00:00:00|       bitcoin|
|       1.0|  0.9999991498271465|2022-10-01 01:00:00|      dogecoin|
|       1.0|  0.9998426257590661|2022-10-01 02:00:00|       bitcoin|
|       1.0|  0.9999994678764528|2022-10-01 03:00:00|       bitcoin|
|       1.0|   0.999999989582975|2022-10-01 03:00:00|      ethereum|
|       1.0|  0.9977463082172485|2022-10-01 03:00:00|       bitcoin|
|       0.0|3.685617606663527E-7|2022-10-01 04:00:00|           xrp|
|       1.0|  0.9994964812952687|2022-10-01 04:00:00|       bitcoin|
|       0.0|2.747458827112581E-8|2022-10-01 05:00:00|       bitcoin|
|       1.0|                 1.0|2022-10-01 05:00:00|       bitcoin|
|       0.0|1.730676131295183...|2

                                                                                

In [64]:
aggregation = prediction.groupBy(['created_utc', 'cryptocurrency']).agg(F.avg('probability'))
aggregation = aggregation.withColumnRenamed('avg(probability)', 'sentiment')
aggregation = aggregation.withColumnRenamed('cryptocurrency', 'cryptocurrency_a')
## version with prediction 0 1 not probability
# results = prediction.groupBy('created_utc').agg(F.avg('prediction'))
# results = results.withColumnRenamed('avg(prediction)', 'avg_prediction')
aggregation = aggregation.withColumn("sub_utc", F.col("created_utc") - F.expr(f'INTERVAL {day_shift} DAYS'))
aggregation.show()



+-------------------+----------------+--------------------+-------------------+
|        created_utc|cryptocurrency_a|           sentiment|            sub_utc|
+-------------------+----------------+--------------------+-------------------+
|2022-10-02 15:00:00|          solana|3.176570032403791...|2022-09-28 15:00:00|
|2022-10-03 05:00:00|         cardano|1.730676131295183...|2022-09-29 05:00:00|
|2022-10-03 13:00:00|        ethereum|  0.9999789907584345|2022-09-29 13:00:00|
|2022-10-02 16:00:00|         cardano|  0.9999999997260065|2022-09-28 16:00:00|
|2022-10-01 13:00:00|        dogecoin|6.622924431098909...|2022-09-27 13:00:00|
|2022-10-02 14:00:00|         cardano| 0.43076400326133435|2022-09-28 14:00:00|
|2022-10-01 03:00:00|         bitcoin|  0.9988728880468507|2022-09-27 03:00:00|
|2022-10-02 20:00:00|         cardano|  0.2614784683538419|2022-09-28 20:00:00|
|2022-10-03 02:00:00|        dogecoin|2.663974094208043...|2022-09-29 02:00:00|
|2022-10-02 11:00:00|        ethereum|  

                                                                                

In [48]:
crypto_df = spark.sql(f'SELECT * FROM crypto_table')
crypto_df = crypto_df.withColumn('date', F.date_format(crypto_df.date.cast('timestamp'), "yyyy-MM-dd HH:00:00"))
crypto_df.show()

+-------------------+------------------+--------------+----+-----+---+----+
|               date|             price|cryptocurrency|year|month|day|hour|
+-------------------+------------------+--------------+----+-----+---+----+
|2022-09-30 20:00:00| 19674.48240468394|       bitcoin|2022|    9| 30|  20|
|2022-09-30 21:00:00|19416.679427599574|       bitcoin|2022|    9| 30|  21|
|2022-09-30 22:00:00|19390.772416700056|       bitcoin|2022|    9| 30|  22|
|2022-09-30 23:00:00|19391.879627904444|       bitcoin|2022|    9| 30|  23|
|2022-10-01 00:00:00| 19476.92659600407|       bitcoin|2022|   10|  1|   0|
|2022-10-01 01:00:00| 19408.74499886766|       bitcoin|2022|   10|  1|   1|
|2022-10-01 02:00:00|19399.760776852163|       bitcoin|2022|   10|  1|   2|
|2022-10-01 03:00:00| 19416.21089144999|       bitcoin|2022|   10|  1|   3|
|2022-10-01 04:00:00| 19403.31901570052|       bitcoin|2022|   10|  1|   4|
|2022-10-01 05:00:00| 19400.51033859877|       bitcoin|2022|   10|  1|   5|
|2022-10-01 

In [72]:
results = aggregation.join(crypto_df, (aggregation.sub_utc ==  crypto_df.date) & (aggregation.cryptocurrency_a ==  crypto_df.cryptocurrency),"left")
results = results.select('date', 'sentiment', 'price', 'cryptocurrency').na.drop()
results.show()

+-------------------+--------------------+------------------+--------------+
|               date|           sentiment|             price|cryptocurrency|
+-------------------+--------------------+------------------+--------------+
|2022-10-01 13:00:00|4.543128821365983E-6| 1322.565708562071|      ethereum|
|2022-10-01 10:00:00|5.297632421630283E-5|  33.1116725786678|        solana|
|2022-10-01 06:00:00|                 1.0|0.4332036445426828|       cardano|
|2022-10-02 06:00:00|                 1.0|0.4315874640817953|       cardano|
|2022-10-01 07:00:00|  0.9998260500941284|19305.490727955035|       bitcoin|
|2022-10-02 04:00:00|3.685617606663527E-7|0.4741694363464702|           xrp|
|2022-10-01 18:00:00|   0.829032552990767|19285.111394561725|       bitcoin|
|2022-10-01 05:00:00|  0.9307959908415653|1328.9380130377112|      ethereum|
|2022-10-01 10:00:00|  0.9983651232724867|19331.107361269576|       bitcoin|
|2022-10-03 13:00:00|  0.9999999989213226|19240.378908822157|       bitcoin|



In [74]:
cmd = f'hdfs dfs -rmr {result_path}'
os.system(cmd)
results.write.csv(result_path)

rmr: DEPRECATED: Please use '-rm -r' instead.


Deleted /user/bda_reddit_pw/models/results/sentiment_results


                                                                                