In [1]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.sql.types import *

In [2]:
# Load the labelled sentiment CSV. The first row is treated as the header and
# column types are inferred by Spark rather than declared up front.
sentiment_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
df = sentiment_reader.csv("sentiment_data")
df.printSchema()

[Stage 1:>                                                          (0 + 1) / 1]

root
 |-- Text: string (nullable = true)
 |-- Sentiment: integer (nullable = true)



                                                                                

In [3]:
# Row count of the frame as loaded, i.e. before the null rows are dropped in the next cell.
df.count()

6853

In [4]:
# Stage 1: discard every row that contains a null in any column.
rows_without_nulls = df.na.drop()

# Stage 2: rewrite the value -1 as 0. No column subset is given, so the
# replacement is offered to all columns; per the schema printed earlier,
# Sentiment is the only numeric one.
# NOTE(review): if the raw labels were {-1, 0, 1}, this merges -1 and 0 into one class - confirm.
relabelled = rows_without_nulls.na.replace(-1, 0)

# Stage 3: store the label as a double and rebind `df` for the cells that follow.
df = relabelled.withColumn("Sentiment", relabelled["Sentiment"].cast("double"))
df.printSchema()

root
 |-- Text: string (nullable = true)
 |-- Sentiment: double (nullable = true)



In [5]:
# Preview the cleaned frame: first 20 rows, long strings truncated (the show() defaults, spelled out).
df.show(n=20, truncate=True)

+--------------------+---------+
|                Text|Sentiment|
+--------------------+---------+
|should be in ever...|      1.0|
|No one convinced ...|      0.0|
|I think it can be...|      1.0|
|Bear market is ov...|      1.0|
|I posted this bef...|      1.0|
|The value of WAY ...|      0.0|
|I t imagine being...|      0.0|
|Not trust this ho...|      0.0|
|is Hope Love Many...|      1.0|
|We can no longer ...|      1.0|
|  In short invest in|      1.0|
|I do not feel sor...|      0.0|
|is pretty stable ...|      1.0|
|I don t even know...|      0.0|
|Funny how people ...|      0.0|
|If you lose money...|      0.0|
|        buy more sol|      1.0|
|good time to accu...|      1.0|
|is plain and simp...|      1.0|
|A great way to br...|      1.0|
+--------------------+---------+
only showing top 20 rows



In [6]:
# Random 70/30 split into training and held-out test frames, with a fixed seed for the sampler.
train, test = df.randomSplit(weights=[0.7, 0.3], seed=42)

In [7]:
# Text -> tokens -> hashed term-frequency vector -> logistic regression.
# Column names are written out so the wiring between stages is visible at a glance.
tokenizer = Tokenizer(inputCol="Text", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="features")
lr = LogisticRegression(
    featuresCol="features",  # same as the estimator's default; matches hashingTF's output
    labelCol="Sentiment",
    regParam=0.001,
    maxIter=10,
)

# Chain the three stages and fit them on the training split only.
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])
model = pipeline.fit(train)

22/12/17 15:52:29 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/17 15:52:29 WARN com.github.fommil.netlib.BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
                                                                                

In [8]:
# Run the fitted pipeline on the held-out test split and print the schema of
# the result, which shows the columns added by each pipeline stage.
prediction = model.transform(test)
prediction.printSchema()

root
 |-- Text: string (nullable = true)
 |-- Sentiment: double (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [10]:
# Score the test-set predictions against the Sentiment label.
# The evaluator is bound to `evaluator` rather than `eval`, so Python's builtin
# eval() is not shadowed for the rest of the kernel session.
evaluator = BinaryClassificationEvaluator(
    labelCol="Sentiment",
    metricName="areaUnderROC",  # the evaluator's default metric, spelled out so the number is self-describing
)
evaluator.evaluate(prediction)

                                                                                

0.7711901435028371

In [20]:
# hist_reddit_schema = StructType(
#    [StructField('subreddit', StringType(), True),
#     StructField('title', StringType(), True),
#     StructField('selftext', StringType(), True),
#     StructField('created_utc', TimestampType(), True),
#    ]
#   )

In [33]:
# df_red = spark.read.csv("historical_reddit", header = True, inferSchema = True)
# df_red.printSchema()

In [31]:
# df_red.show()

+----------+------------------+------+----+---+---+
|11/04/2020|0.9576058280146804|solana|2020| 04| 11|
+----------+------------------+------+----+---+---+
|12/04/2020|0.7847113148208426|solana|2020|  4| 12|
|13/04/2020|0.8759944068917709|solana|2020|  4| 13|
|14/04/2020|0.7867121945458646|solana|2020|  4| 14|
|15/04/2020| 0.666673390515131|solana|2020|  4| 15|
|16/04/2020|0.6376210673084666|solana|2020|  4| 16|
|17/04/2020|0.6923331250859114|solana|2020|  4| 17|
|18/04/2020| 0.657449398309544|solana|2020|  4| 18|
|19/04/2020|0.6769721852162044|solana|2020|  4| 19|
|20/04/2020|0.6094358389483635|solana|2020|  4| 20|
|21/04/2020|0.5347904942320182|solana|2020|  4| 21|
|22/04/2020|0.5728269210770057|solana|2020|  4| 22|
|23/04/2020|0.6848872198147996|solana|2020|  4| 23|
|24/04/2020|0.6217032788666345|solana|2020|  4| 24|
|25/04/2020|0.6227650900853694|solana|2020|  4| 25|
|26/04/2020|0.6470903181859691|solana|2020|  4| 26|
|27/04/2020|0.6398628040694914|solana|2020|  4| 27|
|28/04/2020|

In [None]:
# TODO: Build a simple time-series regression model, preferably on the historical data.
# TODO: Evaluate it on the historical data using MSE.
# The prediction should be based on prices from the last 24h plus the current / past Reddit sentiment.
# For now, predict the next hour using only the crypto* data.
# TODO: Decide how to save and reload these models.

In [None]:
# Load the historical crypto CSV. The first row is treated as the header and
# column types are inferred by Spark.
crypto_reader = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
)
df_cryp = crypto_reader.csv("historical_crypto")
df_cryp.printSchema()