In [18]:
from pyspark.sql import SparkSession
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import Row
from pyspark.ml import PipelineModel
from cassandra.cluster import Cluster
from pyspark.ml.regression import RandomForestRegressionModel
from pyspark.sql.functions import col, when
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import datetime

In [19]:
# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()

# Cassandra connection settings shared by the cells below.
cassandra_contact_points = ['192.168.1.22']
cassandra_keyspace = 'stock_market'
cassandra_table = 'coins_prices2'

# Open a session on the cluster with the working keyspace already selected,
# so later statements can use unqualified table names.
cluster = Cluster(cassandra_contact_points)
session = cluster.connect(cassandra_keyspace)

In [20]:
# Name of the Cassandra table that receives the model predictions; the
# per-symbol cells below insert into it.
table_name_pred = "predprices"

# CQL that creates the predictions table if it does not already exist.
# Columns: `timestamp` (TIMESTAMP, the sole primary key), `symbol` (TEXT) and
# `prediction` (TEXT, i.e. the predicted value is stored as a string).
# NOTE(review): because `timestamp` alone is the primary key, two inserts that
# carry the same timestamp value overwrite each other even when their symbols
# differ. The writers in this notebook format timestamps with one-second
# resolution -- consider a (symbol, timestamp) key; confirm with table readers.
create_table_query = f"""
    CREATE TABLE IF NOT EXISTS {table_name_pred} (
        timestamp TIMESTAMP PRIMARY KEY,
        symbol TEXT,
        prediction TEXT
    )
"""

# Run the DDL on the session's current keyspace.
session.execute(create_table_query)

<cassandra.cluster.ResultSet at 0x7f681bdb33d0>

### AAPL


In [21]:
# Fetch the stored price rows for this symbol.
symbol = 'AAPL'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
aapl_model = "/home/jovyan/models/aapl_model"
rf_model = RandomForestRegressionModel.load(aapl_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### AXP

In [24]:
# Fetch the stored price rows for this symbol.
symbol = 'AXP'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
axpp_model = "/home/jovyan/models/axpp_model"
rf_model = RandomForestRegressionModel.load(axpp_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### AMD

In [26]:
# Fetch the stored price rows for this symbol.
symbol = 'AMD'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
amdd_model = "/home/jovyan/models/amdd_model"
rf_model = RandomForestRegressionModel.load(amdd_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### BA

In [27]:
# Fetch the stored price rows for this symbol.
symbol = 'BA'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
baa_model = "/home/jovyan/models/baa_model"
rf_model = RandomForestRegressionModel.load(baa_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### CAT

In [28]:
# Fetch the stored price rows for this symbol.
symbol = 'CAT'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
catt_model = "/home/jovyan/models/catt_model"
rf_model = RandomForestRegressionModel.load(catt_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### CSCO

In [29]:
# Fetch the stored price rows for this symbol.
symbol = 'CSCO'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
cscoo_model = "/home/jovyan/models/cscoo_model"
rf_model = RandomForestRegressionModel.load(cscoo_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### GOOG

In [30]:
# Fetch the stored price rows for this symbol.
symbol = 'GOOG'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
googg_model = "/home/jovyan/models/googg_model"
rf_model = RandomForestRegressionModel.load(googg_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### GOOGL

In [31]:
# Fetch the stored price rows for this symbol.
symbol = 'GOOGL'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
googll_model = "/home/jovyan/models/googll_model"
rf_model = RandomForestRegressionModel.load(googll_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### AMZN

In [32]:
# Fetch the stored price rows for this symbol.
symbol = 'AMZN'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
amznn_model = "/home/jovyan/models/amznn_model"
rf_model = RandomForestRegressionModel.load(amznn_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

### CVX

In [33]:
# Fetch the stored price rows for this symbol.
symbol = 'CVX'
# The table name cannot be a bound parameter, so it is interpolated; the symbol
# value is passed separately and quoted/escaped by the driver.
query = f"SELECT id, pc, symbol, t FROM {cassandra_table} WHERE symbol=%s"

# Execute the query
result_set = session.execute(query, (symbol,))

# One `pc` value per returned row, in the order the driver yields them.
t1 = [row.pc for row in result_set]
if not t1:
    # Fail early with a clear message instead of scoring an empty vector.
    raise ValueError(f"No rows found in {cassandra_table} for symbol {symbol!r}")

# Load the saved RandomForest regression model from the given directory
cvxx_model = "/home/jovyan/models/cvxx_model"
rf_model = RandomForestRegressionModel.load(cvxx_model)

# All `pc` values become the components of ONE dense vector, so the DataFrame
# holds a single row with a single `features` column.
# NOTE(review): the vector length equals the number of rows returned, so it
# must match the feature count the model was trained with -- confirm.
data = [Row(features=Vectors.dense(t1))]
t1_df = spark.createDataFrame(data)

# Assemble `features` into an extra `features_vector` column.
# NOTE(review): whether the model reads `features` or `features_vector`
# depends on the featuresCol saved with it -- confirm this step is needed.
assembler = VectorAssembler(inputCols=['features'], outputCol='features_vector')
t1_df = assembler.transform(t1_df)
predictions_on_t1 = rf_model.transform(t1_df)

# Store the predictions in the Cassandra table. Values are bound by the driver
# rather than formatted into the statement text.
insert_query = (
    f"INSERT INTO {table_name_pred} (timestamp, symbol, prediction) "
    "VALUES (%s, %s, %s)"
)
for row in predictions_on_t1.collect():
    # Local wall-clock time at one-second resolution, sent as a string literal.
    current_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    # `prediction` is a TEXT column, so the numeric prediction is stringified.
    session.execute(insert_query, (current_time, symbol, str(row.prediction)))

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 33706)
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/usr/local/spark/python/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/usr/local/spark/python/pyspark/accumulators.py", line 267, in poll
    if self.rfile in r and func():
                           ^^^^^^
  File "/usr/local/spark/python/pyspark/accumulators.py", line 271, in accum_updates
    num_updates =