In [795]:
import os
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf
import json


In [796]:
# Define data paths
current_dir = os.getcwd()
file_name = '../csv_files/lottery_data/*.csv'
file_path = os.path.join(current_dir, file_name)

Data Ingestion

In [797]:
# Initialize PySpark Session
spark = SparkSession.builder.appName("LotteryNumberPrediction").getOrCreate()
original_df = spark.read.csv(file_path, header=True, inferSchema=True)
print(original_df.count())

3420


Data Transformation

In [798]:
original_df = original_df.withColumn("prize_codes", F.regexp_replace(F.col("prize_codes"), " ", ","))
transformation_df = (original_df.groupBy("date")
                     .agg(F.collect_list("prize_codes").alias("numbers_list"))
                     .withColumn("numbers_raw_string", F.concat_ws(",", F.col("numbers_list")))
                     .withColumn("numbers", F.split(F.col("numbers_raw_string"), ","))
                     .withColumn("partition_order", F.lit(1))
                     .select("date", "numbers_raw_string", "numbers", "partition_order"))
transformation_df.show(5, truncate=False)


+---------+---------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+---------------+
|date     |numbers_raw_string                                                                                 |numbers                                                                                                               |partition_order|
+---------+---------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+---------------+
|1-1-2023 |78,025,4128,6164,1016,8291,42816,17683,21052,80002,98666,41587,52863,55898,37552,62140,88528,313644|[78, 025, 4128, 6164, 1016, 8291, 42816, 17683, 21052, 80002, 98666, 41587, 52863, 55898, 37552, 62140, 88528, 313644]|1              |
|1-1-2025 |6

Data Analysis

In [799]:
# We have essential data, start do analysis and do predictions
# Using PySpark, you can perform frequency analysis to determine the most common and least common numbers.
exploded_df = transformation_df.withColumn("number", F.explode(F.col("numbers")))
frequency_df = exploded_df.groupBy("Number").count()
# Show the most frequent numbers
frequency_df.show(10, truncate=False)

+------+-----+
|Number|count|
+------+-----+
|57496 |1    |
|16576 |1    |
|70962 |1    |
|07    |2    |
|04438 |1    |
|29865 |1    |
|68325 |1    |
|70097 |1    |
|39458 |1    |
|44446 |1    |
+------+-----+
only showing top 10 rows



In [800]:
# Hot and Cold Numbers
hot_numbers = frequency_df.orderBy(F.col("count").desc()).limit(5)
hot_numbers.show()
cold_numbers = frequency_df.orderBy(F.col("count").asc()).limit(5)
cold_numbers.show()

+------+-----+
|Number|count|
+------+-----+
|    77|    9|
|    40|    8|
|    67|    8|
|    20|    8|
|    85|    7|
+------+-----+

+------+-----+
|Number|count|
+------+-----+
| 29865|    1|
| 44446|    1|
| 68325|    1|
| 16576|    1|
| 70097|    1|
+------+-----+



In [801]:
# Build dataset for predictions
window = Window.partitionBy("partition_order").orderBy("date")

def split_and_convert_to_vector(raw_string):
    try:
        if raw_string is None or raw_string.strip() == "":
            # Return an empty dense vector if the input is null or an empty string
            return Vectors.dense([])
        else:
            # Otherwise, split the string by commas and convert to a dense vector
            return Vectors.dense([float(num) for num in raw_string.split(",")])
    except Exception:
        return Vectors.dense([])


# (2) Register UDF
split_to_vector_udf = udf(split_and_convert_to_vector, VectorUDT())

# cache for better performance
transformation_df = transformation_df.cache()

# Add columns for previous draw (lagged data)
transformation_df = transformation_df.withColumn("PrevDraw1", F.lag("numbers_raw_string", 1).over(window))  # Lag by 1
transformation_df = transformation_df.withColumn("PrevDraw2", F.lag("numbers_raw_string", 2).over(window))
# (3) Convert PrevDraw1 and PrevDraw2 to dense vectors
transformation_df = transformation_df.withColumn("PrevDraw1Vector", split_to_vector_udf("PrevDraw1"))
transformation_df = transformation_df.withColumn("PrevDraw2Vector", split_to_vector_udf("PrevDraw2"))
# Lag by 2
transformation_df = transformation_df.filter(F.col("PrevDraw1").isNotNull() & F.col("PrevDraw2").isNotNull())
transformation_df.show()


+----------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+
|      date|  numbers_raw_string|             numbers|partition_order|           PrevDraw1|           PrevDraw2|     PrevDraw1Vector|     PrevDraw2Vector|
+----------+--------------------+--------------------+---------------+--------------------+--------------------+--------------------+--------------------+
| 1-10-2023|97,841,1867,3645,...|[97, 841, 1867, 3...|              1|67,191,1275,3000,...|78,025,4128,6164,...|[67.0,191.0,1275....|[78.0,25.0,4128.0...|
| 1-11-2023|28,988,9327,3515,...|[28, 988, 9327, 3...|              1|97,841,1867,3645,...|67,191,1275,3000,...|[97.0,841.0,1867....|[67.0,191.0,1275....|
| 1-12-2021|92,637,4242,9713,...|[92, 637, 4242, 9...|              1|28,988,9327,3515,...|97,841,1867,3645,...|[28.0,988.0,9327....|[97.0,841.0,1867....|
| 1-12-2024|81,140,9845,0147,...|[81, 140, 9845, 0...|              1|

                                                                                

In [802]:
# Train a Machine Learning Model
# For simplicity, let's train a **Random Forest Classifier** using Apache Spark's MLlib to predict future numbers based on previous draws.
# Convert numbers into numeric/categorical features
indexer = StringIndexer(inputCol="numbers_raw_string", outputCol="label")
indexer_model = indexer.fit(transformation_df)
transformation_df = indexer.fit(transformation_df).transform(transformation_df)

# Assemble features from PrevDraw1Vector and PrevDraw2Vector
assembler = VectorAssembler(inputCols=["PrevDraw1Vector", "PrevDraw2Vector"], outputCol="features")
final_df = assembler.transform(transformation_df)

# Ensure the features column is properly constructed
final_df = final_df.select("date", "numbers_raw_string", "numbers", "PrevDraw1Vector", "PrevDraw2Vector", "features", "label")
final_df.show(5, truncate=False)

# Split the training and test data
train, test = final_df.randomSplit([0.8, 0.2], seed=40)

# Initialize the model
rf = RandomForestClassifier(featuresCol="features", labelCol="label")
# Train the model
rf_model = rf.fit(train)
# Predict on test data
predictions = rf_model.transform(test)

+---------+---------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|date     |numbers_raw_string                                                                                 |numbers                                                                                                

25/02/17 20:19:50 WARN DAGScheduler: Broadcasting large task binary with size 1316.3 KiB
25/02/17 20:19:50 WARN DAGScheduler: Broadcasting large task binary with size 2.0 MiB


Model evaluation

In [803]:
# Evaluate Accuracy
evaluator_acc = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)
accuracy = evaluator_acc.evaluate(predictions)

# Evaluate F1 Score
evaluator_f1 = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="f1"
)
f1_score = evaluator_f1.evaluate(predictions)

# Print the metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"F1 Score: {f1_score:.4f}")

25/02/17 20:19:51 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


Accuracy: 0.0000
F1 Score: 0.0000


25/02/17 20:19:52 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB


In [804]:
### **Predict the Next Draw**
# Predict based on the most recent draws
latest_data = final_df.orderBy(F.col("date").desc()).limit(1)
predicted_next = rf_model.transform(latest_data)

label_to_raw_string = indexer_model.labels
# Map predictions back to raw strings
predicted_next = predicted_next.withColumn(
    "predicted_numbers_raw_string",
    F.udf(lambda pred: label_to_raw_string[int(pred)], StringType())(F.col("prediction")))
# Show the predictions
predicted_next_result = predicted_next.select("prediction", "predicted_numbers_raw_string").first()
predicted_numbers_raw_string = predicted_next_result['predicted_numbers_raw_string'].split(",")
prediction_info_hash = {
    "prediction": predicted_next_result['prediction'],
    "g8": [predicted_numbers_raw_string[0]],
    "g7": [predicted_numbers_raw_string[1]],
    "g6": [predicted_numbers_raw_string[2],
           predicted_numbers_raw_string[3],
           predicted_numbers_raw_string[4]],
    "g5": [predicted_numbers_raw_string[5]],
    "g4": [predicted_numbers_raw_string[6],
           predicted_numbers_raw_string[7],
           predicted_numbers_raw_string[8],
           predicted_numbers_raw_string[9],
           predicted_numbers_raw_string[10],
           predicted_numbers_raw_string[11],
           predicted_numbers_raw_string[12]],
    "g3": [predicted_numbers_raw_string[13],
           predicted_numbers_raw_string[14]],
    "g2": [predicted_numbers_raw_string[15]],
    "g1": [predicted_numbers_raw_string[16]],
    "db": [predicted_numbers_raw_string[17]],
}
print(prediction_info_hash)

{'prediction': 302.0, 'g8': ['79'], 'g7': ['099'], 'g6': ['7163', '2021', '9892'], 'g5': ['3247'], 'g4': ['28467', '30091', '11069', '07695', '16123', '33210', '75216'], 'g3': ['56368', '29283'], 'g2': ['25713'], 'g1': ['83642'], 'db': ['871347']}


25/02/17 20:19:53 WARN DAGScheduler: Broadcasting large task binary with size 2040.5 KiB


In [805]:
spark.stop()