# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Updated imports for Functional API and Attention layer
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, AdditiveAttention, GlobalAveragePooling1D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback
from collections import Counter
import matplotlib.pyplot as plt
import os

In [None]:
# File the trained model is written to by model.save() once training finishes.
# The path is relative, so the file lands in the current working directory.
MODEL_SAVE_PATH = 'best_lottery_lstm_model_attention.keras'

# 1. DATA LOADING AND PREPARATION

In [None]:
# Load the draw history and keep only the six number columns as a 2-D array
# (one row per draw).
# NOTE(review): rows are presumably in chronological order, oldest first; the
# sequence generation and the un-shuffled splits rely on that - confirm.
df = pd.read_csv('mega_6_45.csv')
result_cols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6']
results = df[result_cols].values

# Number characteristics
NUM_MAX = 45          # numbers are one-hot encoded into NUM_MAX slots (1..45)
SEQUENCE_LENGTH = 10  # number of consecutive draws fed to the model per sample

# >>> FEATURE PARAMETER DECLARATION <<<
T_LOOKBACK = 5               # window (in draws) used for the frequency feature
NUM_ADDITIONAL_FEATURES = 2  # engineered features per number: frequency and recency
# Width of one flattened timestep: (one-hot + additional features) per number.
NUM_TOTAL_FEATURES_FLAT = NUM_MAX * (1 + NUM_ADDITIONAL_FEATURES) # 45 * 3 = 135
NUM_TOTAL_FEATURES = NUM_TOTAL_FEATURES_FLAT # Alias; not referenced elsewhere in the visible code
# -------------------------------------

print(f"Total number of historical draws: {len(results)}")
print(f"Sequence length (timesteps) used: {SEQUENCE_LENGTH}")

# Notebook display of the most recent rows as a quick sanity check.
df.tail()

# 2. PREPROCESSING FUNCTION: ONE-HOT ENCODING & SEQUENCE GENERATION

In [None]:
def create_advanced_sequences(data, seq_length=SEQUENCE_LENGTH, num_max=NUM_MAX, T_lookback=T_LOOKBACK):
    """
    Build LSTM input windows and next-draw targets from the draw history.

    Every draw is described by three values per number (1..num_max):
      * channel 0 - one-hot flag: 1 if the number was drawn in this draw;
      * channel 1 - frequency: how often the number occurred in the
        T_lookback draws before this one, divided by (T_lookback * 6);
        left at 0 for the first T_lookback draws;
      * channel 2 - recency: 1 / (d + 1), where d is the number of draws
        since the number last appeared (d = 2 * T_lookback when it has not
        appeared yet), so a smaller value means a colder number.

    The number of engineered channels is read from the module-level
    NUM_ADDITIONAL_FEATURES constant.

    Args:
        data: 2-D integer array-like, one row per draw, values in 1..num_max.
        seq_length: number of consecutive draws per input window.
        num_max: largest drawable number (width of the one-hot encoding).
        T_lookback: number of preceding draws used for the frequency feature.

    Returns:
        Tuple (X, Y). X has shape
        (len(data) - seq_length, seq_length, num_max * (1 + NUM_ADDITIONAL_FEATURES))
        and Y has shape (len(data) - seq_length, num_max); Y[i] is the one-hot
        encoding of the draw that immediately follows window X[i]. When there
        are not enough draws for a single window, both arrays are empty but
        keep their trailing dimensions.
    """
    data = np.asarray(data)
    num_samples = len(data)
    num_channels = 1 + NUM_ADDITIONAL_FEATURES

    # 1. One-Hot Encoding
    one_hot_data = np.zeros((num_samples, num_max), dtype=int)
    for i, row in enumerate(data):
        one_hot_data[i, row - 1] = 1

    # 2. Advanced Features
    advanced_features = np.zeros((num_samples, num_max, NUM_ADDITIONAL_FEATURES))

    never_seen_dsld = T_lookback * 2  # default distance for numbers not drawn yet
    # last_seen[j] is the index of the latest draw BEFORE the current one that
    # contained number j + 1, or -1 if there is none. Updating it as we go
    # replaces a backwards scan over the history for every (draw, number) pair.
    last_seen = np.full(num_max, -1, dtype=int)

    for i in range(num_samples):
        # Feature 1: Frequency in the T draws BEFORE draw 'i'
        if i >= T_lookback:
            counts = np.zeros(num_max, dtype=int)
            # np.add.at accumulates repeated indices, matching a per-number "+= 1".
            np.add.at(counts, data[i - T_lookback:i].ravel() - 1, 1)
            advanced_features[i, :, 0] = counts / (T_lookback * 6)  # Normalization

        # Feature 2: Coldness (Draws Since Last Drawn - DSLD), as 1 / (dsld + 1)
        dsld = np.where(last_seen >= 0, i - last_seen, never_seen_dsld)
        advanced_features[i, :, 1] = 1.0 / (dsld + 1)

        # Only now record draw 'i', so its own numbers never leak into its features.
        row = data[i]
        in_range = row[(row >= 1) & (row <= num_max)]
        last_seen[in_range - 1] = i

    # 3. Combine features
    full_feature_data = np.zeros((num_samples, num_max, num_channels))
    full_feature_data[:, :, 0] = one_hot_data
    full_feature_data[:, :, 1:] = advanced_features

    # 4. Create sequences
    num_windows = num_samples - seq_length
    if num_windows <= 0:
        # Not enough history for a single window: return well-shaped empties.
        return (
            np.empty((0, seq_length, num_max * num_channels)),
            np.empty((0, num_max), dtype=int),
        )

    # Flatten each draw to (num_max * num_channels,) once, then slice windows.
    flat_features = full_feature_data.reshape(num_samples, num_max * num_channels)
    X = np.stack([flat_features[start:start + seq_length] for start in range(num_windows)])
    # The target of window 'start' is the draw right after it.
    Y = one_hot_data[seq_length:].copy()

    return X, Y

# Build the model inputs X and the next-draw targets Y from the full draw
# history, using the function's default window length and lookback.
X, Y = create_advanced_sequences(results)
print(f"X shape: {X.shape}")
print(f"Y shape: {Y.shape}")

# 3. TRAIN/VALIDATION/TEST SPLIT

In [None]:
# Hold out the last 20% of the sequences as the test set, keeping their order.
# Slice at an explicit boundary index instead of a negative offset: when
# test_size rounds down to 0, X[:-0] would be empty and X[-0:] would be the
# whole array, silently swapping the train and test sets.
test_size = int(len(X) * 0.2)
split_index = len(X) - test_size
X_train, X_test = X[:split_index], X[split_index:]
Y_train, Y_test = Y[:split_index], Y[split_index:]

# Take the validation set from the tail of the remaining data; shuffle=False
# keeps the original order so validation samples come after the training ones.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train, Y_train, test_size=0.1, shuffle=False
)

print(f"Train set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Test set size: {len(X_test)}")

# 4. METRIC DEFINITION AND DISTRIBUTION FUNCTION

In [None]:
def calculate_hits_array(y_true_one_hot, y_pred_prob, k=6):
    """
    Count, for each draw, how many of the k highest-scored numbers were drawn.

    Args:
        y_true_one_hot: 2-D array (n_draws, n_numbers) of 0/1 flags marking
            the numbers that were actually drawn.
        y_pred_prob: 2-D array (n_draws, n_numbers) of predicted scores.
        k: how many top-scored numbers form the prediction for each draw
            (defaults to 6).

    Returns:
        1-D array of length n_draws with the hit count of each draw.

    Raises:
        ValueError: if k is not between 1 and n_numbers.
    """
    y_true_one_hot = np.asarray(y_true_one_hot)
    y_pred_prob = np.asarray(y_pred_prob)

    num_classes = y_pred_prob.shape[-1]
    if not 1 <= k <= num_classes:
        raise ValueError(f"k must be between 1 and {num_classes}, got {k}")

    # argpartition places the k largest scores in the last k positions
    # (in arbitrary order), which is all a top-k membership test needs.
    top_k_indices = np.argpartition(y_pred_prob, -k, axis=-1)[:, -k:]

    # Gather the true flags at the predicted positions; their sum is the hit count.
    hits = np.take_along_axis(y_true_one_hot, top_k_indices, axis=1).sum(axis=1)

    return hits

def print_hit_distribution(hits_array):
    """
    Print how many draws scored 0, 1, ..., 6 hits, with each share in percent.

    hits_array is a sequence of per-draw hit counts; an empty sequence prints
    zero counts and 0.0% for every bucket.
    """
    hit_counts = Counter(hits_array)
    total_samples = len(hits_array)

    print("\n--- Hit Rate Distribution ---")
    # Buckets run from 0 hits up to and including 6 hits.
    for i in range(7):
        count = hit_counts[i]  # a Counter yields 0 for buckets that never occurred
        if total_samples > 0:
            percentage = (count / total_samples) * 100
        else:
            percentage = 0
        print(f"    {i} hits: {count} times ({percentage:.1f}%)")

# 5. CALLBACK DEFINITION FOR PER-EPOCH EVALUATION

In [None]:
class HitRateCallback(Callback):
    """
    Scores the model on a fixed validation set at the end of every epoch.

    Two entries are written into the epoch logs:
      * 'val_hit_count' - mean number of correctly predicted numbers per draw;
      * 'val_hit_ratio' - the same value divided by 6 (0-1 scale).
    """
    def __init__(self, X_val, Y_val):
        super().__init__()
        # Validation inputs and one-hot targets are kept on the instance and
        # re-evaluated each epoch. Passing validation_data=(X_val, Y_val) to
        # model.fit additionally makes Keras log val_loss on its own.
        self.X_val, self.Y_val = X_val, Y_val

    def on_epoch_end(self, epoch, logs=None):
        # A missing (or empty) logs mapping is replaced by a fresh dict.
        if not logs:
            logs = {}

        # Predict on the validation inputs and average the per-draw hit counts.
        predictions = self.model.predict(self.X_val, verbose=0)
        mean_hits = np.mean(calculate_hits_array(self.Y_val, predictions))

        # Publish both metrics through the epoch logs.
        logs['val_hit_count'] = mean_hits
        logs['val_hit_ratio'] = mean_hits / 6  # fraction of the 6 numbers hit

# 6. LSTM MODEL CONSTRUCTION, TRAINING, AND SAVING

## 6.1. Model Construction and Callback Initialization

In [None]:
# Each sample is a window of SEQUENCE_LENGTH draws, every draw flattened to
# NUM_TOTAL_FEATURES_FLAT values.
input_layer = Input(shape=(SEQUENCE_LENGTH, NUM_TOTAL_FEATURES_FLAT), name='input_sequence') # (10, 135)

# 1. LSTM: Extract sequential features. return_sequences=True keeps one output
#    vector per timestep, which the attention layer needs to attend over.
lstm_out = LSTM(256, return_sequences=True)(input_layer)
lstm_out = Dropout(0.3)(lstm_out)

# 2. Self-Attention Mechanism: the layer is called with [query, value]; both
#    are lstm_out, so every timestep attends over the whole sequence.
attention_output = AdditiveAttention(name='self_attention')([lstm_out, lstm_out]) # query and value are both lstm_out

# 3. Collapse the time axis by averaging the attended timesteps
attention_pooled = GlobalAveragePooling1D()(attention_output)

# 4. Final Dense classification layer
dense_out = Dense(128, activation='relu')(attention_pooled)
dense_out = Dropout(0.3)(dense_out)
# One independent sigmoid per number: the target is a multi-hot vector with
# several 1s per draw, hence sigmoid outputs with binary cross-entropy below.
output_layer = Dense(NUM_MAX, activation='sigmoid', name='output_layer')(dense_out) # Output is 45 (for One-Hot)

model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(loss='binary_crossentropy', optimizer=optimizer)

model.summary()

# The callback needs the validation arrays, so it is created only after the
# train/validation split has produced X_val and Y_val.
hit_rate_callback = HitRateCallback(X_val, Y_val)

## 6.2. Training and Saving the LSTM Model

In [None]:
print("\n--- Starting LSTM + ATTENTION Model Training ---")
# Train for a fixed 20 epochs. validation_data makes Keras log val_loss each
# epoch; hit_rate_callback adds val_hit_count / val_hit_ratio to the same logs,
# so all of them end up in history.history.
history = model.fit(
    X_train, Y_train,
    epochs=20,
    batch_size=32,
    validation_data=(X_val, Y_val),
    callbacks=[hit_rate_callback],
    verbose=1
)

# NOTE(review): this saves the model as it is after the LAST epoch. No
# checkpoint or early-stopping callback is used, so despite the "best_" prefix
# in the file name this is not necessarily the best-scoring epoch.
model.save(MODEL_SAVE_PATH)
print(f"\nModel saved successfully to: {MODEL_SAVE_PATH}")

## 6.3. PROCESSING AND PRINTING TRAINING HISTORY RESULTS

In [None]:
import pandas as pd

# Summarise the per-epoch validation metrics recorded during training.
if history is not None and history.history:
    # Convert log history to DataFrame for easy processing (one row per epoch)
    history_df = pd.DataFrame(history.history)
    history_df.index.name = 'Epoch'
    history_df.index = history_df.index + 1 # Shift labels so epochs count from 1

    print("\n" + "=" * 70)
    print("  PERFORMANCE SUMMARY ON VALIDATION SET AFTER TRAINING  ")
    print("=" * 70)

    # Mean of the custom hit metrics across all epochs. These columns exist
    # only when HitRateCallback was passed to model.fit.
    avg_hit_count = history_df['val_hit_count'].mean()
    avg_hit_ratio = history_df['val_hit_ratio'].mean()

    print(f"\n  **Average results over all {len(history_df)} Epochs:**")
    print(f"   | Avg val_hit_count : {avg_hit_count:>6.3f} correct numbers")
    print(f"   | Avg val_hit_ratio : {avg_hit_ratio:>6.3f} ({avg_hit_ratio * 100:>6.2f}%)")

    # Best epoch (max val_hit_count). idxmax returns the index label, which is
    # the 1-based epoch number after the shift above.
    best_epoch_index = history_df['val_hit_count'].idxmax()
    best_hit_count = history_df.loc[best_epoch_index, 'val_hit_count']
    best_hit_ratio = history_df.loc[best_epoch_index, 'val_hit_ratio']

    print(f"\n  **Epoch with the Best Performance (Max Hit Count):**")
    print(f"   | Epoch Index         : {best_epoch_index}")
    print(f"   | Best val_hit_count  : {best_hit_count:>6.3f} correct numbers / 6")
    print(f"   | Best val_hit_ratio  : {best_hit_ratio:>6.3f} ({best_hit_ratio * 100:>6.2f}%)")

    # Validation loss at that same epoch, when it was logged
    if 'val_loss' in history_df.columns:
        best_val_loss = history_df.loc[best_epoch_index, 'val_loss']
        print(f"\n  **Validation Loss at best epoch:** {best_val_loss:.4f}")

    print("=" * 70)
else:
    print("\nWarning: Training history not found. Please check the `history` variable.")

## 6.4. PLOTTING MODEL PERFORMANCE AFTER TRAINING

In [None]:
def plot_training_history(history):
    """
    Draw a two-panel summary of a finished training run.

    Left panel: training loss per epoch and, when recorded, validation loss
    with its minimum marked. Right panel (only when 'val_hit_count' was
    recorded): validation hit count per epoch with its mean as a dashed line
    and its maximum marked.

    Prints a message and returns without plotting when the history is missing
    or empty, or when it holds no 'loss' values.
    """
    # Guard: nothing to plot without a populated history.
    if not (history and history.history):
        print("No training history (history.history) found to plot.")
        return

    metrics = history.history

    loss = metrics.get('loss')
    if not loss:
        print("Metric 'loss' not found in history.")
        return

    # Optional series; None when they were not logged.
    val_loss = metrics.get('val_loss')
    val_hit_count = metrics.get('val_hit_count')

    epochs = range(1, len(loss) + 1)  # 1-based epoch axis
    grid_style = {'linestyle': '--', 'alpha': 0.6}

    plt.figure(figsize=(14, 5))

    # ---- Panel 1: loss curves (train vs validation) ----
    plt.subplot(1, 2, 1)
    plt.plot(epochs, loss, 'b-', linewidth=2, label='Train Loss')

    if val_loss is not None:
        plt.plot(epochs, val_loss, 'r-', linewidth=2, label='Validation Loss')
        # Highlight the epoch with the lowest validation loss.
        best_epoch_loss = np.argmin(val_loss) + 1
        min_val_loss = np.min(val_loss)
        plt.plot(
            best_epoch_loss, min_val_loss, 'ro', markersize=5,
            label=f'Best Val Loss ({min_val_loss:.4f} @{best_epoch_loss})',
        )

    plt.title('Training and Validation Loss (Binary Crossentropy)')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend(loc='upper right')
    plt.grid(True, **grid_style)

    # ---- Panel 2: validation hit count ----
    if val_hit_count is not None:
        plt.subplot(1, 2, 2)
        plt.plot(epochs, val_hit_count, 'g-', linewidth=2, label='Validation Hit Count')

        # Dashed reference line at the mean over all epochs.
        avg_hit = np.mean(val_hit_count)
        plt.axhline(avg_hit, color='orange', linestyle='--', label=f'Avg Hit Count ({avg_hit:.2f})')

        # Highlight the epoch with the highest hit count.
        best_epoch_hit = np.argmax(val_hit_count) + 1
        max_val_hit = np.max(val_hit_count)
        plt.plot(
            best_epoch_hit, max_val_hit, 'go', markersize=5,
            label=f'Max Hit Count ({max_val_hit:.2f} @{best_epoch_hit})',
        )

        plt.title('Validation Hit Count (Avg Hits / 6)')
        plt.xlabel('Epochs')
        plt.ylabel('Average Number of Hits')
        plt.ylim(bottom=0)
        plt.legend(loc='lower left')
        plt.grid(True, **grid_style)

    plt.suptitle('LSTM + ATTENTION MODEL TRAINING HISTORY ANALYSIS', fontsize=16, y=1.03)
    plt.tight_layout()
    plt.show()

plot_training_history(history) # history is the object returned by model.fit()

# 7. FINAL EVALUATION ON TEST SET AND DISTRIBUTION DISPLAY

In [None]:
# Score the held-out test set with the model as it is after training.
y_test_pred = model.predict(X_test, verbose=0)

# 1. Calculate the array of hits per draw (top-6 predicted vs. actually drawn)
test_hits_array = calculate_hits_array(Y_test, y_test_pred)

# 2. Calculate the average Hit-Rate
# For reference: choosing 6 of 45 numbers uniformly at random matches
# 6 * 6 / 45 = 0.8 numbers per draw on average.
final_avg_hit_rate = np.mean(test_hits_array)

# 3. Print Hit Rate distribution
print_hit_distribution(test_hits_array)

print("\n--- Performance Summary ---")
print(f"Number of draws in Test set: {len(X_test)}")
print(f"AVERAGE Hit-Rate on Test Set: {final_avg_hit_rate:.4f} correct numbers (out of 6)")
print(f"Average correct prediction ratio per sequence: {(final_avg_hit_rate/6)*100:.2f}%")

# 8. USING THE SAVED MODEL TO PREDICT BASED ON THE LATEST 10 DRAWS

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import load_model


NUM_MAX = 45 # Maximum number (1-45)
SEQUENCE_LENGTH = 10 # Length of the input history sequence (Time Step Window)

# These parameters must match the values used in the training phase
T_LOOKBACK = 5
NUM_ADDITIONAL_FEATURES = 2
NUM_TOTAL_FEATURES_FLAT = NUM_MAX * (1 + NUM_ADDITIONAL_FEATURES)


# Load the model from the same path the training step saved it to. If loading
# fails, reuse the in-memory model from the current session when there is one;
# otherwise re-raise, because nothing below can run without a model.
try:
    loaded_model = load_model(MODEL_SAVE_PATH)
    print("Successfully loaded the saved model.")
except Exception as e:
    print(f"Error loading model: {e}")
    if 'model' not in globals():
        raise
    loaded_model = model
    print("Falling back to the in-memory model from the training session.")


# 1. Load full data (needed to get the entire history for feature calculation)
df_full = pd.read_csv('mega_6_45.csv')
result_cols = ['num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6']
all_results = df_full[result_cols].values

if len(all_results) < SEQUENCE_LENGTH:
    raise ValueError(
        f"Need at least {SEQUENCE_LENGTH} historical draws to predict, "
        f"got {len(all_results)}"
    )

# 2. Build the input window that covers the LAST SEQUENCE_LENGTH draws.
# create_advanced_sequences only emits windows that are followed by a known
# target draw, so on the raw history its final window stops one draw short of
# the newest draw. Appending a placeholder row for the not-yet-drawn next draw
# yields one more window that ends on the newest real draw. The placeholder
# only supplies the (discarded) target: every real draw's features depend
# solely on that draw and the draws before it.
padded_results = np.vstack([all_results, all_results[-1:]])
X_full, _ = create_advanced_sequences(
    padded_results,
    seq_length=SEQUENCE_LENGTH,
    num_max=NUM_MAX,
    T_lookback=T_LOOKBACK
)

# 3. Extract the last input (the input for the next draw)
# new_input: (1, 10, 135)
new_input = X_full[-1].reshape(1, SEQUENCE_LENGTH, NUM_TOTAL_FEATURES_FLAT)

print(f"New Input shape for prediction: {new_input.shape}")

# Predict one score per number for the next draw
predicted_prob = loaded_model.predict(new_input, verbose=0)[0]

# Indices (0-44) of the 6 highest scores, in descending score order
predicted_indices = np.argsort(predicted_prob)[::-1][:6]
# Convert indices (0-44) to numbers (1-45), still aligned with predicted_indices
top_numbers_by_prob = predicted_indices + 1
# Separate ascending copy for display only. Sorting in place would break the
# number-to-probability pairing used in the table below.
predicted_numbers = np.sort(top_numbers_by_prob)

# Table of the 6 selected numbers, each paired with its own probability
top_6_results = pd.DataFrame({
    'Number': top_numbers_by_prob,
    'Probability': predicted_prob[predicted_indices]
}).sort_values(by='Probability', ascending=False).reset_index(drop=True)

print(f"Last historical data used: {df_full['date'].iloc[-SEQUENCE_LENGTH:].values}")
print(f"\n6 Numbers predicted by the LSTM model as most likely:")
print(f"{predicted_numbers}\n")
print(top_6_results.to_markdown(index=False))