In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import matplotlib.pyplot as plt

# --- 1. Find and Load the Master Dataset ---
# Pick the player with the most rows in the match-performance file, then load
# that player's consolidated master dataset.
#
# Each file is read in its own try block so that every error message refers
# only to values that are guaranteed to exist at that point (a single shared
# handler would hit a NameError on `player_name` when the performance file
# itself is the one that is missing).
performance_file = 'statsbomb_match_performance.csv'

try:
    df_perf = pd.read_csv(performance_file)
except FileNotFoundError:
    print(f"❌ Performance file '{performance_file}' not found. Cannot determine which player to load.")
    # SystemExit is raised directly: the `exit()` helper comes from the `site`
    # module and is intended for interactive use, not for programs.
    raise SystemExit(1) from None

# value_counts() sorts by frequency (descending), so the first index entry is
# the player with the most rows.
player_match_counts = df_perf['player_name'].value_counts()
if player_match_counts.empty:
    print(f"❌ Performance file '{performance_file}' contains no player rows.")
    raise SystemExit(1)

player_name = player_match_counts.index[0]  # Get the top player
player_name_for_file = player_name.replace(' ', '_')

print(f"Found player with most data: {player_name}")

try:
    # The first column is used as the index and parsed as dates.
    df_master = pd.read_csv(
        f'master_dataset_{player_name_for_file}.csv',
        index_col=0,
        parse_dates=[0],
    )
except FileNotFoundError:
    print(f"❌ Master dataset for '{player_name}' not found. Please run '03_data_consolidation.ipynb' first to generate it.")
    raise SystemExit(1) from None

print(f"✅ Successfully loaded master dataset for {player_name}")

# --- 2. Prepare Data for Univariate LSTM ---
# Work on a copy of the single market-value column so df_master is not modified.
market_value_data = df_master[['market_value_eur']].copy()
sequence_length = 5  # number of consecutive past values in each input window

# --- 3. Data Validation Check ---
# At least one (window, target) pair requires strictly more rows than the
# window length.
has_enough_data = len(market_value_data) > sequence_length

if has_enough_data:
    print(f"\n✅ Dataset has enough data ({len(market_value_data)} points). Proceeding with training.")

    # --- 4. Scale Data and Create Sequences ---
    scaler = MinMaxScaler(feature_range=(0, 1))
    scaled_data = scaler.fit_transform(market_value_data)

    # For every target position t, the input is the `sequence_length` scaled
    # values immediately before it and the label is the scaled value at t.
    target_positions = range(sequence_length, len(scaled_data))
    X_train = np.array(
        [scaled_data[t - sequence_length:t, 0] for t in target_positions]
    )
    y_train = np.array([scaled_data[t, 0] for t in target_positions])

    # Add a trailing feature axis: (samples, timesteps) -> (samples, timesteps, 1).
    n_samples, n_timesteps = X_train.shape
    X_train = np.reshape(X_train, (n_samples, n_timesteps, 1))

    # --- 5. Build and Train the LSTM Model ---
    print("\nBuilding and training the LSTM model...")
    model = Sequential()
    # The first LSTM returns the full sequence so the second LSTM can consume it.
    model.add(LSTM(units=50, return_sequences=True, input_shape=(n_timesteps, 1)))
    model.add(LSTM(units=50))
    model.add(Dense(units=1))

    model.compile(optimizer='adam', loss='mean_squared_error')
    model.summary()

    history = model.fit(X_train, y_train, epochs=25, batch_size=32, verbose=1)

    # --- 6. Plot Training Loss ---
    loss_per_epoch = history.history['loss']
    plt.figure(figsize=(10, 6))
    plt.plot(loss_per_epoch)
    plt.title('Model Training Loss')
    plt.ylabel('Loss')
    plt.xlabel('Epoch')
    plt.show()

    print("\n✅ Model training complete!")
else:
    print(f"\n❌ Error: Dataset is still too small to train the model.")
    print(f"   We need more than {sequence_length} data points, but the dataset only has {len(market_value_data)}.")

Found player with most data: Ellen White
✅ Successfully loaded master dataset for Ellen White

❌ Error: Dataset is still too small to train the model.
   We need more than 5 data points, but the dataset only has 1.
