In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import xgboost as xgb
import joblib

# --- Load Dataset ---
data_path = r'C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features_model_all_imputed.csv'
df = pd.read_csv(data_path)

# --- Encode Categorical Columns ---
work_rate_map = {'low': 0, 'medium': 1, 'high': 2}
df['attacking_work_rate'] = df['attacking_work_rate'].map(work_rate_map)
df['defensive_work_rate'] = df['defensive_work_rate'].map(work_rate_map)

# --- Feature Selection ---
features = [
    'overall_rating', 'potential', 'attacking_work_rate', 'defensive_work_rate',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'long_passing', 'ball_control', 'acceleration',
    'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping',
    'stamina', 'strength', 'long_shots', 'aggression', 'interceptions',
    'positioning', 'vision', 'penalties', 'marking', 'standing_tackle',
    'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
    'gk_reflexes', 'injury_count', 'total_days_out', 'days_per_injury'
]
target = 'overall_rating'

# --- Preprocessing ---
df = df.dropna(subset=features + ['player_name', target])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[features + [target]])

# --- Sequence Generation ---
n_steps = 3
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        seq_x = data[i:i+n_steps, :-1]
        seq_y = data[i+n_steps, -1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

X, y = create_sequences(scaled_data, n_steps)
player_names = df['player_name'].values[n_steps:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
player_test = player_names[-len(y_test):]

# --- LSTM Model ---
model = Sequential([
    LSTM(64, activation='relu', input_shape=(n_steps, X.shape[2])),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), verbose=0)

# --- LSTM Predictions ---
lstm_preds_scaled = model.predict(X_test).flatten()
dummy_test = np.zeros((len(y_test), len(features)+1))
dummy_preds = np.zeros((len(lstm_preds_scaled), len(features)+1))
dummy_test[:, -1] = y_test
dummy_preds[:, -1] = lstm_preds_scaled
y_test_original = scaler.inverse_transform(dummy_test)[:, -1]
lstm_preds_original = scaler.inverse_transform(dummy_preds)[:, -1]

# --- XGBoost Model ---
X_flat = X.reshape(X.shape[0], -1)
X_train_flat, X_test_flat, _, _ = train_test_split(X_flat, y, test_size=0.2, shuffle=False)
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1)
xgb_model.fit(X_train_flat, y_train)
xgb_preds = xgb_model.predict(X_test_flat)

# --- Ensemble Prediction ---
final_preds = (lstm_preds_original + xgb_preds) / 2

# --- Save Results ---
ensemble_df = pd.DataFrame({
    "Player": player_test,
    "Actual": y_test_original,
    "LSTM_Pred": lstm_preds_original,
    "XGB_Pred": xgb_preds,
    "Final_Pred": final_preds
})
ensemble_df.to_csv("ensemble_predictions.csv", index=False)
print("✅ Ensemble predictions saved to ensemble_predictions.csv")

# --- Evaluation ---
mae = mean_absolute_error(y_test_original, final_preds)
rmse = mean_squared_error(y_test_original, final_preds) ** 0.5
print(f"✅ Ensemble MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# --- Save Models ---
joblib.dump({
    "scaler": scaler,
    "features": features,
    "target": target,
    "lstm_model": model,
    "xgb_model": xgb_model
}, "ensemble_models.pkl")
print("✅ Models saved to ensemble_models.pkl")

  super().__init__(**kwargs)


[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step
✅ Ensemble predictions saved to ensemble_predictions.csv
✅ Ensemble MAE: 34.20, RMSE: 34.91
✅ Models saved to ensemble_models.pkl


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# --- Load Dataset ---
df = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features_model_all_imputed.csv")

# --- Encode Categorical Columns ---
work_rate_map = {'low': 0, 'medium': 1, 'high': 2}
df['attacking_work_rate'] = df['attacking_work_rate'].map(work_rate_map)
df['defensive_work_rate'] = df['defensive_work_rate'].map(work_rate_map)

# --- Valid Feature Set ---
features = [
    'overall_rating', 'potential', 'attacking_work_rate', 'defensive_work_rate',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'long_passing', 'ball_control', 'acceleration',
    'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping',
    'stamina', 'strength', 'long_shots', 'aggression', 'interceptions',
    'positioning', 'vision', 'penalties', 'marking', 'standing_tackle',
    'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
    'gk_reflexes', 'injury_count', 'total_days_out', 'days_per_injury'
]
target = 'overall_rating'

# --- Drop NaNs and Shuffle ---
df = df.dropna(subset=features + [target])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# --- Scale Features + Target Together ---
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[features + [target]])

# --- Create Sequences ---
def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        seq_x = data[i:i+n_steps, :-1]
        seq_y = data[i+n_steps, -1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

n_steps = 3
X, y = create_sequences(scaled_data, n_steps)

# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# --- Output Shapes ---
print("✅ Data prepared for LSTM modeling")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

✅ Data prepared for LSTM modeling
X_train shape: (69489, 3, 39)
X_test shape: (17373, 3, 39)
y_train shape: (69489,)
y_test shape: (17373,)


In [8]:
# --- Step 1: Data Preparation (from Code 2) ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features_model_all_imputed.csv")

work_rate_map = {'low': 0, 'medium': 1, 'high': 2}
df['attacking_work_rate'] = df['attacking_work_rate'].map(work_rate_map)
df['defensive_work_rate'] = df['defensive_work_rate'].map(work_rate_map)

features = [
    'overall_rating', 'potential', 'attacking_work_rate', 'defensive_work_rate',
    'crossing', 'finishing', 'heading_accuracy', 'short_passing', 'volleys',
    'dribbling', 'curve', 'long_passing', 'ball_control', 'acceleration',
    'sprint_speed', 'agility', 'reactions', 'balance', 'shot_power', 'jumping',
    'stamina', 'strength', 'long_shots', 'aggression', 'interceptions',
    'positioning', 'vision', 'penalties', 'marking', 'standing_tackle',
    'sliding_tackle', 'gk_diving', 'gk_handling', 'gk_kicking', 'gk_positioning',
    'gk_reflexes', 'injury_count', 'total_days_out', 'days_per_injury'
]
target = 'overall_rating'

df = df.dropna(subset=features + [target])
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[features + [target]])

def create_sequences(data, n_steps):
    X, y = [], []
    for i in range(len(data) - n_steps):
        seq_x = data[i:i+n_steps, :-1]
        seq_y = data[i+n_steps, -1]
        X.append(seq_x)
        y.append(seq_y)
    return np.array(X), np.array(y)

n_steps = 3
X, y = create_sequences(scaled_data, n_steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)

# --- Step 2: KerasTuner LSTM Tuning ---
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import keras_tuner as kt

def build_lstm_model(hp):
    model = keras.Sequential()
    model.add(
        layers.LSTM(
            units=hp.Int("units", min_value=32, max_value=128, step=32),
            activation="relu",
            input_shape=(X_train.shape[1], X_train.shape[2])
        )
    )
    model.add(
        layers.Dropout(
            rate=hp.Choice("dropout", values=[0.0, 0.2, 0.3])
        )
    )
    model.add(layers.Dense(1))
    model.compile(
        optimizer=keras.optimizers.Adam(
            learning_rate=hp.Choice("learning_rate", values=[1e-2, 1e-3, 1e-4])
        ),
        loss="mean_squared_error",
        metrics=["mae"]
    )
    return model

tuner = kt.RandomSearch(
    build_lstm_model,
    objective="val_loss",
    max_trials=5,
    executions_per_trial=1,
    directory="week7_tuning",
    project_name="lstm_tuning"
)

tuner.search(X_train, y_train, epochs=20, validation_data=(X_test, y_test), verbose=1)

# --- Step 3: Rebuild Best Model Manually ---
best_hp = tuner.get_best_hyperparameters(1)[0]

best_lstm_model = keras.Sequential()
best_lstm_model.add(
    layers.LSTM(
        units=best_hp.get("units"),
        activation="relu",
        input_shape=(X_train.shape[1], X_train.shape[2])
    )
)
best_lstm_model.add(
    layers.Dropout(rate=best_hp.get("dropout"))
)
best_lstm_model.add(layers.Dense(1))
best_lstm_model.compile(
    optimizer=keras.optimizers.Adam(learning_rate=best_hp.get("learning_rate")),
    loss="mean_squared_error",
    metrics=["mae"]
)

# --- Step 4: Train Best Model ---
best_lstm_model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test), verbose=1)

# --- Step 5: Evaluate and Save ---
val_loss, val_mae = best_lstm_model.evaluate(X_test, y_test, verbose=0)
print("✅ Best Hyperparameters:")
print(best_hp.values)
print(f"\nTuned LSTM - Validation Loss: {val_loss:.4f}, MAE: {val_mae:.4f}")

best_lstm_model.save("best_lstm_week7.h5")
print("✅ Tuned LSTM model saved as best_lstm_week7.h5")

Reloading Tuner from week7_tuning\lstm_tuning\tuner0.json


  super().__init__(**kwargs)


Epoch 1/20
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 6ms/step - loss: 0.0354 - mae: 0.1319 - val_loss: 0.0143 - val_mae: 0.0946
Epoch 2/20
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 6ms/step - loss: 0.0141 - mae: 0.0937 - val_loss: 0.0138 - val_mae: 0.0932
Epoch 3/20
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m51s[0m 23ms/step - loss: 0.0138 - mae: 0.0929 - val_loss: 0.0138 - val_mae: 0.0930
Epoch 4/20
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 13ms/step - loss: 0.0138 - mae: 0.0927 - val_loss: 0.0137 - val_mae: 0.0927
Epoch 5/20
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 8ms/step - loss: 0.0138 - mae: 0.0924 - val_loss: 0.0137 - val_mae: 0.0927
Epoch 6/20
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 13ms/step - loss: 0.0136 - mae: 0.0923 - val_loss: 0.0137 - val_mae: 0.0926
Epoch 7/20
[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m



✅ Best Hyperparameters:
{'units': 96, 'dropout': 0.0, 'learning_rate': 0.0001}

Tuned LSTM - Validation Loss: 0.0136, MAE: 0.0926
✅ Tuned LSTM model saved as best_lstm_week7.h5


In [9]:
import numpy as np
import xgboost as xgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# === STEP 3: Use tuned LSTM predictions in Ensemble ===

# Get predictions from tuned LSTM
lstm_train_preds = best_lstm_model.predict(X_train).flatten()
lstm_test_preds  = best_lstm_model.predict(X_test).flatten()

# Reshape original features for XGBoost (flatten the time steps)
X_train_reshaped = X_train.reshape(X_train.shape[0], -1)
X_test_reshaped  = X_test.reshape(X_test.shape[0], -1)

# Add tuned LSTM predictions as an extra feature
X_train_ensemble = np.hstack((X_train_reshaped, lstm_train_preds.reshape(-1,1)))
X_test_ensemble  = np.hstack((X_test_reshaped, lstm_test_preds.reshape(-1,1)))

# Train XGBoost model
xgboost_model = xgb.XGBRegressor(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
xgboost_model.fit(X_train_ensemble, y_train)

# === STEP 4: Evaluate Models ===

# 1. Tuned LSTM
lstm_rmse = np.sqrt(mean_squared_error(y_test, lstm_test_preds))
lstm_mae  = mean_absolute_error(y_test, lstm_test_preds)
lstm_r2   = r2_score(y_test, lstm_test_preds)

# 2. Ensemble (LSTM + XGBoost)
ensemble_preds = xgboost_model.predict(X_test_ensemble)
ensemble_rmse = np.sqrt(mean_squared_error(y_test, ensemble_preds))
ensemble_mae  = mean_absolute_error(y_test, ensemble_preds)
ensemble_r2   = r2_score(y_test, ensemble_preds)

print("\n=== FINAL MODEL EVALUATION (Week 7) ===")
print(f"Tuned LSTM   → RMSE: {lstm_rmse:.4f}, MAE: {lstm_mae:.4f}, R²: {lstm_r2:.4f}")
print(f"Ensemble     → RMSE: {ensemble_rmse:.4f}, MAE: {ensemble_mae:.4f}, R²: {ensemble_r2:.4f}")


[1m2172/2172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step
[1m543/543[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step

=== FINAL MODEL EVALUATION (Week 7) ===
Tuned LSTM   → RMSE: 0.1168, MAE: 0.0926, R²: -0.0042
Ensemble     → RMSE: 0.1170, MAE: 0.0927, R²: -0.0066


In [10]:
import pandas as pd

# Create comparison DataFrame
results = {
    "Model": ["Tuned LSTM", "Ensemble (LSTM+XGBoost)"],
    "RMSE": [lstm_rmse, ensemble_rmse],
    "MAE": [lstm_mae, ensemble_mae],
    "R2": [lstm_r2, ensemble_r2]
}

results_df = pd.DataFrame(results)

# Save to CSV
results_df.to_csv("week7_model_comparison.csv", index=False)

print("✅ Week 7 model comparison saved as 'week7_model_comparison.csv'")
print(results_df)


✅ Week 7 model comparison saved as 'week7_model_comparison.csv'
                     Model      RMSE       MAE        R2
0               Tuned LSTM  0.116829  0.092613 -0.004170
1  Ensemble (LSTM+XGBoost)  0.116971  0.092661 -0.006597


In [11]:
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb

# Define parameter grid for XGBoost
param_dist = {
    "n_estimators": [100, 200, 300],
    "max_depth": [3, 5, 7, 9],
    "learning_rate": [0.01, 0.05, 0.1, 0.2],
    "subsample": [0.6, 0.8, 1.0],
    "colsample_bytree": [0.6, 0.8, 1.0]
}

xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# Run RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=xgb_reg,
    param_distributions=param_dist,
    n_iter=10,  # try 10 random combinations
    scoring="neg_mean_squared_error",
    cv=3,
    verbose=1,
    n_jobs=-1
)

random_search.fit(X_train_ensemble, y_train)

print("✅ Best XGBoost Parameters:", random_search.best_params_)

# Best model
best_xgb_model = random_search.best_estimator_

# Evaluate tuned XGBoost ensemble
ensemble_preds_tuned = best_xgb_model.predict(X_test_ensemble)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
ensemble_rmse_tuned = np.sqrt(mean_squared_error(y_test, ensemble_preds_tuned))
ensemble_mae_tuned  = mean_absolute_error(y_test, ensemble_preds_tuned)
ensemble_r2_tuned   = r2_score(y_test, ensemble_preds_tuned)

print(f"\n🔹 Tuned Ensemble (LSTM+XGBoost) → RMSE: {ensemble_rmse_tuned:.4f}, MAE: {ensemble_mae_tuned:.4f}, R²: {ensemble_r2_tuned:.4f}")


Fitting 3 folds for each of 10 candidates, totalling 30 fits


1 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\M.ANTONY ROJES\Downloads\Infosys\.venv\lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\M.ANTONY ROJES\Downloads\Infosys\.venv\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "c:\Users\M.ANTONY ROJES\Downloads\Infosys\.venv\lib\site-packages\xgboost\sklearn.py", line 1143, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\M.ANTONY ROJES\Downloads\Infosys\.venv\lib\site-packages\xgboost\sklearn

✅ Best XGBoost Parameters: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.01, 'colsample_bytree': 0.6}

🔹 Tuned Ensemble (LSTM+XGBoost) → RMSE: 0.1166, MAE: 0.0923, R²: -0.0006


In [12]:
# Extend comparison DataFrame
results = {
    "Model": ["Tuned LSTM", "Ensemble (LSTM+XGBoost)", "Tuned Ensemble (XGBoost)"],
    "RMSE": [lstm_rmse, ensemble_rmse, ensemble_rmse_tuned],
    "MAE": [lstm_mae, ensemble_mae, ensemble_mae_tuned],
    "R2": [lstm_r2, ensemble_r2, ensemble_r2_tuned]
}

results_df = pd.DataFrame(results)

# Save updated comparison
results_df.to_csv("week7_model_comparison.csv", index=False)

print("✅ Updated Week 7 comparison saved as 'week7_model_comparison.csv'")
print(results_df)


✅ Updated Week 7 comparison saved as 'week7_model_comparison.csv'
                      Model      RMSE       MAE        R2
0                Tuned LSTM  0.116829  0.092613 -0.004170
1   Ensemble (LSTM+XGBoost)  0.116971  0.092661 -0.006597
2  Tuned Ensemble (XGBoost)  0.116624  0.092347 -0.000638


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(time_steps, X.shape[2])),
    LSTM(32),
    Dense(1)  # predict market value
])
model.compile(optimizer='adam', loss='mse')
history = model.fit(X, y, epochs=50, batch_size=8, verbose=1)


In [None]:
# WEEK 7 - MULTIVARIATE & ENCODER-DECODER

import numpy as np
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# 1. Clean the data
transfer_df['Age'] = pd.to_numeric(transfer_df['Age'], errors='coerce')
transfer_df['Market Value'] = pd.to_numeric(transfer_df['Market Value'], errors='coerce')

transfer_clean = transfer_df[['Market Value', 'Age']].dropna(how="any")

print("transfer_clean shape:", transfer_clean.shape)
print(transfer_clean.head())

# 2. Scale features
scaler = MinMaxScaler()
features = transfer_clean.values
scaled_features = scaler.fit_transform(features)

# 3. Create sequences
time_steps = 5   # hyperparameter: window size
def create_sequences(data, time_steps=5):
    X, y = [], []
    for i in range(len(data) - time_steps):
        X.append(data[i:i+time_steps])
        y.append(data[i+time_steps, 0])  # predict Market Value
    return np.array(X), np.array(y)

X, y = create_sequences(scaled_features, time_steps)
X = X.reshape((X.shape[0], X.shape[1], features.shape[1]))

print("X shape:", X.shape, "y shape:", y.shape)

# 4. Build Encoder-Decoder LSTM
model = Sequential([
    LSTM(64, return_sequences=True, input_shape=(time_steps, features.shape[1])),
    LSTM(32),
    Dense(1)
])
model.compile(optimizer='adam', loss='mse')

# 5. Train model
history = model.fit(X, y, epochs=50, batch_size=8, verbose=1)

# 6. Evaluate model
loss = model.evaluate(X, y, verbose=0)
print("Final MSE Loss:", loss)
