In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import joblib
import os

# --- Load Merged Dataset ---
data_path = r'C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features_model_all_imputed.csv'
df = pd.read_csv(data_path)

# --- Select Features and Target ---
features = [
    'overall_rating', 'potential', 'crossing', 'finishing', 'heading_accuracy',
    'short_passing', 'volleys', 'dribbling', 'curve', 'long_passing',
    'ball_control', 'acceleration', 'sprint_speed', 'agility', 'reactions',
    'balance', 'shot_power', 'jumping', 'stamina', 'strength', 'long_shots',
    'aggression', 'interceptions', 'positioning', 'vision', 'penalties',
    'marking', 'standing_tackle', 'sliding_tackle'
]
target = 'overall_rating'  # You can change this to market value if available

# --- Sequence Length ---
# Number of consecutive rows per input window. It must be assigned before the
# player-name slice below; previously it was assigned afterwards, which raised
# NameError on a fresh kernel.
n_steps = 3

# --- Drop NaNs and Scale ---
# The scaled matrix has len(features) + 1 columns: every feature followed by the
# target as the LAST column (the target is currently also one of the features,
# so that column appears twice).
df = df.dropna(subset=features + [target])
scaler = MinMaxScaler()
# NOTE(review): the scaler is fit on every remaining row, including rows that may
# later be held out for testing — confirm this is acceptable.
scaled_data = scaler.fit_transform(df[features + [target]])

# --- Extract Player Names (aligned with sequences) ---
# Window i predicts row i + n_steps, so the first n_steps names have no target.
player_names = df['player_name'].values[n_steps:]

# --- Create Time-Series Sequences ---
def create_sequences(data, n_steps):
    """Cut a 2-D array into sliding windows and next-row targets.

    For every start row s, the input is rows s .. s+n_steps-1 with the last
    column removed, and the target is the last column of row s+n_steps.

    Args:
        data: 2-D array whose last column holds the target.
        n_steps: number of consecutive rows in each input window.

    Returns:
        (X, y) as numpy arrays; X stacks the windows, y the matching targets.
    """
    starts = range(len(data) - n_steps)
    X = [data[s:s + n_steps, :-1] for s in starts]
    y = [data[s + n_steps, -1] for s in starts]
    return np.array(X), np.array(y)

# Window the scaled matrix into model inputs (X) and next-row targets (y).
X, y = create_sequences(scaled_data, n_steps)

# --- Train-Test Split ---
# shuffle=False keeps row order, so the test set is the final 20% of the windows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=False, test_size=0.2
)
player_test = player_names[-len(y_test):]  # Align player names with test set

# --- Build LSTM Model ---
# One LSTM layer reading (n_steps, n_features) windows, then a single linear output.
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(n_steps, X.shape[2])))
model.add(Dense(1))
model.compile(loss='mean_squared_error', optimizer='adam')
# The test split is also passed as validation data here.
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=50,
    verbose=0,
)

# --- Make Predictions ---
preds_scaled = model.predict(X_test).flatten()

# --- Inverse Transform Predictions ---
# The scaler was fit on len(features) + 1 columns with the target last, so the
# values are placed in the last column of a zero matrix of that width and only
# that column is read back after inversion.
n_scaled_cols = len(features) + 1

dummy_test = np.zeros((len(y_test), n_scaled_cols))
dummy_test[:, -1] = y_test
y_test_original = scaler.inverse_transform(dummy_test)[:, -1]

dummy_preds = np.zeros((len(preds_scaled), n_scaled_cols))
dummy_preds[:, -1] = preds_scaled
preds_original = scaler.inverse_transform(dummy_preds)[:, -1]

# --- Save Predictions to CSV with Player Name ---
result_columns = {
    "Player": player_test,
    "Actual": y_test_original,
    "Predicted": preds_original
}
results_df = pd.DataFrame(result_columns)
results_df.to_csv("value_predictions.csv", index=False)
print("✅ Predictions saved to value_predictions.csv")

# --- Save Model and Data to PKL ---
# Everything needed to reload the experiment goes into one joblib file.
artifacts = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
    "scaler": scaler,
    "features": features,
    "target": target,
    "model": model
}
joblib.dump(artifacts, "market_value_lstm.pkl")
print("✅ Model and data saved to market_value_lstm.pkl")

In [7]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Hold-out error of the LSTM, computed on the inverse-transformed (unscaled) values.
mae, mse = (
    metric(y_test_original, preds_original)
    for metric in (mean_absolute_error, mean_squared_error)
)
rmse = mse ** 0.5  # RMSE is the square root of the MSE
print(f"MAE: {mae:.2f}, RMSE: {rmse:.2f}")

MAE: 1.74, RMSE: 3.33


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import xgboost as xgb
import joblib
import os

# --- Load Dataset ---
data_path = r'C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features_model_all_imputed.csv'
df = pd.read_csv(data_path)

# --- Feature Selection ---
features = [
    'overall_rating', 'potential', 'crossing', 'finishing',
    'heading_accuracy', 'short_passing', 'volleys', 'dribbling',
    'curve', 'long_passing', 'ball_control', 'acceleration',
    'sprint_speed', 'agility', 'reactions', 'balance',
    'shot_power', 'jumping', 'stamina', 'strength',
    'long_shots', 'aggression', 'interceptions', 'positioning',
    'vision', 'penalties', 'marking', 'standing_tackle',
    'sliding_tackle',
    # New features
    'injury_score', 'sentiment_score',
]
target = 'overall_rating'

# --- Preprocessing ---
# Columns handed to the scaler: every feature, then the target as the last column.
model_columns = features + [target]

# Drop incomplete rows, then shuffle reproducibly (seed 42) and renumber the index.
# NOTE(review): shuffling reorders the rows; any later step that treats consecutive
# rows as a sequence will see them in random order — confirm this is intended.
df = (
    df.dropna(subset=model_columns)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[model_columns])

# --- Sequence Generation ---
n_steps = 3  # rows per input window
def create_sequences(data, n_steps):
    """Build sliding-window inputs and the target that follows each window.

    Each window holds the n_steps rows before row `end`, without the last
    column; its target is the last column of row `end`.

    Args:
        data: 2-D array with the target in its last column.
        n_steps: window length in rows.

    Returns:
        Tuple (X, y) of numpy arrays: stacked windows and their targets.
    """
    windows, targets = [], []
    for end in range(n_steps, len(data)):
        windows.append(data[end - n_steps:end, :-1])
        targets.append(data[end, -1])
    return np.array(windows), np.array(targets)

X, y = create_sequences(scaled_data, n_steps)
# Window i predicts row i + n_steps, so names are offset by n_steps to line up with y.
player_names = df['player_name'].values[n_steps:]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
player_test = player_names[-len(y_test):]


def _to_original_units(scaled_target):
    """Map MinMax-scaled target values back to the target's original units.

    The scaler was fit on len(features) + 1 columns with the target last, so the
    values are placed in the last column of a zero matrix of that width and only
    that column is read back after inversion.
    """
    padded = np.zeros((len(scaled_target), len(features) + 1))
    padded[:, -1] = scaled_target
    return scaler.inverse_transform(padded)[:, -1]


# --- LSTM Model ---
model = Sequential([
    LSTM(64, activation='relu', input_shape=(n_steps, X.shape[2])),
    Dense(1)
])
model.compile(optimizer='adam', loss='mean_squared_error')
# The test split is also passed as validation data here.
model.fit(X_train, y_train, epochs=50, validation_data=(X_test, y_test), verbose=0)

# --- LSTM Predictions ---
lstm_preds_scaled = model.predict(X_test).flatten()
y_test_original = _to_original_units(y_test)
lstm_preds_original = _to_original_units(lstm_preds_scaled)

# --- XGBoost Model ---
# XGBoost takes 2-D input, so each (n_steps, n_features) window is flattened into
# one row. Reshaping the existing splits keeps the rows identical to the LSTM's.
X_train_flat = X_train.reshape(len(X_train), -1)
X_test_flat = X_test.reshape(len(X_test), -1)
xgb_model = xgb.XGBRegressor(n_estimators=100, max_depth=5, learning_rate=0.1)
xgb_model.fit(X_train_flat, y_train)
# y_train is MinMax-scaled, so these predictions are in scaled units too. They
# must be converted back before being averaged with the LSTM output or reported;
# previously the scaled values were mixed with original-unit values.
xgb_preds_scaled = xgb_model.predict(X_test_flat)
xgb_preds = _to_original_units(xgb_preds_scaled)

# --- Ensemble Prediction ---
# Simple average of the two models, both in the target's original units.
final_preds = (lstm_preds_original + xgb_preds) / 2

# --- Save Results ---
ensemble_df = pd.DataFrame({
    "Player": player_test,
    "Actual": y_test_original,
    "LSTM_Pred": lstm_preds_original,
    "XGB_Pred": xgb_preds,
    "Final_Pred": final_preds
})
ensemble_df.to_csv("ensemble_predictions.csv", index=False)
print("✅ Ensemble predictions saved to ensemble_predictions.csv")

# --- Evaluation ---
mae = mean_absolute_error(y_test_original, final_preds)
# Root of the MSE instead of `squared=False`, which newer scikit-learn releases
# no longer accept.
rmse = mean_squared_error(y_test_original, final_preds) ** 0.5
print(f"✅ Ensemble MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# --- Save Models ---
# NOTE(review): whether a Keras model survives a joblib/pickle round trip depends
# on the installed Keras version — confirm this file loads back correctly.
joblib.dump({
    "scaler": scaler,
    "features": features,
    "target": target,
    "lstm_model": model,
    "xgb_model": xgb_model
}, "ensemble_models.pkl")
print("✅ Models saved to ensemble_models.pkl")