In [None]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Project root that holds final_data.csv and receives the processed/ and models/ folders.
# Set the PLAYER_VALUE_BASE_DIR environment variable to point the notebook at another
# location; when it is unset the original directory below is used unchanged.
base_path = Path(os.environ.get("PLAYER_VALUE_BASE_DIR", r"C:\Users\Dharun Kumar\OneDrive\Desktop\Infosys Springboard"))
raw_path = base_path / "final_data.csv"
if not raw_path.is_file():
	# Same exception type pd.read_csv would raise, but with an actionable hint.
	raise FileNotFoundError(
		f"Raw dataset not found at {raw_path}; set PLAYER_VALUE_BASE_DIR to the folder containing final_data.csv"
	)
df = pd.read_csv(raw_path)
# Untouched snapshot of the raw table; later cells modify df in place.
df_original = df.copy()
df.head(3)


In [None]:
# Normalize headers to snake_case so the rest of the notebook can use fixed names.
# str() keeps this from failing on non-string labels (e.g. an unnamed integer column).
df.columns = [str(c).strip().lower().replace(" ", "_") for c in df.columns]

# Columns expected to be numeric. Unparseable entries become NaN (errors="coerce").
num_like = [
	"height","age","appearance","goals","assists","yellow_cards","second_yellow_cards","red_cards",
	"goals_conceded","clean_sheets","minutes_played","days_injured","games_injured","award",
	"current_value","highest_value","position_encoded","winger"
]
# Restrict to the columns that actually exist, and reuse that list for the summary
# below; indexing df with the full list would raise KeyError on any missing column.
num_present = [c for c in num_like if c in df.columns]
for c in num_present:
	df[c] = pd.to_numeric(df[c], errors="coerce")

# Text columns: cast to pandas' string dtype and trim surrounding whitespace.
str_like = ["player","team","name","position"]
for c in str_like:
	if c in df.columns:
		df[c] = df[c].astype("string").str.strip()

# Quick data-quality overview shown as the cell output.
basic_info = {
	"shape": df.shape,
	"missing_perc": df.isna().mean().sort_values(ascending=False).head(20),
	"dtypes": df.dtypes,
	# describe() refuses a frame without columns, so fall back to an empty frame.
	"describe_num": df[num_present].describe() if num_present else pd.DataFrame()
}
basic_info


In [None]:
# Players with no recorded minutes are treated as having played 0 minutes.
df["minutes_played"] = df["minutes_played"].fillna(0)

# Denominators with zeros masked to NaN, so a division yields NaN rather than inf.
mp = df["minutes_played"].replace(0, np.nan)
app = df["appearance"].replace(0, np.nan)
games_total = (df["games_injured"] + df["appearance"]).replace(0, np.nan)

# Per-90-minute rates for each counting statistic.
per90_counts = {
	"goals_p90": df["goals"],
	"assists_p90": df["assists"],
	"yc_p90": df["yellow_cards"],
	"s2y_p90": df["second_yellow_cards"],
	"rc_p90": df["red_cards"],
	"ga_p90": df["goals"] + df["assists"],
}
per90 = {col: counts * 90 / mp for col, counts in per90_counts.items()}

# Injury burden and availability, relative to appearances / total games.
avail = {
	"injury_days_per_app": df["days_injured"] / app,
	"injury_games_share": df["games_injured"] / games_total,
	"availability_rate": df["appearance"] / games_total,
}

# Market-value ratios (all derived from current_value).
value_feats = {
	"value_to_peak_ratio": df["current_value"] / df["highest_value"].replace(0, np.nan),
	"value_per_appearance": df["current_value"] / app,
	"value_per_90": df["current_value"] / mp * 90,
}

# Write every ratio back to df; undefined results (NaN / +-inf) are stored as 0.
for feature_group in (per90, avail, value_feats):
	for col, ratio in feature_group.items():
		df[col] = ratio.replace([np.inf, -np.inf], np.nan).fillna(0)

# Collapse the free-text position into a coarse role group.
# Rules are checked in order and the first match wins; anything unmatched is "other".
pos = df["position"].fillna("").str.lower()
role_rules = [
	("gk", "goalkeeper"),
	("def", "defender"),
	("mid", "midfield"),
	("att", "winger|forward|striker|centre-forward|rightwinger|leftwinger"),
]
# np.select only accepts plain boolean ndarrays. A pandas "string" column yields
# nullable-boolean masks, so convert each one explicitly (na=False leaves no NA).
role_masks = [pos.str.contains(pattern, na=False).to_numpy(dtype=bool) for _, pattern in role_rules]
role_grp = np.select(role_masks, [label for label, _ in role_rules], default="other")
# role_grp is a NumPy array, which has no "category" dtype (ndarray.astype("category")
# raises TypeError); build the categorical through pandas instead.
df["role_group"] = pd.Categorical(role_grp)

# Integer winger flag; if the source column is absent every row gets 0.
if "winger" in df.columns:
	winger_raw = df["winger"]
else:
	winger_raw = pd.Series(index=df.index, dtype=float)
df["is_winger"] = winger_raw.fillna(0).astype(int)

# Age buckets; the lowest edge (0) is included in the first bin.
stage_edges = [0, 20, 24, 28, 32, 100]
stage_names = ["teen", "u24", "peak", "u32", "veteran"]
career_stage = pd.cut(df["age"], bins=stage_edges, labels=stage_names, include_lowest=True)
df["career_stage"] = career_stage.astype("category")

# Weighted card count per appearance: yellow=1, second yellow=2, red=5.
card_cols = ["yellow_cards", "second_yellow_cards", "red_cards"]
card_weights = [1, 2, 5]
weighted_cards = df[card_cols].fillna(0).dot(card_weights)
cards_per_app = weighted_cards / (app.replace(0, np.nan))
# Undefined ratios (no appearances) are stored as 0.
df["discipline_score"] = cards_per_app.replace([np.inf, -np.inf], np.nan).fillna(0)

# Names of the engineered columns, grouped by type.
num_new = [
	"goals_p90", "assists_p90", "yc_p90", "s2y_p90", "rc_p90", "ga_p90",
	"injury_days_per_app", "injury_games_share", "availability_rate",
	"value_to_peak_ratio", "value_per_appearance", "value_per_90", "discipline_score",
]
cat_new = ["role_group", "career_stage", "team", "position"]

# Snapshot of the fully engineered table, followed by a small preview as cell output.
df_engineered = df.copy()
engineered_preview = {
	"engineered_numeric_preview": df_engineered[num_new].head(3),
	"engineered_categoricals_preview": df_engineered[cat_new].head(3),
}
engineered_preview


In [None]:
# Persist the engineered table under <base_path>/processed, creating the folder if needed.
engineered_path = base_path / "processed" / "engineered_dataset.csv"
out_dir = engineered_path.parent
out_dir.mkdir(parents=True, exist_ok=True)
df_engineered.to_csv(engineered_path, index=False)

# Summary of what was written, shown as the cell output.
n_rows, n_cols = df_engineered.shape
export_summary = dict(
	saved_path=str(engineered_path),
	rows=n_rows,
	cols=n_cols,
	numeric_features=[
		"goals_p90", "assists_p90", "yc_p90", "s2y_p90", "rc_p90", "ga_p90",
		"injury_days_per_app", "injury_games_share", "availability_rate",
		"value_to_peak_ratio", "value_per_appearance", "value_per_90", "discipline_score",
	],
	categorical_features=["role_group", "career_stage", "team", "position", "is_winger"],
)
export_summary


In [None]:
import sys, subprocess
try:
	import tensorflow as tf
except Exception:
	# TensorFlow could not be imported: install the pinned CPU build into the
	# environment of the running interpreter, then retry the import.
	import importlib
	try:
		# subprocess.run drains the captured pipes while pip runs. check_call with
		# stdout/stderr=PIPE never reads them, so a verbose install can fill the
		# OS pipe buffer and block forever.
		subprocess.run(
			[sys.executable, "-m", "pip", "install", "tensorflow-cpu==2.15.0"],
			check=True,
			capture_output=True,
			text=True,
		)
	except subprocess.CalledProcessError as pip_error:
		# Show pip's own diagnostics before propagating the failure.
		print(pip_error.stderr or "", file=sys.stderr)
		raise
	# Let the import system see the package that was installed after startup.
	importlib.invalidate_caches()
	import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# Seed both NumPy (synthetic series, shuffling) and TensorFlow (weight init, batch
# order) so that a fresh Restart & Run All reproduces the same data and models.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Reload the engineered table written by the export cell.
engineered_path = Path(base_path) / "processed" / "engineered_dataset.csv"
dfe = pd.read_csv(engineered_path)

# Per-player features that drive the synthetic dynamics.
features_dyn = [
	"ga_p90","availability_rate","injury_games_share","discipline_score"
]
value_col = "current_value"
T_hist = 12      # length of the simulated history window
H_forecast = 3   # number of future steps to predict

# Missing values are imputed with the column median before simulation.
vals = dfe[value_col].fillna(dfe[value_col].median()).to_numpy()
feat_mat = dfe[features_dyn].fillna(dfe[features_dyn].median()).to_numpy()

# Look the driver columns up by name instead of relying on positions 0 and 2.
ga = feat_mat[:, features_dyn.index("ga_p90")]
inj = feat_mat[:, features_dyn.index("injury_games_share")]
ga_min, ga_max = np.nanmin(ga), np.nanmax(ga)
inj_min, inj_max = np.nanmin(inj), np.nanmax(inj)

# Smoothing weight in [0.6, 0.8], increasing with goal contributions per 90.
alpha = 0.6 + 0.2 * (ga - ga_min) / (ga_max - ga_min + 1e-9)
# Relative drift: up with above-average output, down with above-average injury share.
trend = 0.02 * (ga - ga.mean()) - 0.03 * (inj - inj.mean())
# Relative noise level in [0.05, 0.15], increasing with injury share.
noise_scale = 0.05 + 0.1 * (inj - inj_min) / (inj_max - inj_min + 1e-9)

def _next_level(level, i):
	"""Advance player i's synthetic value by one step.

	The new level is a blend of the previous level and the trend-adjusted observed
	value, plus Gaussian noise proportional to the previous level. The result is
	floored at 0 so the carried level can never be negative; a negative level would
	make the next np.random.normal call fail with "scale < 0".
	"""
	eps = np.random.normal(0, noise_scale[i] * level)
	return max(alpha[i] * level + (1 - alpha[i]) * vals[i] * (1 + trend[i]) + eps, 0.0)


# Simulated value history, one row per player. Each row is filled from the last
# slot back to the first, starting from the observed value (at least 1.0).
hist = np.zeros((len(vals), T_hist))
for i in range(len(vals)):
	level = max(vals[i], 1.0)
	for t in range(T_hist - 1, -1, -1):
		level = _next_level(level, i)
		hist[i, t] = level

# Dynamic feature sequences: each static feature repeated over time with small
# Gaussian jitter, then clipped to that feature's 0.5th-99.5th percentile range.
dyn_seq = np.zeros((len(vals), T_hist, len(features_dyn)))
for j in range(len(features_dyn)):
	base = feat_mat[:, j:j+1]
	noise = np.random.normal(0, 0.02, size=(len(vals), T_hist))
	dyn = np.repeat(base, T_hist, axis=1) + noise
	dyn = np.clip(dyn, np.percentile(dyn, 0.5), np.percentile(dyn, 99.5))
	dyn_seq[:, :, j] = dyn

# Forecast targets: continue the same process for H_forecast steps, starting
# from the last history slot.
y_targets = np.zeros((len(vals), H_forecast))
for i in range(len(vals)):
	level = hist[i, -1]
	for h in range(H_forecast):
		level = _next_level(level, i)
		y_targets[i, h] = level

# Model inputs: value history alone, and value history plus the dynamic features.
X_univariate = hist.reshape(len(vals), T_hist, 1)
X_multivariate = np.concatenate([X_univariate, dyn_seq], axis=2)

# Random 80/20 split over players.
idx = np.arange(len(vals))
np.random.shuffle(idx)
train_sz = int(0.8 * len(idx))
train_idx, val_idx = idx[:train_sz], idx[train_sz:]

# Value scale: median of the imputed values floored at 1.0. Using `vals` rather
# than the raw column matters because np.median returns NaN if any entry is NaN,
# which would turn every scaled tensor (and the training loss) into NaN.
scale_val = np.median(np.clip(vals, 1.0, None))

# Index arrays produce copies, so the in-place scaling below cannot touch the
# unsplit X_multivariate / y_targets arrays.
X_uni_tr = X_univariate[train_idx] / scale_val
X_uni_va = X_univariate[val_idx] / scale_val
X_multi_tr = X_multivariate[train_idx]
X_multi_va = X_multivariate[val_idx]
# Only channel 0 (the value history) is in currency units; the feature channels
# are left on their own scale.
X_multi_tr[:, :, 0] /= scale_val
X_multi_va[:, :, 0] /= scale_val

# Targets: the full horizon for the sequence model, first step for single-step models.
y_tr_seq = y_targets[train_idx] / scale_val
y_va_seq = y_targets[val_idx] / scale_val
y_tr_1 = y_tr_seq[:, 0].copy()
y_va_1 = y_va_seq[:, 0].copy()

X_uni_tr.shape, X_multi_tr.shape, y_tr_1.shape, y_tr_seq.shape


In [None]:
from tensorflow import keras
from tensorflow.keras import layers

# Single-step forecaster that sees only the scaled value history.
uni_model = keras.Sequential()
uni_model.add(layers.Input(shape=(T_hist, 1)))
uni_model.add(layers.LSTM(64, return_sequences=False))
uni_model.add(layers.Dense(32, activation="relu"))
uni_model.add(layers.Dense(1))
uni_model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")

h_uni = uni_model.fit(
	X_uni_tr,
	y_tr_1,
	validation_data=(X_uni_va, y_va_1),
	epochs=25,
	batch_size=64,
	verbose=0,
)

# Validation metrics, computed on the scaled targets.
va_pred_uni = uni_model.predict(X_uni_va, verbose=0).ravel()
resid_uni = va_pred_uni - y_va_1
mae_uni = np.mean(np.abs(resid_uni))
rmse_uni = np.sqrt(np.mean(resid_uni**2))
r2_uni = 1 - np.sum(resid_uni**2) / np.sum((y_va_1 - y_va_1.mean())**2 + 1e-9)

# Training vs. validation loss per epoch.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(h_uni.history["loss"], label="train")
ax.plot(h_uni.history["val_loss"], label="val")
ax.set_title("Univariate LSTM Loss")
ax.legend()
fig.tight_layout()
plt.show()

mae_uni, rmse_uni, r2_uni


In [None]:
# Single-step forecaster fed with the value history plus the dynamic feature channels.
n_channels = X_multi_tr.shape[-1]
multi_layers = [
	layers.Input(shape=(T_hist, n_channels)),
	layers.LSTM(64, return_sequences=False),
	layers.Dense(64, activation="relu"),
	layers.Dense(1),
]
multi_model = keras.Sequential(multi_layers)
multi_model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")

fit_options = dict(epochs=25, batch_size=64, verbose=0)
h_multi = multi_model.fit(X_multi_tr, y_tr_1, validation_data=(X_multi_va, y_va_1), **fit_options)

# Validation metrics, computed on the scaled targets.
va_pred_multi = multi_model.predict(X_multi_va, verbose=0).ravel()
resid_multi = va_pred_multi - y_va_1
sq_resid_multi = resid_multi**2
mae_multi = np.mean(np.abs(resid_multi))
rmse_multi = np.sqrt(np.mean(sq_resid_multi))
r2_multi = 1 - np.sum(sq_resid_multi) / np.sum((y_va_1 - y_va_1.mean())**2 + 1e-9)

# Training vs. validation loss per epoch.
fig, ax = plt.subplots(figsize=(6, 3))
for history_key, curve_label in (("loss", "train"), ("val_loss", "val")):
	ax.plot(h_multi.history[history_key], label=curve_label)
ax.set_title("Multivariate LSTM Loss")
ax.legend()
fig.tight_layout()
plt.show()

mae_multi, rmse_multi, r2_multi


In [None]:
# Encoder-decoder forecaster: the encoder compresses the input window into one
# vector, which is repeated H_forecast times and decoded into one value per step.
encoder_lstm = layers.LSTM(64, return_sequences=False)
repeat_horizon = layers.RepeatVector(H_forecast)
decoder_lstm = layers.LSTM(64, return_sequences=True)
step_head = layers.TimeDistributed(layers.Dense(1))

enc_in = layers.Input(shape=(T_hist, X_multi_tr.shape[-1]))
enc_l1 = encoder_lstm(enc_in)
rep = repeat_horizon(enc_l1)
dec_l1 = decoder_lstm(rep)
out = step_head(dec_l1)

encdec_model = keras.Model(enc_in, out)
encdec_model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")

# Targets get a trailing axis of size 1 to match the model's per-step output.
y_tr_seq_3d = y_tr_seq.reshape(-1, H_forecast, 1)
y_va_seq_3d = y_va_seq.reshape(-1, H_forecast, 1)
h_enc = encdec_model.fit(
	X_multi_tr,
	y_tr_seq_3d,
	validation_data=(X_multi_va, y_va_seq_3d),
	epochs=30,
	batch_size=64,
	verbose=0,
)

# Validation metrics pooled over all forecast steps, on the scaled targets.
va_pred_seq = encdec_model.predict(X_multi_va, verbose=0).squeeze(-1)
resid_seq = va_pred_seq - y_va_seq
mae_seq = np.mean(np.abs(resid_seq))
rmse_seq = np.sqrt(np.mean(resid_seq**2))
r2_seq = 1 - np.sum(resid_seq**2) / np.sum((y_va_seq - y_va_seq.mean())**2 + 1e-9)

# Training vs. validation loss per epoch.
fig, ax = plt.subplots(figsize=(6, 3))
ax.plot(h_enc.history["loss"], label="train")
ax.plot(h_enc.history["val_loss"], label="val")
ax.set_title("Encoder-Decoder LSTM Loss")
ax.legend()
fig.tight_layout()
plt.show()

mae_seq, rmse_seq, r2_seq


In [None]:
# Output folder for the trained models and their validation metrics.
models_dir = base_path / "models"
models_dir.mkdir(parents=True, exist_ok=True)

uni_path = models_dir / "univariate_lstm.keras"
multi_path = models_dir / "multivariate_lstm.keras"
encdec_path = models_dir / "encdec_lstm.keras"

# Save each trained model to its own .keras file.
for trained_model, target_path in (
	(uni_model, uni_path),
	(multi_model, multi_path),
	(encdec_model, encdec_path),
):
	trained_model.save(target_path)

# One row per model with the validation metrics computed in the training cells.
metric_rows = [
	("univariate", mae_uni, rmse_uni, r2_uni),
	("multivariate", mae_multi, rmse_multi, r2_multi),
	("encdec_seq", mae_seq, rmse_seq, r2_seq),
]
metrics = pd.DataFrame(metric_rows, columns=["model", "mae", "rmse", "r2"])
metrics_path = models_dir / "metrics.csv"
metrics.to_csv(metrics_path, index=False)
metrics, str(metrics_path)
