In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# ---------------------------------------------------------------------------
# Read the three CSV inputs (market valuations, injury records, sentiment).
# ---------------------------------------------------------------------------
market_df = pd.read_csv("/content/Feature Engineering/player_valuations_featured.csv")
injury_df = pd.read_csv("/content/Feature Engineering/Injury_data.csv")
sentiment_df = pd.read_csv("/content/Sentiment_Analysis/final_featured_dataset.csv")

# Cast each frame's player key to str so the keys share one dtype when joined.
for _frame, _key in (
    (market_df, 'player_id'),
    (injury_df, 'p_id2'),
    (sentiment_df, 'player_id'),
):
    _frame[_key] = _frame[_key].astype(str)

# NOTE(review): performance_df is not created by any statement in this cell;
# on a fresh kernel this line raises NameError unless the frame was loaded
# somewhere else first -- confirm where it comes from and load it above.
performance_df['player.name'] = performance_df['player.name'].astype(str)  # if needed

# Parse the calendar columns used as merge keys into datetime64.
for _frame in (market_df, sentiment_df):
    _frame['date'] = pd.to_datetime(_frame['date'])

# The injury data carries a year only: '%Y' maps it to a timestamp, and
# errors='coerce' turns anything unparseable into NaT instead of raising.
injury_df['start_year'] = pd.to_datetime(injury_df['start_year'], format='%Y', errors='coerce')


In [None]:
# Left join: every market row is kept; injury columns are NaN where no
# injury row shares the player key. Column names present in both frames keep
# the market name and get an '_inj' suffix on the injury side.
# NOTE(review): the join key is the player only, so a player with several
# injury rows multiplies the matching market rows -- confirm this is intended.
merged_df = pd.merge(
    market_df,
    injury_df,
    how='left',
    left_on='player_id',
    right_on='p_id2',
    suffixes=('', '_inj'),
)


In [None]:
# Attach sentiment columns by exact (player_id, date) match; rows without a
# sentiment record keep NaN in those columns. Clashing column names from the
# sentiment frame are suffixed with '_sent'.
merged_df = pd.merge(
    merged_df,
    sentiment_df,
    how='left',
    on=['player_id', 'date'],
    suffixes=('', '_sent'),
)


In [None]:
# Parse the event timestamps once; values that cannot be parsed become NaT.
_event_ts = pd.to_datetime(performance_df['timestamp'], errors='coerce')
performance_df['timestamp'] = _event_ts

# Keep only the calendar-day part (python date objects) for grouping by day.
# NOTE(review): if 'timestamp' holds a time of day without a calendar date,
# the value derived here will not be the match date -- confirm the column.
performance_df['date'] = _event_ts.dt.date


  performance_df['timestamp'] = pd.to_datetime(performance_df['timestamp'], errors='coerce')
  performance_df['date'] = performance_df['timestamp'].dt.date


In [None]:
# Build per-event 0/1 indicator columns on performance_df. They are summed
# per player and day by the aggregation cell further down.
# All flags are stored as int64 so the later numeric-column selection
# (float64 / int64) picks them up.

# Goal flag: 1 where the event type is exactly 'Goal', else 0 (NaN -> 0).
# NOTE(review): confirm 'Goal' really occurs as a value of 'type.name'.
performance_df['goals'] = performance_df['type.name'].eq('Goal').astype('int64')

# Assist flag: 1 only where 'pass.goal_assist' is present AND non-zero.
# The previous `x != 0` test evaluated to True for NaN (NaN != 0), so every
# event without the field was counted as an assist; missing values are now 0.
_goal_assist = performance_df['pass.goal_assist']
performance_df['assists'] = (_goal_assist.notna() & _goal_assist.ne(0)).astype('int64')

# Card flags: exact match on the card name; any other value or NaN gives 0.
_card_name = performance_df['bad_behaviour.card.name']
performance_df['yellow_cards'] = _card_name.eq('Yellow Card').astype('int64')
performance_df['red_cards'] = _card_name.eq('Red Card').astype('int64')

# Placeholder for minutes played: each event row contributes 1, so the later
# per-day sum is an event count rather than real minutes.
performance_df['minutes_played'] = 1  # or approximate by event counts per match


  performance_df['goals'] = performance_df['type.name'].apply(lambda x: 1 if x == 'Goal' else 0)
  performance_df['assists'] = performance_df['pass.goal_assist'].apply(lambda x: 1 if x != 0 else 0)
  performance_df['yellow_cards'] = performance_df['bad_behaviour.card.name'].apply(lambda x: 1 if x=='Yellow Card' else 0)
  performance_df['red_cards'] = performance_df['bad_behaviour.card.name'].apply(lambda x: 1 if x=='Red Card' else 0)
  performance_df['minutes_played'] = 1  # or approximate by event counts per match


In [None]:
# Re-derive the parsed timestamp and its calendar-day column. This repeats the
# earlier conversion cell; unparseable timestamps become NaT.
_event_ts = pd.to_datetime(performance_df['timestamp'], errors='coerce')
performance_df['timestamp'] = _event_ts
performance_df['date'] = _event_ts.dt.date


In [None]:
# Every indicator column is reduced with a plain sum.
agg_funcs = dict.fromkeys(
    ['goals', 'assists', 'minutes_played', 'yellow_cards', 'red_cards'],
    'sum',
)

# One row per (player name, day) holding the summed indicators. The name
# column is then relabelled 'player_id' and cast to str so it can act as a
# merge key.
# NOTE(review): the values in 'player_id' here are still player *names*;
# confirm they are comparable with the ids used by the other frames.
perf_agg = (
    performance_df
    .groupby(['player.name', 'date'])
    .agg(agg_funcs)
    .reset_index()
    .rename(columns={'player.name': 'player_id'})
    .astype({'player_id': str})
)


In [None]:
# Normalise the join keys again (safe to repeat) before rebuilding merged_df
# from scratch.
for _frame, _key in (
    (market_df, 'player_id'),
    (sentiment_df, 'player_id'),
    (injury_df, 'p_id2'),
    (perf_agg, 'player_id'),
):
    _frame[_key] = _frame[_key].astype(str)

for _frame in (market_df, sentiment_df, perf_agg):
    _frame['date'] = pd.to_datetime(_frame['date'])

# The injury data only has a year; unparseable values are coerced to NaT.
injury_df['start_year'] = pd.to_datetime(injury_df['start_year'], format='%Y', errors='coerce')

# Chain of left joins anchored on the market rows:
#   1. injury      -- by player key only (market 'player_id' vs injury 'p_id2')
#   2. sentiment   -- by (player_id, date)
#   3. performance -- by (player_id, date)
# Clashing column names from the right-hand frame get the listed suffix.
# NOTE(review): perf_agg['player_id'] is produced by renaming 'player.name' in
# the aggregation cell; if market_df['player_id'] holds ids rather than names,
# step 3 matches nothing and every performance column ends up NaN -- confirm.
merged_df = (
    market_df
    .merge(injury_df, left_on='player_id', right_on='p_id2', how='left', suffixes=('', '_inj'))
    .merge(sentiment_df, on=['player_id', 'date'], how='left', suffixes=('', '_sent'))
    .merge(perf_agg, on=['player_id', 'date'], how='left', suffixes=('', '_perf'))
)


In [None]:
# Column groups by dtype: float64/int64 count as numeric, object as categorical.
num_cols = merged_df.select_dtypes(include=['float64', 'int64']).columns
cat_cols = merged_df.select_dtypes(include=['object']).columns

# Replace gaps left by the left joins: 0 for numeric columns, the literal
# 'unknown' for object columns. Columns of any other dtype are not touched.
# NOTE(review): numeric columns include anything later used as a model target,
# so missing targets would become 0 as well -- confirm that is acceptable.
for _cols, _fill in ((num_cols, 0), (cat_cols, 'unknown')):
    merged_df[_cols] = merged_df[_cols].fillna(_fill)


In [None]:
# Model inputs, in the exact column order fed to the network.
features = [
    'player_age',
    # per-day sums coming from perf_agg
    'goals',
    'assists',
    'minutes_played',
    'yellow_cards',
    'red_cards',
    # remaining columns -- presumably from the market / injury / sentiment
    # frames; confirm each one exists in merged_df
    'total_transfers',
    'total_market_value',
    'season_days_injured',
    'cumulative_days_injured',
    'minutes_per_game_prev_seasons',
    'avg_days_injured_prev_seasons',
    'avg_games_per_season_prev_seasons',
    'bmi',
    'significant_injury_prev_season',
    'compound_mean',
    'compound_std',
    'compound_min',
    'compound_max',
    'polarity_mean',
    'polarity_std',
]

# Target is kept as a one-element list so y comes out 2-D, shape (rows, 1).
target = ['market_value_in_eur']

# Plain arrays: X is (rows, len(features)), y is (rows, 1).
y = merged_df[target].values
X = merged_df[features].values


In [None]:
from sklearn.preprocessing import MinMaxScaler

# Separate min-max scalers for inputs and target; scaler_y is kept so that
# predictions can be mapped back to the original target scale later.
# NOTE(review): both scalers are fitted on every row, including the rows that
# the training cell later holds out through validation_split -- confirm this
# leakage is acceptable or fit on the training part only.
scaler_X = MinMaxScaler().fit(X)
scaler_y = MinMaxScaler().fit(y)

X_scaled = scaler_X.transform(X)
y_scaled = scaler_y.transform(y)


In [None]:
import numpy as np

def create_sequences(X, y, n_input, n_output):
    """Cut aligned arrays into sliding (input window, target window) pairs.

    For each start position ``s`` (stride 1) the input window is
    ``X[s : s + n_input]`` and the target window is the ``n_output`` rows of
    ``y`` that immediately follow it.

    Args:
        X: Sequence/array of input rows, indexed along its first axis.
        y: Sequence/array of target rows aligned with ``X``.
        n_input: Number of rows in each input window.
        n_output: Number of rows in each target window.

    Returns:
        Tuple ``(X_seq, y_seq)`` of numpy arrays stacked along a new leading
        axis. When ``len(X) < n_input + n_output`` both arrays are empty.
    """
    window_count = len(X) - n_input - n_output + 1
    starts = range(window_count)  # empty when window_count <= 0
    input_windows = [X[s:s + n_input] for s in starts]
    target_windows = [y[s + n_input:s + n_input + n_output] for s in starts]
    return np.array(input_windows), np.array(target_windows)

# Window sizes: 10 consecutive rows in, the following 5 target rows out.
# NOTE(review): windows slide over consecutive rows of the scaled arrays;
# nothing visible here sorts by date or separates players, so a window may
# span rows of different players -- confirm the row order of merged_df.
n_input, n_output = 10, 5

X_seq, y_seq = create_sequences(
    X_scaled,
    y_scaled,
    n_input=n_input,
    n_output=n_output,
)


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, RepeatVector, TimeDistributed, Dense

# Encoder-decoder LSTM mapping an (n_input, n_features) window to
# n_output scalar predictions.

# Width of each time step, read from the last axis of the windowed inputs.
n_features = X_seq.shape[2]

# Encoder
# Consumes the whole input window; only its final hidden and cell states are
# kept (the per-step output is discarded into `_`).
# NOTE(review): activation='relu' replaces the LSTM default; per the Keras
# docs the fast cuDNN kernel requires the default tanh activation -- confirm
# the override is intentional.
encoder_inputs = Input(shape=(n_input, n_features))
encoder_lstm = LSTM(128, activation='relu', return_state=True)
_, state_h, state_c = encoder_lstm(encoder_inputs)
encoder_states = [state_h, state_c]

# Decoder
# The encoder's final hidden state is repeated n_output times to form the
# decoder input sequence, and the decoder LSTM starts from the encoder states.
decoder_inputs = RepeatVector(n_output)(state_h)
decoder_lstm = LSTM(128, activation='relu', return_sequences=True)
decoder_outputs = decoder_lstm(decoder_inputs, initial_state=encoder_states)
# One Dense(1) applied at every output step -> one value per predicted step.
decoder_dense = TimeDistributed(Dense(1))
decoder_outputs = decoder_dense(decoder_outputs)

# Single-input model trained with Adam on mean squared error.
model = Model(encoder_inputs, decoder_outputs)
model.compile(optimizer='adam', loss='mse')
model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop once val_loss has not improved for 5 consecutive epochs and roll the
# model back to the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Keras takes the validation part from the *last* 20% of the samples (before
# shuffling).
# NOTE(review): the windows were cut with stride 1, so the first validation
# windows overlap rows with the last training windows -- confirm acceptable.
history = model.fit(
    x=X_seq,
    y=y_seq,
    batch_size=128,
    epochs=50,
    validation_split=0.2,
    callbacks=[early_stop],
)


Epoch 1/50
[1m3104/3104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 71ms/step - loss: 9.0100e-04 - val_loss: 0.0018
Epoch 2/50
[1m3104/3104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 64ms/step - loss: 8.8110e-04 - val_loss: 0.0018
Epoch 3/50
[1m3104/3104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 62ms/step - loss: 8.7567e-04 - val_loss: 0.0018
Epoch 4/50
[1m3104/3104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 62ms/step - loss: 8.6627e-04 - val_loss: 0.0018
Epoch 5/50
[1m3104/3104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m192s[0m 62ms/step - loss: 8.6108e-04 - val_loss: 0.0018
Epoch 6/50
[1m3104/3104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 64ms/step - loss: 8.5954e-04 - val_loss: 0.0020
