In [None]:
from sklearn.metrics import classification_report
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Number of ordinal bins used for every discretized variable in the network.
__BINS = 3

# Integer codes for the three match outcomes (also the states of MatchOutcome).
HOME_WIN, DRAW, AWAY_WIN = 0, 1, 2

# dataset
data = pd.read_csv('../../cleaned_data/final_dataset.csv')

# outcome as numerical category
data['result_code'] = data['result'].map(
    {'Home Win': HOME_WIN, 'Draw': DRAW, 'Away Win': AWAY_WIN})
data['MatchOutcome'] = data['result_code']

# continuous features that get binned, per side
features_home = ['home_avg_market_value',
                 'home_nationalities', 'home_avg_age', 'home_total_minutes']
features_away = ['away_avg_market_value',
                 'away_nationalities', 'away_avg_age', 'away_total_minutes']
disc_home_cols = [f'disc_{col}' for col in features_home]
disc_away_cols = [f'disc_{col}' for col in features_away]

# split 8/2 on row labels BEFORE fitting anything, so that bin edges, feature
# weights and CPTs are all learned from training rows only (no test leakage).
train_idx, test_idx = train_test_split(
    data.index, test_size=0.2, random_state=42)

# discretize continuous features: bin edges come from the training rows,
# then the fitted discretizers are applied to every row.
kbins_home = KBinsDiscretizer(
    n_bins=__BINS, encode='ordinal', strategy='uniform')
kbins_away = KBinsDiscretizer(
    n_bins=__BINS, encode='ordinal', strategy='uniform')
kbins_home.fit(data.loc[train_idx, features_home])
kbins_away.fit(data.loc[train_idx, features_away])

data_discrete_home = kbins_home.transform(data[features_home]).astype(int)
data_discrete_away = kbins_away.transform(data[features_away]).astype(int)

# put discretized values back into data
for i, col in enumerate(disc_home_cols):
    data[col] = data_discrete_home[:, i]

for i, col in enumerate(disc_away_cols):
    data[col] = data_discrete_away[:, i]

# feature weight learning
X_home = data[disc_home_cols]
print(X_home)
X_away = data[disc_away_cols]
y = data['MatchOutcome']

X_home_train, X_home_test = X_home.loc[train_idx], X_home.loc[test_idx]
X_away_train, X_away_test = X_away.loc[train_idx], X_away.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]

# learn feature weights with logistic regression
# (multi_class is not passed: it is deprecated, and lbfgs already fits a
# multinomial model for a multi-class target)
clf_home = LogisticRegression(solver='lbfgs', max_iter=500)
clf_home.fit(X_home_train, y_train)

clf_away = LogisticRegression(solver='lbfgs', max_iter=500)
clf_away.fit(X_away_train, y_train)


def _contrast_weights(clf, favoured, opposed):
    """Return coef_[favoured] - coef_[opposed] for a fitted multi-class model.

    Averaging coef_ over classes is useless here: the per-class coefficients
    of the penalized softmax model cancel out, leaving weights of ~1e-13.
    The difference between two class rows is the log-odds contribution of
    each feature towards `favoured` relative to `opposed`.
    """
    classes = list(clf.classes_)
    return clf.coef_[classes.index(favoured)] - clf.coef_[classes.index(opposed)]


# learned weights: a higher strength means "this side is more likely to win"
weights_home = _contrast_weights(clf_home, HOME_WIN, AWAY_WIN)
weights_away = _contrast_weights(clf_away, AWAY_WIN, HOME_WIN)

# display weights
weights_df = pd.DataFrame({
    'Feature': features_home + features_away,
    'Weights': list(weights_home) + list(weights_away)
})
print("Learned Feature Weights:\n", weights_df)

# apply weights
data['HomeStrength'] = (X_home * weights_home).sum(axis=1)
data['AwayStrength'] = (X_away * weights_away).sum(axis=1)

# re-discretize the strengths; bin edges again come from training rows only
kbins_home_strength = KBinsDiscretizer(
    n_bins=__BINS, encode='ordinal', strategy='uniform')
kbins_home_strength.fit(data.loc[train_idx, ['HomeStrength']])
data['HomeStrength'] = kbins_home_strength.transform(
    data[['HomeStrength']]).astype(int).flatten()

kbins_away_strength = KBinsDiscretizer(
    n_bins=__BINS, encode='ordinal', strategy='uniform')
kbins_away_strength.fit(data.loc[train_idx, ['AwayStrength']])
data['AwayStrength'] = kbins_away_strength.transform(
    data[['AwayStrength']]).astype(int).flatten()

# setup network
#
# disc_home_avg_market_value ──┐
# disc_home_nationalities ─────┤
# disc_home_avg_age ───────────┤ → HomeStrength ────┐
# disc_home_total_minutes ─────┘                    │
#                                                   │ → MatchOutcome
# disc_away_avg_market_value ──┐                    │
# disc_away_nationalities ─────┤                    │
# disc_away_avg_age ───────────┤ → AwayStrength ────┘
# disc_away_total_minutes ─────┘
structure = [
    ('disc_home_avg_market_value', 'HomeStrength'),
    ('disc_home_nationalities', 'HomeStrength'),
    ('disc_home_avg_age', 'HomeStrength'),
    ('disc_home_total_minutes', 'HomeStrength'),

    ('disc_away_avg_market_value', 'AwayStrength'),
    ('disc_away_nationalities', 'AwayStrength'),
    ('disc_away_avg_age', 'AwayStrength'),
    ('disc_away_total_minutes', 'AwayStrength'),

    ('HomeStrength', 'MatchOutcome'),
    ('AwayStrength', 'MatchOutcome')
]

model = DiscreteBayesianNetwork(structure)

# Only the network's own variables are handed to the estimator.
binned_nodes = disc_home_cols + disc_away_cols + ['HomeStrength', 'AwayStrength']
model_columns = binned_nodes + ['MatchOutcome']

# Declare the full state space up front so that a bin which happens to be
# empty in the training split is still a valid evidence state at test time.
state_names = {node: list(range(__BINS)) for node in binned_nodes}
state_names['MatchOutcome'] = [HOME_WIN, DRAW, AWAY_WIN]

# estimate CPTs from training rows only.
# BDeu: Bayesian Dirichlet equivalent uniform prior, avoids zero probabilities.
model.fit(
    data.loc[train_idx, model_columns],
    estimator=BayesianEstimator,
    prior_type='BDeu',
    state_names=state_names,
)

infer = VariableElimination(model)

# Example query:
#   infer.query(['MatchOutcome'], evidence={'HomeStrength': 2, 'AwayStrength': 0})

# predict on the held-out rows with Bayesian network inference
data_test = data.loc[
    test_idx, disc_home_cols + disc_away_cols + ['HomeStrength', 'AwayStrength']
].copy()

# HomeStrength and AwayStrength are the only parents of MatchOutcome, so once
# both are observed the binned features add no information to the posterior.
# That leaves at most __BINS**2 distinct queries, which are cached instead of
# re-running variable elimination for every test row.
outcome_by_strength = {}
predictions = []
for home_s, away_s in zip(data_test['HomeStrength'], data_test['AwayStrength']):
    key = (int(home_s), int(away_s))
    if key not in outcome_by_strength:
        result = infer.query(
            ['MatchOutcome'],
            evidence={'HomeStrength': key[0], 'AwayStrength': key[1]})
        # Translate the argmax position into the actual state label rather
        # than assuming positions and labels coincide.
        outcome_by_strength[key] = result.state_names['MatchOutcome'][
            result.values.argmax()]
    predictions.append(outcome_by_strength[key])

# evaluate; zero_division=0 reports 0.0 (without warnings) for never-predicted classes
print(classification_report(y_test, predictions, zero_division=0))

       disc_home_avg_market_value  disc_home_nationalities  disc_home_avg_age  \
0                               0                        3                  2   
1                               0                        2                  2   
2                               0                        2                  1   
3                               0                        2                  2   
4                               0                        2                  1   
...                           ...                      ...                ...   
59600                           0                        2                  1   
59601                           0                        0                  1   
59602                           0                        0                  4   
59603                           0                        1                  2   
59604                           0                        3                  3   

       disc_home_total_minu

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'game_id': 'N', 'date': 'C', 'home_club_id': 'N', 'away_club_id': 'N', 'home_club_goals': 'N', 'away_club_goals': 'N', 'home_avg_market_value': 'N', 'home_nationalities': 'N', 'home_avg_age': 'N', 'home_total_minutes': 'N', 'home_total_goals': 'N', 'home_total_assists': 'N', 'home_total_yellow_cards': 'N', 'home_total_red_cards': 'N', 'away_avg_market_value': 'N', 'away_nationalities': 'N', 'away_avg_age': 'N', 'away_total_minutes': 'N', 'away_total_goals': 'N', 'away_total_assists': 'N', 'away_total_yellow_cards': 'N', 'away_total_red_cards': 'N', 'result': 'C', 'result_code': 'N', 'MatchOutcome': 'N', 'disc_home_avg_market_value': 'N', 'disc_home_nationalities': 'N', 'disc_home_avg_age': 'N', 'disc_home_total_minutes': 'N', 'disc_away_avg_market_value': 'N', 'disc_away_nationalities': 'N', 'disc_away_avg_age': 'N', 'disc_away_total_minutes': 'N', 'HomeStrength': 'N', 'AwayStrengt

Learned Feature Weights:
                  Feature       Weights
0  home_avg_market_value -2.220446e-16
1     home_nationalities -1.635590e-13
2           home_avg_age -3.898502e-14
3     home_total_minutes -4.064341e-14
4  away_avg_market_value -2.535009e-15
5     away_nationalities -3.318364e-13
6           away_avg_age  8.328870e-14
7     away_total_minutes  7.596308e-13
              precision    recall  f1-score   support

           0       0.46      0.98      0.63      5432
           1       0.00      0.00      0.00      2839
           2       0.50      0.05      0.09      3650

    accuracy                           0.46     11921
   macro avg       0.32      0.34      0.24     11921
weighted avg       0.36      0.46      0.31     11921



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
