In [None]:
from pgmpy.models import DiscreteBayesianNetwork
from pgmpy.estimators import BayesianEstimator
from pgmpy.inference import VariableElimination
import pandas as pd
from sklearn.preprocessing import KBinsDiscretizer

# Load the cleaned match dataset.
data = pd.read_csv('../../cleaned_data/final_dataset.csv')

# Encode the textual match result as an integer category. The codes are
# stored twice: in 'result_code' and in 'MatchOutcome', the column name used
# for the network's outcome node. Labels missing from the mapping become NaN
# (standard behaviour of Series.map).
outcome_codes = {'Home Win': 0, 'Draw': 1, 'Away Win': 2}
data['result_code'] = data['result'].map(outcome_codes)
data['MatchOutcome'] = data['result_code']

# Bin the continuous per-team features into ordinal categories so they can be
# used as discrete variables.
features_home = [
    'home_avg_market_value',
    'home_nationalities',
    'home_avg_age',
    'home_total_minutes',
]
features_away = [
    'away_avg_market_value',
    'away_nationalities',
    'away_avg_age',
    'away_total_minutes',
]

# One five-bin, equal-width ('uniform') discretizer per side; each is fitted
# on its own side's columns only.
kbins_home = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
kbins_away = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')

# fit_transform returns floats; cast to int to get plain bin indices.
data_discrete_home = kbins_home.fit_transform(data[features_home]).astype(int)
data_discrete_away = kbins_away.fit_transform(data[features_away]).astype(int)

# Write each binned feature back to the frame as a 'disc_<feature>' column.
# Transposing the result yields one row per feature, in the same order as the
# feature list it was fitted on.
for col, binned in zip(features_home, data_discrete_home.T):
    data[f'disc_{col}'] = binned

for col, binned in zip(features_away, data_discrete_away.T):
    data[f'disc_{col}'] = binned

# Team strength: for now an unweighted score, i.e. the plain sum of a side's
# binned observable features. The sum is then re-discretized into five
# equal-width bins so that the strength columns are clean discrete variables.
for strength_col, feature_cols in (
    ('HomeStrength', features_home),
    ('AwayStrength', features_away),
):
    binned_cols = [f'disc_{col}' for col in feature_cols]
    data[strength_col] = data[binned_cols].sum(axis=1)

    # A fresh discretizer per strength column, fitted on that column alone.
    rebinner = KBinsDiscretizer(
        n_bins=5, encode='ordinal', strategy='uniform')
    rebinned = rebinner.fit_transform(data[[strength_col]])
    data[strength_col] = rebinned.astype(int).flatten()

# Network layout: the binned observables of each side feed that side's
# strength node, and the two strength nodes feed the match outcome.
"""
disc_home_avg_market_value ──┐
disc_home_nationalities ─────┤
disc_home_avg_age ───────────┤ → HomeStrength ────┐
disc_home_total_minutes ─────┘                    │
                                                  │ → MatchOutcome
disc_away_avg_market_value ──┐                    │
disc_away_nationalities ─────┤                    │
disc_away_avg_age ───────────┤ → AwayStrength ────┘
disc_away_total_minutes ─────┘
"""
# Parents of each strength node, listed in edge order.
strength_parents = {
    'HomeStrength': [
        'disc_home_avg_market_value',
        'disc_home_nationalities',
        'disc_home_avg_age',
        'disc_home_total_minutes',
    ],
    'AwayStrength': [
        'disc_away_avg_market_value',
        'disc_away_nationalities',
        'disc_away_avg_age',
        'disc_away_total_minutes',
    ],
}

# Directed edges as (parent, child) pairs: observable -> strength first,
# then strength -> outcome.
structure = [
    (observable, strength)
    for strength, observables in strength_parents.items()
    for observable in observables
]
structure.extend(
    (strength, 'MatchOutcome') for strength in strength_parents
)

# Build the discrete Bayesian network from the edge list.
model = DiscreteBayesianNetwork(structure)

# Estimate the CPTs with a BDeu (Bayesian Dirichlet equivalent uniform)
# prior, which adds pseudo-counts so that unseen combinations do not end up
# with zero probability.
#
# Only the columns that are nodes of the network are handed to the
# estimator. The full frame also carries ids, dates and the raw
# (undiscretized) statistics; pgmpy would otherwise run its datatype/state
# inference over every one of them.
# NOTE(review): depending on the pgmpy version, rows with NaN in *any*
# supplied column may be dropped before counting, so passing unrelated
# columns could silently shrink the training set - confirm for the
# installed version.
model_columns = list(model.nodes())
model.fit(data[model_columns], estimator=BayesianEstimator,
          prior_type='BDeu')

# Exact inference by variable elimination.
infer = VariableElimination(model)

# Example query: distribution of the match outcome given the binned
# observables of both teams. Evidence values are bin indices as stored in
# the 'disc_*' columns; HomeStrength and AwayStrength are left unobserved.
query_result = infer.query(['MatchOutcome'], evidence={
    'disc_home_avg_market_value': 2,
    'disc_home_nationalities': 1,
    'disc_home_avg_age': 2,
    'disc_home_total_minutes': 1,
    'disc_away_avg_market_value': 1,
    'disc_away_nationalities': 2,
    'disc_away_avg_age': 1,
    'disc_away_total_minutes': 1,
})

print(query_result)

INFO:pgmpy: Datatype (N=numerical, C=Categorical Unordered, O=Categorical Ordered) inferred from data: 
 {'game_id': 'N', 'date': 'C', 'home_club_id': 'N', 'away_club_id': 'N', 'home_club_goals': 'N', 'away_club_goals': 'N', 'home_avg_market_value': 'N', 'home_nationalities': 'N', 'home_avg_age': 'N', 'home_total_minutes': 'N', 'home_total_goals': 'N', 'home_total_assists': 'N', 'home_total_yellow_cards': 'N', 'home_total_red_cards': 'N', 'away_avg_market_value': 'N', 'away_nationalities': 'N', 'away_avg_age': 'N', 'away_total_minutes': 'N', 'away_total_goals': 'N', 'away_total_assists': 'N', 'away_total_yellow_cards': 'N', 'away_total_red_cards': 'N', 'result': 'C', 'result_code': 'N', 'MatchOutcome': 'N', 'disc_home_avg_market_value': 'N', 'disc_home_nationalities': 'N', 'disc_home_avg_age': 'N', 'disc_home_total_minutes': 'N', 'disc_away_avg_market_value': 'N', 'disc_away_nationalities': 'N', 'disc_away_avg_age': 'N', 'disc_away_total_minutes': 'N', 'HomeStrength': 'N', 'AwayStrengt

+-----------------+---------------------+
| MatchOutcome    |   phi(MatchOutcome) |
+=================+=====================+
| MatchOutcome(0) |              0.4443 |
| MatchOutcome(1) |              0.2119 |
+-----------------+---------------------+
| MatchOutcome(2) |              0.3438 |
+-----------------+---------------------+
