In [5]:
import pandas as pd


# Load the match-level training table; later cells read the per-side attribute
# columns and the "result" column from it.
train = pd.read_csv(filepath_or_buffer="train.csv")



In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib

# --- FEATURES & TARGET ---
# Both sides of a fixture are described by the same attribute codes, so the
# feature list is generated from one code list: every "home_*" column first,
# then every "away_*" column, each in the order given here.
_attr_codes = [
    'Cor', 'Cro', 'Dri', 'Fin', 'Fir', 'Fre', 'Hea', 'Lon', 'L Th', 'Pas',
    'Pen', 'Tck', 'Tec', 'Agg', 'Ant', 'Bra', 'Cmp', 'Cnt', 'Dec', 'Det',
    'Fla', 'Ldr', 'OtB', 'Pos', 'Tea', 'Vis', 'Wor', 'Acc', 'Agi', 'Bal',
    'Jum', 'Pac', 'Nat', 'Sta', 'Str',
]

feature_cols = [
    f"{side}_{code}"
    for side in ("home", "away")
    for code in _attr_codes
]

# Feature matrix (columns in feature_cols order) and the raw string target.
X = train[feature_cols]
y = train["result"]   # contains 'W', 'D', 'L'

# --- LABEL ENCODING ---
# LabelEncoder maps the string labels to integer codes; the fitted encoder is
# kept so predictions can be decoded back to the original labels later.
le = LabelEncoder()
y_enc = le.fit_transform(y)

# --- TRAIN/TEST SPLIT (optional, but useful) ---
# Stratify on the encoded target so the 80/20 split keeps the same W/D/L
# proportions on both sides; a plain random split can under-represent a rare
# outcome in the training portion.
# NOTE: stratification needs at least two rows per class and raises
# ValueError otherwise.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_enc, test_size=0.2, random_state=42, stratify=y_enc
)

# --- RANDOM FOREST ---
# Hyperparameters are collected in one place so they are easy to review.
rf_params = {
    "n_estimators": 600,
    "max_depth": None,            # no explicit depth limit
    "random_state": 42,           # fixed seed for reproducible fits
    "class_weight": "balanced",   # helpful if W/D/L classes are imbalanced
}

# fit() returns the estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# --- SAVE MODEL ---
# Persist everything inference needs: the fitted classifier, the label encoder
# used to decode its outputs, and the ordered list of feature columns.
for _artifact, _filename in (
    (clf, "rf_model.joblib"),
    (le, "label_encoder.joblib"),
    (feature_cols, "feature_cols.joblib"),
):
    joblib.dump(_artifact, _filename)

print("Model trained & saved.")


Model trained & saved.


In [9]:
# Attribute table read from disk; it carries a "Club" column used for grouping.
df = pd.read_csv(filepath_or_buffer="attributes.csv")

# One row per club (club names become the index), holding the mean of every
# numeric column; non-numeric columns are dropped by numeric_only=True.
_by_club = df.groupby("Club")
club_avgs = _by_club.mean(numeric_only=True)





In [10]:
import numpy as np
import joblib

# Reload the persisted classifier, label encoder and feature-column list.
# joblib files are pickle-based, so only load artifacts from a trusted source.
_artifact_files = ("rf_model.joblib", "label_encoder.joblib", "feature_cols.joblib")
clf, le, feature_cols = map(joblib.load, _artifact_files)

def predict_match(home_club, away_club, club_avgs, model=None, encoder=None, columns=None):
    """Estimate outcome probabilities for a single fixture.

    Parameters
    ----------
    home_club, away_club : str
        Club names used to look up attribute averages in ``club_avgs``.
    club_avgs : pandas.DataFrame or mapping
        Either a DataFrame indexed by club name with one column per attribute
        (e.g. the result of ``groupby("Club").mean()``), or a mapping of the
        form ``{club: {attribute: value}}``.
    model, encoder, columns : optional
        Classifier exposing ``predict_proba``/``classes_``, label encoder
        exposing ``inverse_transform``, and the ordered feature-column list.
        Each falls back to the notebook-level ``clf``, ``le`` and
        ``feature_cols`` respectively when left as ``None``.

    Returns
    -------
    dict
        Decoded class label -> predicted probability for that label.

    Raises
    ------
    KeyError
        If a club is missing from ``club_avgs`` or the attributes found for
        the two clubs do not cover every required feature column.
    """
    # Resolve the globals lazily so explicit arguments never touch them.
    model = clf if model is None else model
    encoder = le if encoder is None else encoder
    columns = list(feature_cols if columns is None else columns)

    def _attributes_for(club):
        # DataFrame[...] selects COLUMNS; clubs live in the index, so a row
        # lookup must go through .loc. Plain mappings are keyed by club.
        if isinstance(club_avgs, pd.DataFrame):
            if club not in club_avgs.index:
                raise KeyError(f"Unknown club {club!r}: not found in club_avgs index")
            return club_avgs.loc[club]
        return club_avgs[club]

    wanted = set(columns)
    row = {}
    for side, club in (("home", home_club), ("away", away_club)):
        # .items() yields (attribute, value) pairs for both a Series and a
        # dict; iterating a Series directly would yield only the values.
        for attr, value in _attributes_for(club).items():
            col = f"{side}_{attr}"
            if col in wanted:
                row[col] = value

    missing = [c for c in columns if c not in row]
    if missing:
        raise KeyError(f"club_avgs lacks attributes required for features: {missing}")

    # Single-row frame with named, ordered columns: avoids the feature-name
    # mismatch warning sklearn emits when a model fitted on a DataFrame is
    # given a bare array.
    X_input = pd.DataFrame([row], columns=columns)

    # Predict probabilities
    probs = model.predict_proba(X_input)[0]

    # predict_proba columns follow model.classes_, so decode exactly those
    # codes instead of assuming the classes are [0, 1, 2].
    labels = encoder.inverse_transform(model.classes_)

    return dict(zip(labels, probs))


In [None]:
# Example fixture: "United" at home against "Liverpool", using the per-club
# attribute averages computed earlier.
probs = predict_match(
    home_club="United",
    away_club="Liverpool",
    club_avgs=club_avgs,
)
print(probs)


In [12]:
# Distinct club names in the attribute table; these are the values accepted
# as club arguments when looking up averages grouped by "Club".
df.loc[:, "Club"].unique()


array(['Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton',
       'Chelsea', 'City', 'Crystal Palace', 'Everton', 'Forest', 'Fulham',
       'Ipswich', 'Leicester', 'Liverpool', 'Newcastle', 'Southampton',
       'Tottenham', 'United', 'West Ham', 'Wolves'], dtype=object)