In [4]:
import sys
import platform

# This will show you the exact path to the Python executable being used
print(f"Executable: {sys.executable}")

# This will confirm the architecture
print(f"Architecture: {platform.architecture()[0]}")

Executable: /Users/lukemossbarger/Downloads/Northwestern Sophomore Year/DL Trading/2026traderintern/venv/bin/python
Architecture: 64bit


In [42]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv('soccermatches.csv')

params = {
    'objective' : 'multi:softprob',
    'num_class' : 3
}

X = df.drop(columns=["Hwins","Awins","IsDraw","goals2H","Hgoals","Agoals","Hgoals1H","Agoals1H"])

y_extract = df[['Hwins', 'Awins', 'IsDraw']]
label_map = {'Hwins': 0, 'Awins': 1, 'IsDraw': 2}

y = y_extract.idxmax(axis=1).map(label_map)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

dtrain = xgb.DMatrix(X_train, label=y_train)

bst = xgb.train(params, dtrain)

dtest = xgb.DMatrix(X_test)
y_pred_probs = bst.predict(dtest)

print("Predicted Probabilities (HWin, AWin, Draw) for first 5 games:")
print(y_pred_probs[:5])

y_pred_labels = np.argmax(y_pred_probs, axis=1)

accuracy = accuracy_score(y_test, y_pred_labels)
print(f"\nAccuracy: {accuracy:.4f}")

Predicted Probabilities (HWin, AWin, Draw) for first 5 games:
[[0.49769008 0.21414854 0.28816134]
 [0.36347178 0.3062131  0.3303151 ]
 [0.55740637 0.16037448 0.2822191 ]
 [0.3829631  0.32602882 0.2910081 ]
 [0.5069728  0.15753654 0.33549067]]

Accuracy: 0.4750


In [45]:
import pandas as pd
import numpy as np

df = pd.read_csv('soccermatches.csv')
df['num_goals'] = df['Agoals'] + df['Hgoals']
df2 = pd.read_csv('soccermatches2.csv')
df2['num_goals'] = df2['Agoals'] + df2['Hgoals']
max_goals_1 = df['num_goals'].max()
max_goals_2 = df2['num_goals'].max()
print(max(max_goals_1, max_goals_2))

12


In [7]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

def tune():
    # 1. Load and Prepare Data
    df = pd.read_csv('soccermatches.csv')

    X_raw = df.drop(columns=["Hwins","Awins","IsDraw","goals2H","Hgoals","Agoals", "Hgoals1H", "Agoals1H"])
    X = X_raw.select_dtypes(include=np.number)

    y_extract = df[['Hwins', 'Awins', 'IsDraw']]
    label_map = {'Hwins': 0, 'Awins': 1, 'IsDraw': 2}
    y = y_extract.idxmax(axis=1).map(label_map)

    # 2. Define Parameter Grid
    param_grid = {
        'n_estimators': [100, 250, 500],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.05, 0.1, 0.2],
        'subsample': [0.8, 1.0]
    }

    # 3. Set Up Model and Grid Search
    xgb_model = xgb.XGBClassifier(objective='count:poisson', use_label_encoder=False, eval_metric='mlogloss')

    grid_search = GridSearchCV(
        estimator=xgb_model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
        verbose=1
    )

    # 4. Run Tuning
    grid_search.fit(X, y)

    # 5. Print Results
    print(f"Best Hyperparameters: {grid_search.best_params_}")
    print(f"Best Cross-Validation Accuracy: {grid_search.best_score_:.4f}")

tune()

Fitting 5 folds for each of 54 candidates, totalling 270 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best Hyperparameters: {'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best Cross-Validation Accuracy: 0.4892


In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.linear_model import PoissonRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

# 1. Load and Prepare Data
df = pd.read_csv('soccermatches.csv')

X_raw = df.drop(columns=["Hwins","Awins","IsDraw","goals2H","Hgoals","Agoals","Hgoals1H","Agoals1H"])
X = X_raw.select_dtypes(include=np.number)
y = df['Agoals'] + df['Hgoals']

# 2. Create a Pipeline
# This correctly chains the imputer and the model together for cross-validation.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('regressor', PoissonRegressor(max_iter=10000))
])

# 3. Define the Parameter Grid
# We tune the 'alpha' (regularization strength) of the regressor.
# The '__' syntax targets a parameter within a pipeline step.
param_grid = {
    'regressor__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# 4. Set Up and Run Grid Search
# 'scoring' is 'neg_mean_absolute_error' because GridSearchCV maximizes a score.
# Maximizing negative MAE is the same as minimizing MAE.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X, y)

# 5. Print the Results
print(f"Best Hyperparameters: {grid_search.best_params_}")
print(f"Best Cross-Validation MAE: {-grid_search.best_score_:.4f}")

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best Hyperparameters: {'regressor__alpha': 0.1}
Best Cross-Validation MAE: 1.2985


In [18]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import PoissonRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.metrics import r2_score
import optuna
from itertools import combinations

In [22]:
# 1. Load and prepare the full dataset
df = pd.read_csv('soccermatches.csv')
df['AGPG_TY'] = df['AG_TY'] / df['AM_TY']
df['HGPG_TY'] = df['HG_TY'] / df['HM_TY']
df['AGPG_LY'] = df['AG_LY'] / df['AM_LY']
df['HGPG_LY'] = df['HG_LY'] / df['HM_LY']

df['AGD_TY_LY'] = df['AG_TY'] - df['AG_LY']
df['HGD_TY_LY'] = df['HG_TY'] - df['HG_LY']

features_to_interact = [
    "HG_LY","HGA_LY","HM_LY","AG_LY","AGA_LY","AM_LY",
    "HG_TY","HGA_TY","HM_TY","AG_TY","AGA_TY","AM_TY"
]

# 2. Generate all unique pairs and create new features
for feature1, feature2 in combinations(features_to_interact, 2):
    # Create a descriptive name for the new column
    new_col_name = f"{feature1}_x_{feature2}"
    
    # Multiply the pair of features to create the new column
    df[new_col_name] = df[feature1] * df[feature2]

drop_cols = ["Hwins","Awins","IsDraw","goals2H","Hgoals","Agoals","Hgoals1H","Agoals1H"]
X = df.drop(columns=drop_cols, errors='ignore').select_dtypes(include=np.number)
y = df['Hgoals'] + df['Agoals']


# Split data once into a training set for tuning and a final test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
def num_goals_preds_optuna():

    # 2. Define the objective function for Optuna
    def objective(trial):
        """
        This function takes a trial, suggests hyperparameters,
        trains a model, and returns its cross-validated MAE.
        """
        # Define the search space for hyperparameters
        params = {
            'objective': 'reg:squarederror',
            'random_state': 42,
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
            'max_depth': trial.suggest_int('max_depth', 3, 10),
            #'subsample': trial.suggest_float('subsample', 0.6, 1.0),
            #'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
            #'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 1.0, log=True) # L2 regularization
        }

        model = xgb.XGBRegressor(**params)

        # Evaluate the model using cross-validation
        score = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_absolute_error', n_jobs=-1)
        # Optuna minimizes, so we return the positive MAE
        return -score.mean()

    # 3. Create and run the Optuna study
    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=50) # Run 50 trials to find the best params
    return study

In [28]:
study = num_goals_preds_optuna()

[I 2025-10-13 20:00:08,923] A new study created in memory with name: no-name-9ce25412-4386-4e6f-92b1-1d3ed1591325
[I 2025-10-13 20:00:13,468] Trial 0 finished with value: 1.3199456214904786 and parameters: {'n_estimators': 515, 'learning_rate': 0.02742124245227122, 'max_depth': 6}. Best is trial 0 with value: 1.3199456214904786.
[I 2025-10-13 20:00:20,730] Trial 1 finished with value: 1.339887309074402 and parameters: {'n_estimators': 538, 'learning_rate': 0.04308999344301347, 'max_depth': 7}. Best is trial 0 with value: 1.3199456214904786.
[I 2025-10-13 20:00:21,931] Trial 2 finished with value: 1.3063752889633178 and parameters: {'n_estimators': 485, 'learning_rate': 0.041082264914774805, 'max_depth': 3}. Best is trial 2 with value: 1.3063752889633178.
[I 2025-10-13 20:00:50,907] Trial 3 finished with value: 1.365479826927185 and parameters: {'n_estimators': 934, 'learning_rate': 0.06809327750582593, 'max_depth': 9}. Best is trial 2 with value: 1.3063752889633178.
[I 2025-10-13 20:00

In [29]:
print("Study finished!")
print(f"Best trial's MAE: {study.best_value:.4f}")
print("Best hyperparameters found:")
print(study.best_params)

# 4. Train the final model with the best hyperparameters
best_params = study.best_params
final_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42, **best_params)

final_model.fit(X_train, y_train)

# 5. Evaluate the final model on the unseen test set
y_pred = final_model.predict(X_test)
final_mae = mean_absolute_error(y_test, y_pred)
final_r2 = r2_score(y_test, y_pred)

print("\n--- Final Model Performance on Test Set ---")
print(f"MAE: {final_mae:.4f}")
print(f"R2 Score: {final_r2:.4f}")

Study finished!
Best trial's MAE: 1.3022
Best hyperparameters found:
{'n_estimators': 317, 'learning_rate': 0.030369227466089777, 'max_depth': 3}

--- Final Model Performance on Test Set ---
MAE: 1.3114
R2 Score: 0.0325


In [24]:
final_model.feature_importances_

array([0.01325375, 0.00726338, 0.01175765, 0.00675348, 0.0101836 ,
       0.01456332, 0.0112119 , 0.00807254, 0.01039486, 0.01582178,
       0.01058504, 0.01321434, 0.00856185, 0.01035517, 0.00761774,
       0.00564191, 0.01119582, 0.0207424 , 0.01497057, 0.0266114 ,
       0.01076366, 0.01078389, 0.00927387, 0.01199676, 0.00928864,
       0.01345425, 0.00926368, 0.01476245, 0.01092656, 0.01004674,
       0.00866152, 0.01306356, 0.01031584, 0.00974956, 0.00884903,
       0.00968245, 0.01097511, 0.01299638, 0.01024641, 0.00971518,
       0.00899831, 0.01081002, 0.01036504, 0.01087699, 0.0107192 ,
       0.01200688, 0.00918987, 0.01089194, 0.01296311, 0.00992589,
       0.01290701, 0.01193145, 0.01218913, 0.01112413, 0.01056786,
       0.01063012, 0.00884005, 0.01586586, 0.01194403, 0.00955223,
       0.01047268, 0.01177449, 0.00940214, 0.01116708, 0.01073733,
       0.01178944, 0.01184787, 0.0097262 , 0.00984081, 0.01249218,
       0.01162551, 0.01006279, 0.01138041, 0.00922073, 0.01096

In [25]:
final_model.feature_names_in_

array(['HG_LY', 'HGA_LY', 'HM_LY', 'AG_LY', 'AGA_LY', 'AM_LY', 'HG_TY',
       'HGA_TY', 'HM_TY', 'AG_TY', 'AGA_TY', 'AM_TY', 'Hrelegated',
       'Hpromoted', 'Arelegated', 'Apromoted', 'AGPG_TY', 'HGPG_TY',
       'AGPG_LY', 'HGPG_LY', 'AGD_TY_LY', 'HGD_TY_LY', 'HG_LY_x_HGA_LY',
       'HG_LY_x_HM_LY', 'HG_LY_x_AG_LY', 'HG_LY_x_AGA_LY',
       'HG_LY_x_AM_LY', 'HG_LY_x_HG_TY', 'HG_LY_x_HGA_TY',
       'HG_LY_x_HM_TY', 'HG_LY_x_AG_TY', 'HG_LY_x_AGA_TY',
       'HG_LY_x_AM_TY', 'HGA_LY_x_HM_LY', 'HGA_LY_x_AG_LY',
       'HGA_LY_x_AGA_LY', 'HGA_LY_x_AM_LY', 'HGA_LY_x_HG_TY',
       'HGA_LY_x_HGA_TY', 'HGA_LY_x_HM_TY', 'HGA_LY_x_AG_TY',
       'HGA_LY_x_AGA_TY', 'HGA_LY_x_AM_TY', 'HM_LY_x_AG_LY',
       'HM_LY_x_AGA_LY', 'HM_LY_x_AM_LY', 'HM_LY_x_HG_TY',
       'HM_LY_x_HGA_TY', 'HM_LY_x_HM_TY', 'HM_LY_x_AG_TY',
       'HM_LY_x_AGA_TY', 'HM_LY_x_AM_TY', 'AG_LY_x_AGA_LY',
       'AG_LY_x_AM_LY', 'AG_LY_x_HG_TY', 'AG_LY_x_HGA_TY',
       'AG_LY_x_HM_TY', 'AG_LY_x_AG_TY', 'AG_LY_x_AGA_TY',
