In [1]:
%run "../Functions/functions_v1.ipynb"

# Ingest and clean upcoming games

In [3]:
#read scraped incoming matches
current_fixtures = pd.read_csv("../Data/upcoming_matches.csv")

#Clean up the raw scraped data
current_fixtures['date'] = current_fixtures["date"].apply(convert_current_date)
current_fixtures[["elo_1", "elo_2"]] = current_fixtures["elo"].apply(lambda x: split_space(x))
current_fixtures["elo_diff_1"] = current_fixtures["elo_1"].apply(int) - current_fixtures["elo_2"].apply(int)
current_fixtures["elo_diff_2"] = -current_fixtures["elo_diff_1"]
current_fixtures = current_fixtures.drop(["rank", "elo", "win_pct"], axis = 1)
current_fixtures

Unnamed: 0,date,team1,team2,location,scrape_date,elo_1,elo_2,elo_diff_1,elo_diff_2
0,2024-07-27,El Salvador,Guatemala,Friendly in the United States,2024-07-23,1389,1485,-96,96
1,2024-09-02,Fiji,Solomon Islands,Friendly tournament in Fiji,2024-07-23,1198,1240,-42,42
2,2024-09-04,Anguilla,Turks and Caicos Islands,CONCACAF Nations League C in Anguilla,2024-07-23,599,690,-91,91
3,2024-09-04,British Virgin Islands,Cayman Islands,CONCACAF Nations League C in the British Virgi...,2024-07-23,651,825,-174,174
4,2024-09-04,US Virgin Islands,Bahamas,CONCACAF Nations League C in the US Virgin Isl...,2024-07-23,618,810,-192,192
...,...,...,...,...,...,...,...,...,...
464,2025-09-14,Bolivia,Brazil,World Cup qualifier in Bolivia,2024-07-23,1551,2021,-470,470
465,2025-09-14,Chile,Uruguay,World Cup qualifier in Chile,2024-07-23,1723,1987,-264,264
466,2025-09-14,Ecuador,Argentina,World Cup qualifier in Ecuador,2024-07-23,1871,2168,-297,297
467,2025-09-14,Peru,Paraguay,World Cup qualifier in Peru,2024-07-23,1710,1661,49,-49


# Create xG predictions

In [5]:
#Import the two models
gb_model = joblib.load('../Models/sklearn_gradient_boosted_model.pkl')
lin_reg_model = joblib.load('../Models/linreg_model.pkl')

In [6]:
#get the correct inputs for the xG Models
team_1_xg_df = current_fixtures[["elo_1", "elo_2", "elo_diff_1"]].rename(columns = {

        "elo_1":"elo",
        "elo_2":"opp_elo",
        "elo_diff_1":"elo_diff",
    })


team_2_xg_df = current_fixtures[["elo_2", "elo_1", "elo_diff_2"]].rename(columns = {

        "elo_2":"elo",
        "elo_1":"opp_elo",
        "elo_diff_2":"elo_diff",
    })


#forecast the xgs for each team based on the two different models
current_fixtures["gb_xg_1"] = gb_model.predict(team_1_xg_df)
current_fixtures["gb_xg_2"] = gb_model.predict(team_2_xg_df)

current_fixtures["lr_xg_1"] = lin_reg_model.predict(team_1_xg_df)
current_fixtures["lr_xg_2"] = lin_reg_model.predict(team_2_xg_df)

# Create T1 / D / T2 Probabilities

In [8]:
#get ELO probabilities
elo_historical_average = pd.read_csv('../Models/elo_win_rates.csv')
current_fixtures["elo_probs"] = current_fixtures.apply(lambda row: get_historical_elo(row['elo_diff_1'], elo_historical_average), axis = 1)

#Get implied odds by using either the lr or gb model to forecast xG, and then simulate 500 games using poisson simulations
current_fixtures['lr_probs'] = current_fixtures.apply(lambda row: possion_sim(row['lr_xg_1'], row['lr_xg_2'], 500), axis=1)
current_fixtures['gb_probs'] = current_fixtures.apply(lambda row: possion_sim(row['gb_xg_1'], row['gb_xg_2'], 500), axis=1)

In [9]:
current_fixtures[["date", "team1", "team2", "elo_probs", "lr_probs", "gb_probs"]]

Unnamed: 0,date,team1,team2,elo_probs,lr_probs,gb_probs
0,2024-07-27,El Salvador,Guatemala,"[0.2847682119205298, 0.2947019867549669, 0.420...","[0.24, 0.26, 0.5]","[0.476, 0.232, 0.292]"
1,2024-09-02,Fiji,Solomon Islands,"[0.314569536423841, 0.3096026490066225, 0.3758...","[0.342, 0.242, 0.416]","[0.194, 0.554, 0.252]"
2,2024-09-04,Anguilla,Turks and Caicos Islands,"[0.2847682119205298, 0.2947019867549669, 0.420...","[0.314, 0.2, 0.486]","[0.48, 0.256, 0.264]"
3,2024-09-04,British Virgin Islands,Cayman Islands,"[0.2559139784946236, 0.1870967741935484, 0.556...","[0.214, 0.216, 0.57]","[0.172, 0.218, 0.61]"
4,2024-09-04,US Virgin Islands,Bahamas,"[0.2559139784946236, 0.1870967741935484, 0.556...","[0.166, 0.208, 0.626]","[0.252, 0.196, 0.552]"
...,...,...,...,...,...,...
464,2025-09-14,Bolivia,Brazil,"[0.1347826086956521, 0.0260869565217391, 0.839...","[0.004, 0.108, 0.888]","[0.098, 0.094, 0.808]"
465,2025-09-14,Chile,Uruguay,"[0.2315950920245398, 0.1134969325153374, 0.654...","[0.074, 0.202, 0.724]","[0.024, 0.184, 0.792]"
466,2025-09-14,Ecuador,Argentina,"[0.2315950920245398, 0.1134969325153374, 0.654...","[0.086, 0.192, 0.722]","[0.374, 0.314, 0.312]"
467,2025-09-14,Peru,Paraguay,"[0.3124484748557296, 0.3759274525968673, 0.311...","[0.424, 0.286, 0.29]","[0.224, 0.224, 0.552]"


# Ensemble the results
Since the Gradient boosted regression preforms worse than the linear regression, the ensemble predictions will be just based on a 50 / 50 split of the elo forecast and linear regression based poisson simulation

In [11]:
#In this function we take the average of the T1 / D / T2 probabilities from the linear regression model and the ELO model
current_fixtures["ensemble_preds"] = current_fixtures.apply(lambda row: calculate_average(row['lr_probs'], row['elo_probs']), axis=1)

#we rebased the aveage so that all probabilities sum to 1
current_fixtures["ensemble_preds"] = current_fixtures["ensemble_preds"].apply(rebase_estimate)

In [12]:
#Parse out the probabilities into their own seperate columns
current_fixtures[['team_1_w', 'draw', 'team_2_w']] = pd.DataFrame(current_fixtures['ensemble_preds'].tolist(), index=current_fixtures.index)

# Clean up Final Results to get a

In [13]:
to_publish = current_fixtures[["date", "location", "team1", "team2", 'team_1_w', 'draw', 'team_2_w']]
to_publish["game"] = current_fixtures["team1"] + " vs " + current_fixtures["team2"]
to_publish["team_1"] = to_publish["team_1_w"].apply(odds_round)
to_publish["draw"] = to_publish["draw"].apply(odds_round)
to_publish["team_2"] = to_publish["team_2_w"].apply(odds_round)
to_publish = to_publish[["date", "location", "game", "team_1", "draw", "team_2"]]
to_publish

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_publish["game"] = current_fixtures["team1"] + " vs " + current_fixtures["team2"]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_publish["team_1"] = to_publish["team_1_w"].apply(odds_round)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  to_publish["draw"] = to_publish["draw"].apply(odds_round)

Unnamed: 0,date,location,game,team_1,draw,team_2
0,2024-07-27,Friendly in the United States,El Salvador vs Guatemala,0.26,0.28,0.46
1,2024-09-02,Friendly tournament in Fiji,Fiji vs Solomon Islands,0.33,0.28,0.40
2,2024-09-04,CONCACAF Nations League C in Anguilla,Anguilla vs Turks and Caicos Islands,0.30,0.25,0.45
3,2024-09-04,CONCACAF Nations League C in the British Virgi...,British Virgin Islands vs Cayman Islands,0.23,0.20,0.56
4,2024-09-04,CONCACAF Nations League C in the US Virgin Isl...,US Virgin Islands vs Bahamas,0.21,0.20,0.59
...,...,...,...,...,...,...
464,2025-09-14,World Cup qualifier in Bolivia,Bolivia vs Brazil,0.07,0.07,0.86
465,2025-09-14,World Cup qualifier in Chile,Chile vs Uruguay,0.15,0.16,0.69
466,2025-09-14,World Cup qualifier in Ecuador,Ecuador vs Argentina,0.16,0.15,0.69
467,2025-09-14,World Cup qualifier in Peru,Peru vs Paraguay,0.37,0.33,0.30


In [25]:
to_publish.to_csv('../Data/latest_odds.csv', index=False) #save current version