## Fantasy Score Predictor

# Data Preprocessing

In [27]:
import pandas as pd
import numpy as np

matches = pd.read_csv("Data/full_match_dat_all_countries.csv")
ratings = pd.read_csv("Data/team_ratings.csv")
ratings = ratings[pd.to_numeric(ratings["global_rank"], errors="coerce").notna()]

# Keep only needed columns
ratings = ratings[["team_name", "elo_rating"]]
matches = matches.merge(
    ratings.rename(columns={"team_name": "Country", "elo_rating": "team_elo"}),
    on="Country", how="left")
matches = matches.merge(
    ratings.rename(columns={"team_name": "Opp Team", "elo_rating": "opp_elo"}),
    on="Opp Team",
    how="left"
)

In [35]:
matches.columns


Index(['Country', 'Date', 'Player', 'Mins', 'Opp Team', 'Pos', 'Gls', 'Ast',
       'PKwon', 'PK', 'PKatt', 'Shots', 'SoT', 'TklW', 'CrdY', 'CrdR', 'OG',
       'GA', 'Saves', 'Save%', 'PK Non SO Conceded', 'PK SO scorer',
       'GK PK SO saves', 'team_elo', 'opp_elo', 'Mins per 90', 'Gls_per_90',
       'Ast_per_90', 'Shots_per_90', 'SoT_per_90', 'CrdY_per_90',
       'CrdR_per_90', 'GA_per_90', 'Saves_per_90', 'Fantasy Points'],
      dtype='object')

Shortening the list of Positions in my dataset to just be FWD, MID, DEF, GK

FWD = Striker, Winger, Attacking Midfield

MID = LM, RM, CM, DM

DEF = CB, WB, LB, RB

GK = Goalkeeper

If they played multiple I looked at the specific player and made a judgment call based on their primary position

In [29]:
np.unique(matches.loc[:,'Pos'])

array(['AM', 'AM,CM', 'AM,DM', 'AM,FW', 'AM,FW,RW', 'AM,LM', 'AM,LW',
       'AM,LW,CM', 'AM,LW,RW', 'AM,RW', 'AM,RW,RM', 'AM,WB', 'CB',
       'CB,LB', 'CB,LM', 'CB,RB', 'CM', 'CM,AM', 'CM,AM,LM', 'CM,CB',
       'CM,DM', 'CM,DM,LM', 'CM,FW', 'CM,LB', 'CM,LM', 'CM,LM,LW',
       'CM,RB,DM', 'CM,RM', 'CM,RW', 'DF', 'DF,FW', 'DF,MF', 'DM',
       'DM,AM', 'DM,CB', 'DM,CM', 'DM,LM', 'DM,RB', 'DM,RM', 'DM,RW',
       'FW', 'FW,AM', 'FW,CB', 'FW,CM', 'FW,LM', 'FW,LW', 'FW,LW,RW',
       'FW,MF', 'FW,RB', 'FW,RM', 'FW,RW', 'GK', 'LB', 'LB,CB', 'LB,FW',
       'LB,LM', 'LB,LW', 'LB,RB', 'LB,RM', 'LB,WB', 'LB,WB,LM', 'LM',
       'LM,AM', 'LM,CM', 'LM,DM', 'LM,FW', 'LM,LB', 'LM,LW', 'LM,RM',
       'LM,RM,CM', 'LM,RW', 'LM,RW,AM', 'LM,RW,RM', 'LW', 'LW,AM',
       'LW,CM', 'LW,CM,DM', 'LW,CM,LM', 'LW,FW', 'LW,LB', 'LW,LB,WB',
       'LW,LM', 'LW,RM', 'LW,RM,LM', 'LW,RM,RW', 'LW,RW', 'LW,RW,RM',
       'MF', 'RB', 'RB,CB', 'RB,LB', 'RB,RM', 'RB,RW', 'RB,WB', 'RM',
       'RM,AM,LM', 'RM,CM', '

In [36]:
matches.loc[matches['Pos']=='AM','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,CM','Pos'] = "MID"
matches.loc[matches['Pos']=='AM,DM','Pos'] = "MID"
matches.loc[matches['Pos']=='AM,FW','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,FW,RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='AM,LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,LW,CM','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,LW,RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,RW,RM','Pos'] = "FWD"
matches.loc[matches['Pos']=='AM,WB','Pos'] = "MID"
matches.loc[matches['Pos']=='CB','Pos'] = "DEF"
matches.loc[matches['Pos']=='CB,LB','Pos'] = "DEF"
matches.loc[matches['Pos']=='CB,LM','Pos'] = "DEF"
matches.loc[matches['Pos']=='CB,RB','Pos'] = "DEF"
matches.loc[matches['Pos']=='CM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,AM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,AM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,CB','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,DM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,DM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,FW','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,LB','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,LM,LW','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,RB,DM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,RM','Pos'] = "MID"
matches.loc[matches['Pos']=='CM,RW','Pos'] = "MID"
matches.loc[matches['Pos']=='DF','Pos'] = "DEF"
matches.loc[matches['Pos']=='DF,FW','Pos'] = "DEF"
matches.loc[matches['Pos']=='DF,MF','Pos'] = "DEF"
matches.loc[matches['Pos']=='DM','Pos'] = "MID"
matches.loc[matches['Pos']=='DM,CB','Pos'] = "MID"
matches.loc[matches['Pos']=='DM,CM','Pos'] = "MID"
matches.loc[matches['Pos']=='DM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='DM,RB','Pos'] = "DEF"
matches.loc[matches['Pos']=='DM,RM','Pos'] = "MID"
matches.loc[matches['Pos']=='DM,RW','Pos'] = "MID"
matches.loc[matches['Pos']=='DM,AM','Pos'] = "MID"
matches.loc[matches['Pos']=='FW','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,AM','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,CB','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,CM','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,LM','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,LW,RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,MF','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,RB','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,RM','Pos'] = "FWD"
matches.loc[matches['Pos']=='FW,RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='LB','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,CB','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,FW','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,LM','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,LW','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,RB','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,RM','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,WB','Pos'] = "DEF"
matches.loc[matches['Pos']=='LB,WB,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='LB,WB,RM','Pos'] = "DEF"
matches.loc[matches['Pos']=='LM','Pos'] = "MID"
matches.loc[matches['Pos']=='LM,AM','Pos'] = "MID"
matches.loc[matches['Pos']=='LM,CM','Pos'] = "MID"
matches.loc[matches['Pos']=='LM,DM','Pos'] = "MID"
matches.loc[matches['Pos']=='LM,LB','Pos'] = "MID"
matches.loc[matches['Pos']=='LM,LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='LM,FW','Pos'] = "FWD"
matches.loc[matches['Pos']=='LM,RM','Pos'] = "MID"
matches.loc[matches['Pos']=='LM,RM,CM','Pos'] = "MID"
matches.loc[matches['Pos']=='LM,RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='LM,RW,AM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LM,RW,RM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,AM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,CM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,CM,DM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,CM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='LW,RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,RW,RM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,FW','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,LB,WB','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,LM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,LB','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,RM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,RM,LM','Pos'] = "FWD"
matches.loc[matches['Pos']=='LW,RM,RW','Pos'] = "MID"
matches.loc[matches['Pos']=='MF','Pos'] = "MID"
matches.loc[matches['Pos']=='RB','Pos'] = "DEF"
matches.loc[matches['Pos']=='RB,CB','Pos'] = "DEF"
matches.loc[matches['Pos']=='RB,LB','Pos'] = "DEF"
matches.loc[matches['Pos']=='RB,RM','Pos'] = "DEF"
matches.loc[matches['Pos']=='RB,RW','Pos'] = "DEF"
matches.loc[matches['Pos']=='RB,WB','Pos'] = "DEF"
matches.loc[matches['Pos']=='RM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,AM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,CM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,CM,AM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,DM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,FW','Pos'] = "FWD"
matches.loc[matches['Pos']=='RM,FW,LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='RM,LB','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,LM,FW','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,LW','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,LW,RW','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,RB','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,RW','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,RW,CM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,RW,LM','Pos'] = "MID"
matches.loc[matches['Pos']=='RM,RW,LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,AM','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,AM,DM','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,CM','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,FW','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,FW,LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,LB,RM','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,LM','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,LW','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,LW,LM','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,RB','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,RB,AM','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,RB,LB','Pos'] = "FWD"
matches.loc[matches['Pos']=='RW,RM','Pos'] = "FWD"
matches.loc[matches['Pos']=='WB','Pos'] = "DEF"
matches.loc[matches['Pos']=='WB,AM','Pos'] = "FWD"
matches.loc[matches['Pos']=='WB,CB','Pos'] = "DEF"
matches.loc[matches['Pos']=='WB,CM,RB','Pos'] = "DEF"
matches.loc[matches['Pos']=='WB,LB','Pos'] = "DEF"
matches.loc[matches['Pos']=='WB,RB','Pos'] = "DEF"

In [37]:
np.unique(matches.loc[:,'Pos'])

array(['DEF', 'FWD', 'GK', 'MID'], dtype=object)

Creating the per 90 metrics

In [32]:
matches['Mins per 90'] = matches['Mins'] / 90

stats_to_normalize = ['Gls', 'Ast', 'Shots', 'SoT', 'CrdY', 'CrdR', 'GA', 'Saves']

for stat in stats_to_normalize:
    if stat in matches.columns:
        matches[f'{stat}_per_90'] = matches[stat] / matches['Mins per 90']


Calculating the fantasy points each match

In [33]:
fantasy_pt = {
    'App':1,
    'Min':1,
    'Ast':3,
    'CrdY':-1,
    'CrdR':-3,
    'OG':-2,
    'PK Won':2,
    #Scrape PK conceded
    'Pen Con':-1,
    'GK DEF Clean':5,
    'Add Gls Con':-1,
    'GK Gls':9,
    'Pen Save':3,
    'Every 3 Saves':1,
    'DEF Gls':7,
    'MID Clean':1,
    'MID Gls':6,
    'Every 3 Tkls':1,
    'FWD Gls':5,
    'Every 2 FWD SoT':1
}

fant_pts = []

for i in range(len(matches)):
    pt = 0
    player = matches.loc[i,:]
    
    if player['Mins'] > 0:
        pt += fantasy_pt['App']
    if player['Mins'] >= 60:
        pt += fantasy_pt['Min']
    if player['Ast'] > 0:
        pt += player['Ast']*fantasy_pt['Ast']
    if player['CrdY'] > 0:
        pt += player['CrdY']*fantasy_pt['CrdY']
    if player['CrdR'] > 0:
        pt += player['CrdR']*fantasy_pt['CrdR']
    if player['OG'] > 0:
        pt += player['OG']*fantasy_pt['OG']
    if player['PKwon'] > 0:
        pt += player['PKwon']*fantasy_pt['PK Won']
    if player['PK Non SO Conceded'] > 0:
        pt += player['PK Non SO Conceded']*fantasy_pt['Pen Con']
    if player['GA'] == 0 and (player['Pos'] == 'GK' or player['Pos'] == 'DEF'):
        pt += fantasy_pt['GK DEF Clean']
    if player['GA'] >= 1:
        pt += (player['GA']-1)*fantasy_pt['Add Gls Con']
    if player['Pos'] == 'GK' and player['Gls'] >= 1:
        pt += player['Gls']*fantasy_pt['GK Gls']
    if player['Pos'] == 'GK':
        pt += (player['Saves']-player['PK Non SO Conceded'])*fantasy_pt['Pen Save']
    if player['Saves'] >= 3:
        saves_by3 = player['Saves']%3
        pt += saves_by3 * fantasy_pt['Every 3 Saves']
    if player['Pos'] == 'DEF' and player['Gls'] > 0:
        pt += player['Gls']*fantasy_pt['DEF Gls']   
    if player['Pos'] == 'MID' and player['GA'] == 0:
        pt += fantasy_pt['MID Clean']
    if player['Pos'] == 'MID' and player['Gls'] > 0:
        pt += player['Gls']*fantasy_pt['MID Gls']
    if player['Pos'] == 'FWD' and player['Gls'] > 0:
        pt += player['Gls']*fantasy_pt['FWD Gls']
    if player['Pos'] == 'FWD' and player['SoT'] >= 2:
        sot_by2 = player['SoT']%2
        pt += sot_by2*fantasy_pt['Every 2 FWD SoT']
    fant_pts.append(pt)
    
matches['Fantasy Points'] = fant_pts

Add opponent strength metric and some lagging average stat to account for form of the player

In [None]:
team_ratings = pd.read_csv('Data/team_ratings.csv')

team_ratings = team_ratings.drop('team_name',axis=1)

team_ratings = team_ratings.iloc[1:].reset_index(drop=True)

columns = ['global_rank', 'team_code', 'team_name', 'elo_rating',
       'rank_max', 'rating_max', 'rank_avg', 'rating_avg', 'rank_min',
       'rating_min', 'rank_chg_3m', 'rating_chg_3m', 'rank_chg_6m',
       'rating_chg_6m', 'rank_chg_1y', 'rating_chg_1y', 'rank_chg_2y',
       'rating_chg_2y', 'rank_chg_5y', 'rating_chg_5y', 'rank_chg_10y',
       'rating_chg_10y', 'matches_total', 'matches_home', 'matches_away',
       'matches_neutral', 'matches_wins', 'matches_losses', 'matches_draws',
       'goals_for', 'goals_against']



team_ratings.columns = columns

team_ratings.head()

ValueError: Length mismatch: Expected axis has 30 elements, new values have 31 elements

In [62]:
matches = matches.fillna(0)
matches_onehot = pd.get_dummies(matches, columns=["Pos"], prefix="Pos")

df_grouped = (
    matches_onehot.groupby(["Country", "Player"])
      .mean(numeric_only=True)
      .reset_index()
)
df_grouped = df_grouped.drop(columns=["Country","Player"])
# df_grouped.head()
matches.columns
# TODO: join this table with the match info data so we can get if they won/loss

Index(['Country', 'Date', 'Player', 'Mins', 'Opp Team', 'Pos', 'Gls', 'Ast',
       'PKwon', 'PK', 'PKatt', 'Shots', 'SoT', 'TklW', 'CrdY', 'CrdR', 'OG',
       'GA', 'Saves', 'Save%', 'PK Non SO Conceded', 'PK SO scorer',
       'GK PK SO saves', 'team_elo', 'opp_elo', 'Mins per 90', 'Gls_per_90',
       'Ast_per_90', 'Shots_per_90', 'SoT_per_90', 'CrdY_per_90',
       'CrdR_per_90', 'GA_per_90', 'Saves_per_90', 'Fantasy Points'],
      dtype='object')

In [40]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score


In [44]:
X = df_grouped.drop(columns=["Fantasy Points"])
y = df_grouped["Fantasy Points"]

model = LinearRegression()
kf = KFold(n_splits=5, shuffle=True, random_state=42)

mse_scores = cross_val_score(
    model,
    X,
    y,
    cv=kf,
    scoring="neg_mean_squared_error"
)

rmse_scores = np.sqrt(-mse_scores)

print("RMSE per fold:", rmse_scores)
print("Average RMSE:", rmse_scores.mean())

# TODO: Figure out what we want to predict. The model is trained, but what do we do with it.
# One example is calculate the scores of players on USA. We would have to exclude USA players from the training data,
# but then we could see what their predicted values are.

RMSE per fold: [0.98237088 0.88163709 0.84771008 0.83392577 0.81780982]
Average RMSE: 0.8726907285834651


# Predicting team USA's fantasy points:

In [60]:
matches_usa = matches[matches['Country'] == "United States"]

# look for USA players
players = [
    "Matthew Freese",
    "Mark McKenzie",
    "Auston Trusty",
    "Alex Freeman",
    "Sebastian Berhalter",
    "Aidan Morris",
    "Cristian Roldan",
    "John Tolkin",
    "Max Arfsten",
    "Sergi√±o Dest",
    "Tanner Tessmann",
    "Haji Wright",
    "Folarin Balogun",
    "Diego Luna",
    "Giovanni Reyna",
    "Timothy Tillman",
    "Brenden Aaronson",
    "Patrick Schulte",
    "Joe Scally",
    "Ricardo Pepi",
    "Jonathan Klinsmann",
    "Roman Celentano",
    "Tim Ream",
    "Miles Robinson"]
# Match USA players
matches_usa = matches_usa[matches_usa["Player"].isin(players)]
# Average their stats per player
matches_usa = matches_usa.groupby("Player", as_index=False).mean(numeric_only=True)

# Get their next game info
opp_country = "Uruguay" # Most recent team they played, Nov 18 2025
uruguay_elo = ratings.loc[ratings["team_name"] == opp_country, "elo_rating"].iloc[0]
matches_usa["opp_elo"] = uruguay_elo

# remove past fantasy points column
matches_usa = matches_usa.drop(columns=["Fantasy Points"])
matches_usa = pd.get_dummies(matches, columns=["Pos"], prefix="Pos")

matches_usa.columns


Index(['Country', 'Date', 'Player', 'Mins', 'Opp Team', 'Gls', 'Ast', 'PKwon',
       'PK', 'PKatt', 'Shots', 'SoT', 'TklW', 'CrdY', 'CrdR', 'OG', 'GA',
       'Saves', 'Save%', 'PK Non SO Conceded', 'PK SO scorer',
       'GK PK SO saves', 'team_elo', 'opp_elo', 'Mins per 90', 'Gls_per_90',
       'Ast_per_90', 'Shots_per_90', 'SoT_per_90', 'CrdY_per_90',
       'CrdR_per_90', 'GA_per_90', 'Saves_per_90', 'Fantasy Points', 'Pos_DEF',
       'Pos_FWD', 'Pos_GK', 'Pos_MID'],
      dtype='object')