In [2]:
import os
import sys
from pathlib import Path
# If the kernel was started inside a directory named "notebooks", move up one
# level so the relative paths used below resolve from the parent directory.
launched_from_notebooks = Path.cwd().name == "notebooks"
if launched_from_notebooks:
    os.chdir("../")
    print(f"Working dir changed to {os.getcwd()}")
# Add src/ (relative to the current working directory) to the import path.
sys.path.append("src")

In [3]:
import sqlite3
import yaml
import pandas as pd
import re
from thefuzz import process

In [4]:
def load_yaml_as_dict(yaml_file):
    """Parse a YAML file with ``yaml.safe_load`` and return the result.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        The object produced by ``yaml.safe_load`` for the file's content
        (a dict when the document's top level is a mapping).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8: without an encoding argument, open() uses the
    # platform's locale encoding, so non-ASCII values could be read differently
    # (or fail to decode) depending on the machine running the notebook.
    with open(yaml_file, encoding="utf-8") as stream:
        return yaml.safe_load(stream)
# Read the data and model parameter files, in that order.
data_params, model_params = map(
    load_yaml_as_dict,
    ("conf/base/parameters/data.yml", "conf/base/parameters/model.yml"),
)

In [5]:
def select_relevant_rows(processed_data):
    """Return ``processed_data`` without raw and non-feature columns.

    Despite the name, this removes columns, not rows. The following are
    dropped when present in the frame:

    * the base name of every ``<base>_ma<digits>`` column (presumably moving
      averages), i.e. the column name with the ``_ma<digits>`` part removed,
      unless that base name is in the keep list below;
    * ``cached``, ``start``, ``match_points`` and ``league_points``;
    * every column whose name ends in ``_elo`` or ``_opp``.

    Args:
        processed_data: DataFrame to trim.

    Returns:
        A new DataFrame with the selected columns removed; the input frame is
        not modified.
    """
    # Base columns that stay even though a "_ma<digits>" version exists.
    keep_despite_ma = {
        "value",
        "att_total",
        "home_att_total",
        "away_att_total",
        "def_total",
        "home_def_total",
        "away_def_total",
        "fpl_points",
    }

    ma_column_names = processed_data.filter(regex=r"\w+_ma\d+").columns
    ma_bases = {re.sub(r"_ma\d+", "", name) for name in ma_column_names}
    drop_candidates = [base for base in ma_bases if base not in keep_despite_ma]

    drop_candidates.extend(["cached", "start", "match_points", "league_points"])
    for suffix_pattern in ("_elo$", "_opp$"):
        matching = processed_data.filter(regex=suffix_pattern).columns.tolist()
        drop_candidates.extend(matching)

    # Only drop names that actually exist, so drop() never raises on a miss.
    available = processed_data.columns
    to_drop = [name for name in drop_candidates if name in available]
    return processed_data.drop(columns=to_drop)

In [69]:
# Sum total_points per (full_name, season) from raw_fpl_data.
conn = sqlite3.connect("data/fpl.db")
try:
    fpl_data = pd.read_sql(
        "select full_name, season, sum(total_points) from raw_fpl_data group by full_name, season",
        conn,
    )
finally:
    # Close the connection even if the query fails, so a failed run does not
    # leave an open handle on the database file in the kernel.
    conn.close()

In [76]:
player_name_mapping = pd.read_csv("data/preprocess/player_name_mapping.csv")

# Fuzzy-search the FPL spellings in the mapping file for this player name.
search_for = "Kaine Kesler-Hayden"
matched = process.extract(
    search_for, player_name_mapping["fpl_name"].tolist(), limit=10
)
# Keep candidates scoring above 60; if none do, fall back to every candidate
# returned so there is always something to inspect.
top_matches = [m for m in matched if m[1] > 60]
if not top_matches:
    top_matches = matched

# For each candidate, show its per-season points next to the mapped FBref name.
for fpl_name, score in top_matches:
    # Filter with a boolean mask instead of interpolating the name into a
    # query() string: a name containing an apostrophe would terminate the
    # quoted literal early and make the expression invalid.
    data = fpl_data[fpl_data["full_name"] == fpl_name]
    data = pd.merge(
        player_name_mapping[["fbref_name", "fpl_name"]],
        data,
        how="right",
        right_on="full_name",
        left_on="fpl_name",
    ).drop("full_name", axis=1)
    display(data)

Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,Kaine Kesler-Hayden,Kaine Kesler Hayden,2022-2023,0.0


Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,,Kaine Kesler-Hayden,2023-2024,2.0


Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,Isaac Hayden,Isaac Hayden,2017-2018,41.0
1,Isaac Hayden,Isaac Hayden,2018-2019,69.0
2,Isaac Hayden,Isaac Hayden,2019-2020,73.0
3,Isaac Hayden,Isaac Hayden,2020-2021,40.0
4,Isaac Hayden,Isaac Hayden,2021-2022,28.0
5,Isaac Hayden,Isaac Hayden,2023-2024,0.0


Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,,Kaine Hayden,2019-2020,0.0
1,,Kaine Hayden,2020-2021,0.0
2,,Kaine Hayden,2021-2022,0.0


In [74]:
# Run the same fuzzy search against the FBref spellings in the mapping file.
fbref_choices = player_name_mapping["fbref_name"].tolist()
matched = process.extract(search_for, fbref_choices, limit=10)
matched

[('Diego Costa', 86),
 ('Kevin De Bruyne', 86),
 ('David de Gea', 86),
 ('Marten de Roon', 86),
 ('Hélder Costa', 86),
 ('Donny van de Beek', 86),
 ('Micky van de Ven', 86),
 ('Ederson', 64),
 ('Eric Dier', 60),
 ('Oscar', 60)]

In [75]:
# Show the mapping row(s) whose FBref name is exactly "José Izquierdo".
izquierdo_mapping = player_name_mapping.query("fbref_name == 'José Izquierdo'")
izquierdo_mapping

Unnamed: 0,fbref_id,fbref_name,fpl_name,fuzzy_score,total_points,review,duplicated,missing_matchlogs
596,ca2bc3df,José Izquierdo,José Heriberto Izquierdo Mena,100.0,,,,
