In [1]:
import os
import sys
from pathlib import Path
# When the kernel starts inside a "notebooks" directory, move up one level so
# the relative paths used in this notebook ("src", "conf/...", "data/...")
# resolve from the parent directory.
if Path(os.getcwd()).name == "notebooks":
    os.chdir("../")
    print(f"Working dir changed to {os.getcwd()}")
# Make modules under src/ importable. The membership check keeps sys.path free
# of duplicate entries when this cell is executed more than once.
if "src" not in sys.path:
    sys.path.append("src")

In [2]:
import sqlite3
import yaml
import pandas as pd
import re
from thefuzz import process

In [3]:
def load_yaml_as_dict(yaml_file):
    """Parse a YAML file and return its content.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document (a dict for a
        mapping at the top level, ``None`` for an empty file).

    Raises:
        OSError: If the file cannot be opened.
        yaml.YAMLError: If the content is not valid YAML.
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    with open(yaml_file, encoding="utf-8") as stream:
        return yaml.safe_load(stream)
# Read the data and model parameter files (in that order) into two variables.
data_params, model_params = (
    load_yaml_as_dict(params_path)
    for params_path in (
        "conf/base/parameters/data.yml",
        "conf/base/parameters/model.yml",
    )
)

In [4]:
def select_relevant_rows(processed_data):
    """Return a copy of the frame without superseded and non-feature columns.

    Despite the name, this removes *columns*, not rows. A column is dropped
    when either:

    * a ``<name>_ma<digits>`` variant of it exists in the frame, unless
      ``<name>`` is on the keep-list below; or
    * it is one of ``cached``, ``start``, ``match_points``,
      ``league_points``, or its name ends in ``_elo`` or ``_opp``.

    Args:
        processed_data: DataFrame to filter; it is not modified.

    Returns:
        A new DataFrame with the selected columns removed.
    """
    # Base columns that stay even though a moving-average variant exists.
    keep_despite_ma = {
        "value",
        "att_total",
        "home_att_total",
        "away_att_total",
        "def_total",
        "home_def_total",
        "away_def_total",
        "fpl_points",
    }

    ma_suffix = re.compile(r"_ma\d+")
    ma_variants = processed_data.filter(regex=r"\w+_ma\d+").columns
    # Strip the "_ma<digits>" part to recover the name of the base column.
    base_names = {ma_suffix.sub("", name) for name in ma_variants}

    candidates = [name for name in base_names if name not in keep_despite_ma]
    candidates.extend(["cached", "start", "match_points", "league_points"])
    for suffix_pattern in ("_elo$", "_opp$"):
        candidates.extend(processed_data.filter(regex=suffix_pattern).columns.tolist())

    # Only drop what is actually present so that missing columns do not raise.
    present = [name for name in candidates if name in processed_data.columns]
    return processed_data.drop(columns=present)

In [14]:
# Sum total_points per (full_name, season) from raw_fpl_data, aggregated in SQL.
conn = sqlite3.connect("data/fpl.db")
try:
    fpl_data = pd.read_sql(
        "select full_name, season, sum(total_points) from raw_fpl_data group by full_name, season",
        conn,
    )
finally:
    # Close the connection even when the query raises, so the database file
    # handle is not left open in the kernel.
    conn.close()

In [9]:
# Mapping table between player names; the cells below read its "fbref_name"
# and "fpl_name" columns.
player_name_mapping = pd.read_csv(
    "data/preprocess/player_mapping/player_name_mapping.csv"
)

In [46]:
search_for = "Carlton Morris"
# Fuzzy-match the searched name against every FPL name in the mapping table;
# each result is a (name, score) pair.
matched = process.extract(
    search_for, player_name_mapping["fpl_name"].tolist(), limit=10
)
# Prefer candidates scoring above 60; if none qualify, fall back to all of them.
top_matches = [m for m in matched if m[1] > 60]
if not top_matches:
    top_matches = matched

# Show the per-season point totals of the three best candidates.
for fpl_name, score in top_matches[:3]:
    # Filter with a boolean mask instead of a string-built query expression:
    # a name containing an apostrophe would otherwise end the quoted literal
    # early and make DataFrame.query fail to parse.
    data = fpl_data[fpl_data["full_name"] == fpl_name]
    # Right join keeps every points row and attaches the mapped names to it.
    data = pd.merge(
        player_name_mapping[["fbref_name", "fpl_name"]],
        data,
        how="right",
        right_on="full_name",
        left_on="fpl_name",
    ).drop("full_name", axis=1)
    display(data)

Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,,Carlton Morris,2023-2024,140.0


Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,Carlos Vinícius,Carlos Vinícius Alves Morais,2022-2023,72.0
1,Carlos Vinícius,Carlos Vinícius Alves Morais,2023-2024,22.0


Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,,Carlos Vinicius Alves Morais,2020-2021,13.0


In [48]:
# Fuzzy-match the same player name against the FBref names in the mapping table.
fbref_candidates = player_name_mapping["fbref_name"].tolist()
matched = process.extract("carlton morris", fbref_candidates, limit=10)
matched

[('Oscar', 68),
 ('Sean Morrison', 67),
 ('Ravel Morrison', 64),
 ('Rodri', 60),
 ('Antony', 60),
 ('Cafú', 60),
 ('Aaron Mooy', 58),
 ('Ramiro Funes Mori', 58),
 ('Alberto Moreno', 57),
 ('James Morrison', 57)]