In [1]:
import os
import sys
from pathlib import Path
# When the kernel's working directory is a folder named "notebooks", move to
# its parent so the relative paths used later (conf/..., data/...) resolve.
if Path(os.getcwd()).name == "notebooks":
    os.chdir("../")
    print(f"Working dir changed to {os.getcwd()}")
# Put "src" on the import path. The guard keeps the cell idempotent:
# re-running it no longer piles up duplicate "src" entries in sys.path.
if "src" not in sys.path:
    sys.path.append("src")

In [2]:
import sqlite3
import yaml
import pandas as pd
import re
from thefuzz import process

In [3]:
def load_yaml_as_dict(yaml_file):
    """Parse a YAML file with ``yaml.safe_load`` and return the result.

    Args:
        yaml_file: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the document.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Open in binary mode so decoding does not depend on the platform's
    # locale encoding; the YAML loader works out the encoding itself.
    with open(yaml_file, "rb") as stream:
        return yaml.safe_load(stream)
# Read the data and model parameter files into module-level variables.
data_params = load_yaml_as_dict(yaml_file="conf/base/parameters/data.yml")
model_params = load_yaml_as_dict(yaml_file="conf/base/parameters/model.yml")

In [4]:
def select_relevant_rows(processed_data):
    """Return a copy of ``processed_data`` without a fixed set of columns.

    Despite the name, this removes columns, not rows. Dropped when present:

    * the base column of every ``<name>_ma<digits>`` column (the ``_ma``
      columns themselves stay), except for a fixed list of base names that
      are always kept;
    * ``cached``, ``start``, ``match_points`` and ``league_points``;
    * every column whose name ends in ``_elo`` or ``_opp``.

    Args:
        processed_data: DataFrame to trim; it is not modified.

    Returns:
        A new DataFrame with the remaining columns in their original order.
    """
    rolling_cols = processed_data.filter(regex=r"\w+_ma\d+").columns
    # Strip the "_ma<digits>" part to recover the base column names.
    base_names = {re.sub(r"_ma\d+", "", name) for name in rolling_cols}

    # Base columns that are kept even though "_ma" versions of them exist.
    always_kept = {
        "value",
        "att_total",
        "home_att_total",
        "away_att_total",
        "def_total",
        "home_def_total",
        "away_def_total",
        "fpl_points",
    }
    drop_candidates = [name for name in base_names if name not in always_kept]

    drop_candidates.extend(["cached", "start", "match_points", "league_points"])
    for suffix_pattern in ("_elo$", "_opp$"):
        drop_candidates.extend(
            processed_data.filter(regex=suffix_pattern).columns.tolist()
        )

    # Only ask pandas to drop what actually exists, so missing columns are not an error.
    present = [name for name in drop_candidates if name in processed_data.columns]
    return processed_data.drop(columns=present)

In [14]:
# Load total points per (full_name, season) from the local SQLite database.
conn = sqlite3.connect("data/fpl.db")
try:
    # The aggregate is left unaliased, so the resulting column is literally
    # named "sum(total_points)".
    fpl_data = pd.read_sql(
        "select full_name, season, sum(total_points) from raw_fpl_data group by full_name, season",
        conn,
    )
finally:
    # Release the database handle even when the query raises.
    conn.close()

In [9]:
# Name lookup table; the cells below read its "fbref_name" and "fpl_name" columns.
player_name_mapping = pd.read_csv(
    filepath_or_buffer="data/preprocess/player_mapping/player_name_mapping.csv"
)

In [87]:
# Fuzzy-search the FPL names for `search_for`, then show the per-season rows
# of the best candidates alongside their FBref name.
search_for = "Danny Ward"
matched = process.extract(
    search_for, player_name_mapping["fpl_name"].tolist(), limit=10,
)
# Keep candidates scoring above 60; if none qualifies, fall back to the
# full candidate list so something is always shown.
top_matches = [m for m in matched if m[1] > 60]
if not top_matches:
    top_matches = matched

for fpl_name, score in top_matches[:3]:
    # Boolean mask instead of query(f"full_name == '{fpl_name}'"): splicing
    # the name into the expression breaks on names containing an apostrophe.
    data = fpl_data[fpl_data["full_name"] == fpl_name]
    data = pd.merge(
        player_name_mapping[["fbref_name", "fpl_name"]],
        data,
        how="right",
        right_on="full_name",
        left_on="fpl_name",
    ).drop("full_name", axis=1)
    display(data)

Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,Danny Ward,Danny Ward,2017-2018,0.0
1,Danny Ward,Danny Ward,2018-2019,22.0
2,Danny Ward,Danny Ward,2019-2020,0.0
3,Danny Ward,Danny Ward,2020-2021,0.0
4,Danny Ward,Danny Ward,2021-2022,3.0
5,Danny Ward,Danny Ward,2022-2023,91.0


Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,James Ward-Prowse,James Ward-Prowse,2016-2017,92.0
1,James Ward-Prowse,James Ward-Prowse,2017-2018,84.0
2,James Ward-Prowse,James Ward-Prowse,2018-2019,94.0
3,James Ward-Prowse,James Ward-Prowse,2019-2020,117.0
4,James Ward-Prowse,James Ward-Prowse,2020-2021,156.0
5,James Ward-Prowse,James Ward-Prowse,2021-2022,159.0
6,James Ward-Prowse,James Ward-Prowse,2022-2023,142.0
7,James Ward-Prowse,James Ward-Prowse,2023-2024,136.0


Unnamed: 0,fbref_name,fpl_name,season,sum(total_points)
0,Danny Rose,Danny Rose,2016-2017,84.0
1,Danny Rose,Danny Rose,2017-2018,24.0
2,Danny Rose,Danny Rose,2018-2019,86.0
3,Danny Rose,Danny Rose,2019-2020,47.0
4,Danny Rose,Danny Rose,2020-2021,0.0
5,Danny Rose,Danny Rose,2021-2022,7.0


In [88]:
# Same fuzzy lookup, this time against the FBref names; the scored
# candidates are shown as the cell output.
matched = process.extract(
    query=search_for,
    choices=player_name_mapping["fbref_name"].to_list(),
    limit=10,
)
matched

[('Danny Ward', 100),
 ('Danny Drinkwater', 86),
 ('James Ward-Prowse', 86),
 ('Danny Rose', 70),
 ('Danny Williams', 67),
 ('Antony', 65),
 ('Danny Simpson', 63),
 ('Danny Welbeck', 63),
 ('Danny Ings', 63),
 ('Danilo', 60)]

In [166]:
# Reload the selected raw_fpl_data columns without any aggregation.
# NOTE: this rebinds `fpl_data`, which an earlier cell filled with the
# per-season aggregate; cells re-run after this one see the raw rows instead.
conn = sqlite3.connect("data/fpl.db")
try:
    fpl_data = pd.read_sql(
        """
    select season, team, full_name, opponent_team, opponent_team_name, fixture, was_home, total_points 
    from raw_fpl_data
    """,
        conn,
    )
finally:
    # Release the database handle even when the query raises.
    conn.close()