In [3]:
import os
import sys
from pathlib import Path
if Path(os.getcwd()).name == "notebooks":
    os.chdir("../")
    print(f"Working dir changed to {os.getcwd()}")
sys.path.append("src")

In [60]:
import sqlite3
import yaml
import pandas as pd
import re
from thefuzz import process

In [8]:
def load_yaml_as_dict(yaml_file):
    with open(yaml_file) as stream:
        return yaml.safe_load(stream)
data_params = load_yaml_as_dict("conf/base/parameters/data.yml")
model_params = load_yaml_as_dict("conf/base/parameters/model.yml")

In [15]:
def select_relevant_rows(processed_data):
    ma_cols = processed_data.filter(regex=r"\w+_ma\d+").columns
    ma_cols = set([re.sub(r"_ma\d+", "", col) for col in ma_cols])
    excluded = [
        "value",
        "att_total",
        "home_att_total",
        "away_att_total",
        "def_total",
        "home_def_total",
        "away_def_total",
        "fpl_points",
    ]
    ma_cols = [col for col in ma_cols if col not in excluded]
    non_features = ["cached", "start", "match_points", "league_points"]
    non_features += processed_data.filter(regex="_elo$").columns.tolist()
    non_features += processed_data.filter(regex="_opp$").columns.tolist()
    to_drop = [col for col in ma_cols + non_features if col in processed_data.columns]
    processed_data = processed_data.drop(to_drop, axis=1)
    return processed_data

In [62]:
conn = sqlite3.connect("data/fpl.db")

intermediate_data = pd.read_sql(f"select * from intermediate_data", conn)
intermediate_data = select_relevant_rows(intermediate_data)

conn.close()

In [53]:
highest_points_without_match_log = (
    intermediate_data[
        intermediate_data["player"].isna() & intermediate_data["fpl_points"] != 0
    ]
    .groupby("fpl_name")[["fpl_points"]]
    .sum()
    .sort_values("fpl_points", ascending=False)
)
highest_points_without_match_log

Unnamed: 0_level_0,fpl_points
fpl_name,Unnamed: 1_level_1
Rúben Diogo da Silva Neves,199.0
Thiago Thiago,92.0
Ricardo Domingos Barbosa Pereira,62.0
Bamidele Alli,35.0
Fabio Carvalho,12.0
Rúben Gonçalo Silva Nascimento Vinagre,1.0
Marcus Bettinelli,-1.0
Marek Rodák,-1.0
Nohan Kenneh,-1.0


In [85]:
player_name_mapping = pd.read_csv("data/preprocess/player_mapping/player_name_mapping.csv")
search_for = "Allan Nyom"
player_name_mapping[player_name_mapping["fpl_name"] == search_for]

Unnamed: 0,fbref_name,fpl_name,fuzzy_score,total_points,review,duplicated


In [87]:
matched = process.extract(
    search_for, player_name_mapping["fpl_name"].tolist(), limit=10, 
)
matched = [m for m in matched if m[1] > 60]
if matched:
    print(matched)

[('Allan Marques Loureiro', 86), ('Allan Marques Loureiro', 86), ('Allan Saint-Maximin', 86), ('Allan Tchaptchet', 86), ('Allan-Roméo Nyom', 86), ('Thomas Allan', 69), ('Allan Campbell', 63), ('Allan McGregor', 63)]


In [81]:
player_name_mapping.query("fpl_name == 'Matthew James'")

Unnamed: 0,season,fbref_name,fpl_name,fuzzy_score,total_points,review,duplicated
4346,2016-2017,,Matthew James,,1.0,True,
