In [9]:
import os
import re
import sqlite3
import sys
from datetime import timedelta

import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

sys.path.append("..")

from src.fpl.pipelines.optimization_pipeline.fpl_api import get_fpl_base_data

In [10]:
# Open the local SQLite database. The handle is left open after this cell.
# NOTE(review): close it once no later cell needs `connection`.
connection = sqlite3.connect("../data/fpl.db")

# Fetch every column of the stored rows for a single player so the table
# layout can be inspected.
raw_data = pd.read_sql_query(
    sql="""SELECT *
                             FROM raw_fpl_data 
                             where full_name = 'Bukayo Saka'""",
    con=connection,
)
# Disabled step, kept for reference:
# raw_data["start"] = pd.to_datetime(raw_data["date"])
raw_data.head()

Unnamed: 0,season,round,element,full_name,team,position,fixture,opponent_team,opponent_team_name,total_points,...,ict_index,value,transfers_balance,selected,transfers_in,transfers_out,expected_goals,expected_goal_involvements,expected_assists,expected_goals_conceded
0,2018-2019,18.0,563,Bukayo Saka,,,171,4,,0.0,...,0.0,45.0,0.0,0.0,0.0,0.0,,,,
1,2018-2019,19.0,563,Bukayo Saka,,,181,3,,0.0,...,0.0,45.0,52.0,96.0,73.0,21.0,,,,
2,2018-2019,20.0,563,Bukayo Saka,,,196,12,,0.0,...,0.0,45.0,29.0,167.0,65.0,36.0,,,,
3,2018-2019,21.0,563,Bukayo Saka,,,201,9,,1.0,...,0.0,45.0,25.0,234.0,74.0,49.0,,,,
4,2018-2019,22.0,563,Bukayo Saka,,,220,19,,0.0,...,0.0,45.0,71.0,426.0,167.0,96.0,,,,


In [12]:
# Season label; the first run of digits in it is taken as the current year.
current_season = "2023-2024"
current_year = int(
    re.findall(r'\d+', current_season)[0]
)

# Every year from 2016 up to and including the current one.
list(range(2016, current_year + 1))

[2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023]

In [145]:
# Column-name sets of the two frames being compared
# (df1 = raw_data, df2 = current_season_data).
columns_df1 = set(raw_data.columns)
columns_df2 = set(current_season_data.columns)

# Names present in df1 but missing from df2.
only_in_df1 = columns_df1.difference(columns_df2)
print("Columns only in df1: ", only_in_df1)

# Names present in df2 but missing from df1.
only_in_df2 = columns_df2.difference(columns_df1)
print("Columns only in df2: ", only_in_df2)


Columns only in df1:  set()
Columns only in df2:  set()


In [96]:
def refresh_fpl_names_mapping(project_root="."):
    """Rebuild ``fpl_names_mapping.csv`` from the projection files on disk.

    The function:

    1. Reads ``name``/``team``/``position`` from ``merged_gw.csv`` and joins
       them onto the first frame returned by ``get_fpl_base_data()`` to get a
       de-duplicated table of ``web_name``, ``short_name`` and ``position``.
    2. Reads every file in the ``FPL_projections_22_23`` directory, joins each
       to ``ID_Dictionary.csv`` on ``Name`` and collects the distinct
       ``FPL name``/``Team``/``Pos``/``Price`` rows.
    3. Runs ``fuzzy_match`` on every collected row and keeps one row per
       ``(Team, matched)`` pair.
    4. Writes the ``pred_pts_fpl_name`` -> ``fpl_name`` pairs to
       ``src/fpl/pipelines/model_pipeline/fpl_names_mapping.csv``.

    Args:
        project_root: Directory that contains the ``data`` and ``src``
            folders. Defaults to ``"."`` (the current working directory),
            which reproduces the previously hard-coded ``./data/...`` and
            ``./src/...`` locations.

    Returns:
        None. The result is written to disk.

    Raises:
        FileNotFoundError: If the projections directory contains no files.
    """
    raw_dir = os.path.join(project_root, "data", "raw")
    kiwi_dir = os.path.join(raw_dir, "theFPLkiwi")

    elements_team = pd.read_csv(
        os.path.join(raw_dir, "backtest_data", "merged_gw.csv")
    )[["name", "team", "position"]]
    latest_elements_team, _, _, _ = get_fpl_base_data()
    # Pair the left frame's full_name/name with the CSV's name/team columns.
    # NOTE(review): this assumes the left "name" column holds the team name;
    # confirm against get_fpl_base_data.
    elements_team = latest_elements_team.merge(
        elements_team,
        left_on=["full_name", "name"],
        right_on=["name", "team"],
        suffixes=("", "_y"),
    )
    elements_team = (
        elements_team[["web_name", "short_name", "position"]]
        .drop_duplicates()
        .reset_index(drop=True)
    )

    projections_dir = os.path.join(kiwi_dir, "FPL_projections_22_23")
    # os.listdir returns entries in arbitrary order; sort so that the
    # keep="last" de-duplication below gives the same result on every run.
    files = sorted(
        os.path.join(projections_dir, entry)
        for entry in os.listdir(projections_dir)
        if os.path.isfile(os.path.join(projections_dir, entry))
    )
    if not files:
        raise FileNotFoundError(
            f"No projection files found in {projections_dir!r}"
        )

    fpl_name_dict = pd.read_csv(
        os.path.join(kiwi_dir, "ID_Dictionary.csv"), encoding="cp1252"
    )[["Name", "FPL name"]]

    # Collect the per-file frames and concatenate once instead of growing a
    # DataFrame inside the loop.
    per_file_rows = [
        fpl_name_dict.merge(pd.read_csv(file), on="Name")[
            ["FPL name", "Team", "Pos", "Price"]
        ].drop_duplicates()
        for file in files
    ]
    pred_pts_data = pd.concat(per_file_rows, ignore_index=True)

    pred_pts_data = pred_pts_data.rename({"FPL name": "pred_pts_fpl_name"}, axis=1)
    pred_pts_data = pred_pts_data.drop_duplicates(
        subset=["pred_pts_fpl_name", "Team", "Pos"], keep="last"
    ).reset_index(drop=True)

    # fuzzy_match is defined outside this block. Its result is compared
    # against the projection name below, so it is presumably a player name
    # drawn from elements_team - confirm.
    tqdm.pandas(desc="Resolving FPL names in predicted pts data")
    pred_pts_data["matched"] = pred_pts_data.progress_apply(
        lambda row: fuzzy_match(row, elements_team), axis=1
    )
    pred_pts_data["same"] = (
        pred_pts_data["pred_pts_fpl_name"] == pred_pts_data["matched"]
    )
    # Descending sort puts exact-name rows first, then higher Price, so the
    # drop_duplicates below keeps that row for each (Team, matched) pair.
    pred_pts_data = pred_pts_data.sort_values(["same", "Price"], ascending=False)
    pred_pts_data = pred_pts_data.drop_duplicates(["Team", "matched"])
    pred_pts_data["fpl_name"] = pred_pts_data["matched"]
    pred_pts_data = pred_pts_data[["pred_pts_fpl_name", "fpl_name"]].reset_index(
        drop=True
    )

    pred_pts_data.to_csv(
        os.path.join(
            project_root,
            "src",
            "fpl",
            "pipelines",
            "model_pipeline",
            "fpl_names_mapping.csv",
        ),
        index=False,
    )
    return None