In [2]:
import pandas as pd
base_url = "https://raw.githubusercontent.com/olbauday/FPL-Elo-Insights/main/data/2025-2026"
by_tournament = f"{base_url}/By%20Tournament/Premier%20League"
urls = {
    "teams": f"{base_url}/teams.csv",
    "players": f"{by_tournament}/GW11/players.csv",  # snapshot (or replace GW11 with latest)
    "playerstats": f"{base_url}/playerstats.csv",
    "gameweek_summaries": f"{base_url}/gameweek_summaries.csv"
}


In [3]:
teams = pd.read_csv(urls["teams"])
playerstats = pd.read_csv(urls["playerstats"])
gameweek_summaries = pd.read_csv(urls["gameweek_summaries"])
players = pd.read_csv(urls["players"])

In [4]:
gw_data = []
for i in range(1, 39):  # 1‚Äì38 GWs
    url = f"{by_tournament}/GW{i}/player_gameweek_stats.csv"
    try:
        df = pd.read_csv(url)
        df["gameweek"] = i
        gw_data.append(df)
    except Exception:
        pass  # skip future GWs that aren't released yet

player_gw_stats = pd.concat(gw_data, ignore_index=True)

  player_gw_stats = pd.concat(gw_data, ignore_index=True)


In [5]:
# Standardize column names for merging
player_gw_stats = player_gw_stats.rename(columns={"id": "player_id"})
playerstats = playerstats.rename(columns={"id": "player_id"})
players = players.rename(columns={"player_id": "player_id"})  # already fine
teams = teams.rename(columns={"id": "team_id"})


In [6]:
merged_df = (
    player_gw_stats
    .merge(playerstats, on="player_id", suffixes=("_gw", "_season"))
    .merge(players, on="player_id", how="left")  # adds team_code + position
    .merge(teams, left_on="team_code", right_on="team_id", how="left", suffixes=("", "_team"))
)


In [7]:
print(merged_df.shape)
print(merged_df.columns[:25])
#Each row = one player in one gameweek.
#Each column = a metric from one of those 4 sources.
#You have about 79,000 rows √ó 194 columns, meaning:
#roughly 2,000+ players √ó 35‚Äì40 gameweeks worth of data
#every single numeric and categorical variable combined into one master table.

(95169, 194)
Index(['player_id', 'first_name_gw', 'second_name_gw', 'web_name_gw',
       'status_gw', 'news_gw', 'news_added_gw', 'now_cost_gw',
       'now_cost_rank_gw', 'now_cost_rank_type_gw', 'selected_by_percent_gw',
       'selected_rank_gw', 'selected_rank_type_gw', 'form_gw', 'form_rank_gw',
       'form_rank_type_gw', 'event_points_gw', 'cost_change_event_gw',
       'cost_change_event_fall_gw', 'cost_change_start_gw',
       'cost_change_start_fall_gw', 'transfers_in_event_gw',
       'transfers_out_event_gw', 'value_form_gw', 'value_season_gw'],
      dtype='object')


In [8]:
import numpy as np
df_clean = merged_df.copy()
# 1Ô∏è‚É£ Drop columns that are mostly missing
threshold = 0.6
too_many_missing = df_clean.columns[df_clean.isnull().mean() > threshold]
df_clean = df_clean.drop(columns=too_many_missing)
# 2Ô∏è‚É£ Separate numeric and non-numeric columns
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns
non_numeric_cols = df_clean.select_dtypes(exclude=[np.number]).columns
# 3Ô∏è‚É£ Fill numeric columns with median
df_clean[numeric_cols] = df_clean[numeric_cols].fillna(df_clean[numeric_cols].median())
# 4Ô∏è‚É£ Fill categorical columns with 'Unknown' + infer_objects() fix
df_clean[non_numeric_cols] = (
    df_clean[non_numeric_cols]
    .fillna("Unknown")
    .infer_objects(copy=False)   # ‚úÖ <‚Äî This is the exact line that fixes the FutureWarning
)
# 5Ô∏è‚É£ Verify cleaning
print("‚úÖ Data cleaned:")
print("Rows:", df_clean.shape[0], "| Columns:", df_clean.shape[1])
print("Remaining NaNs:", df_clean.isnull().sum().sum())

‚úÖ Data cleaned:
Rows: 95169 | Columns: 177
Remaining NaNs: 0


  .fillna("Unknown")


In [10]:
#target variable
df_clean['next_gw_points'] = df_clean.groupby('player_id')['event_points_gw'].shift(-1)

In [12]:
# --- Feature Engineering (fixed for your dataset) ---

df_clean = df_clean.sort_values(['player_id', 'gameweek'])

# Rolling averages (form over last 3 gameweeks)
for col in ['event_points_gw', 'goals_scored_gw', 'assists_gw', 'expected_goals_gw', 'expected_assists_gw']:
    if col in df_clean.columns:
        df_clean[f'{col}_roll3'] = (
            df_clean.groupby('player_id')[col]
            .transform(lambda x: x.rolling(3, min_periods=1).mean())
        )

# Team form (average points per team per GW)
team_form = (
    df_clean.groupby(['team_id', 'gameweek'])['event_points_gw']
    .mean()
    .reset_index(name='team_avg_points')
)
df_clean = df_clean.merge(team_form, on=['team_id', 'gameweek'], how='left')

# Opponent difficulty (proxy using team defensive and overall strength)
df_clean = df_clean.merge(
    teams[['team_id','strength_defence_home', 'strength_defence_away']],
    on='team_id', how='left'
)

# ‚úÖ Use available team strength columns safely
def_cols = [c for c in ['strength_defence_home', 'strength_defence_away'] if c in df_clean.columns]
if def_cols:
    df_clean['opp_difficulty_proxy'] = df_clean[def_cols].mean(axis=1)
    df_clean['team_strength_avg'] = df_clean[def_cols].mean(axis=1)
else:
    # fallback if those cols missing ‚Äî use 'strength' as proxy
    df_clean['opp_difficulty_proxy'] = df_clean['strength']
    df_clean['team_strength_avg'] = df_clean['strength']


print("‚úÖ Feature engineering complete.")
print("New columns added:", [c for c in df_clean.columns if 'roll3' in c or 'avg' in c or 'diff' in c])


‚úÖ Feature engineering complete.
New columns added: ['event_points_gw_roll3', 'goals_scored_gw_roll3', 'assists_gw_roll3', 'expected_goals_gw_roll3', 'expected_assists_gw_roll3', 'team_avg_points_x', 'team_avg_points_y', 'opp_difficulty_proxy', 'team_strength_avg']


In [14]:
# --- Feature selection for prediction ---

# Target variable: next gameweek points
y = df_clean['next_gw_points']

# Feature columns
feature_cols = [
    # Player performance and form
    'form_gw', 'points_per_game_gw', 'value_form_gw', 'selected_by_percent_gw',
    'minutes_gw', 'total_points_gw',
    'goals_scored_gw', 'assists_gw', 'clean_sheets_gw',
    'bps_gw', 'ict_index_gw',
    'expected_goals_gw', 'expected_assists_gw', 'expected_goal_involvements_gw',
    'expected_goals_conceded_gw',
    'influence_gw', 'creativity_gw', 'threat_gw',

    # Rolling averages (short-term form)
    'event_points_gw_roll3', 'goals_scored_gw_roll3',
    'assists_gw_roll3', 'expected_goals_gw_roll3', 'expected_assists_gw_roll3',

    # Team-level features
    'team_avg_points', 'team_strength_avg',

    # Opponent difficulty proxy
    'opp_difficulty_proxy',

    # Season-level performance
    'form_season', 'points_per_game_season', 'total_points_season',
    'expected_goals_season', 'expected_assists_season',
    'expected_goal_involvements_season', 'value_form_season', 'value_season_season',
    'influence_season', 'creativity_season', 'threat_season', 'ict_index_season'
]

# Filter only available columns (avoids KeyErrors)
feature_cols = [col for col in feature_cols if col in df_clean.columns]

# Define X
X = df_clean[feature_cols]

print("‚úÖ Features ready for modeling:")
print("X shape:", X.shape)
print("y shape:", y.shape)
print("Features used:", feature_cols)


‚úÖ Features ready for modeling:
X shape: (95169, 37)
y shape: (95169,)
Features used: ['form_gw', 'points_per_game_gw', 'value_form_gw', 'selected_by_percent_gw', 'minutes_gw', 'total_points_gw', 'goals_scored_gw', 'assists_gw', 'clean_sheets_gw', 'bps_gw', 'ict_index_gw', 'expected_goals_gw', 'expected_assists_gw', 'expected_goal_involvements_gw', 'expected_goals_conceded_gw', 'influence_gw', 'creativity_gw', 'threat_gw', 'event_points_gw_roll3', 'goals_scored_gw_roll3', 'assists_gw_roll3', 'expected_goals_gw_roll3', 'expected_assists_gw_roll3', 'team_strength_avg', 'opp_difficulty_proxy', 'form_season', 'points_per_game_season', 'total_points_season', 'expected_goals_season', 'expected_assists_season', 'expected_goal_involvements_season', 'value_form_season', 'value_season_season', 'influence_season', 'creativity_season', 'threat_season', 'ict_index_season']


In [15]:
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score

# Drop rows where target is missing or invalid
df_model = df_clean.copy()

# Remove NaN, inf, or -inf from target column
df_model = df_model[
    df_model['next_gw_points'].notnull() & 
    (~df_model['next_gw_points'].isin([np.inf, -np.inf]))
]

# Replace infinite values in features with NaN and then fill
X = df_model[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)
y = df_model['next_gw_points'].astype(float)

print("‚úÖ Cleaned model data:")
print("Rows:", X.shape[0], "| Columns:", X.shape[1])
print("Remaining NaNs in y:", y.isna().sum())


# Split data (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training set:", X_train.shape)
print("Test set:", X_test.shape)

# Initialize model
model = XGBRegressor(
    n_estimators=300,         # Number of boosting rounds
    learning_rate=0.05,       # Step size shrinkage
    max_depth=6,              # Max depth of trees
    subsample=0.8,            # Row sampling
    colsample_bytree=0.8,     # Feature sampling
    random_state=42,
    objective='reg:squarederror'
)

# Train model
model.fit(X_train, y_train)

print("‚úÖ Model training complete!")

# Predict on test set
y_pred = model.predict(X_test)

# Evaluate
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"üìä Mean Absolute Error (MAE): {mae:.3f}")
print(f"üìà R¬≤ Score: {r2:.3f}")

import matplotlib.pyplot as plt

# --- PREDICT FOR ALL PLAYERS (including those without y) ---

# Clean the feature data for all rows
X_all = df_clean[feature_cols].replace([np.inf, -np.inf], np.nan).fillna(0)

# Generate predictions using the trained model
df_clean['predicted_next_points'] = model.predict(X_all)

# Show top 10 predicted players for next GW
predictions = (
    df_clean[['player_id', 'web_name_gw', 'team_id', 'predicted_next_points']]
    .drop_duplicates('player_id')
    .sort_values('predicted_next_points', ascending=False)
)

print("üîÆ Top 10 predicted players for next GW:")
print(predictions.head(10))



‚úÖ Cleaned model data:
Rows: 94421 | Columns: 37
Remaining NaNs in y: 0
Training set: (75536, 37)
Test set: (18885, 37)
‚úÖ Model training complete!
üìä Mean Absolute Error (MAE): 0.151
üìà R¬≤ Score: 0.925
üîÆ Top 10 predicted players for next GW:
       player_id  web_name_gw  team_id  predicted_next_points
10692         82      Semenyo      7.0              15.465090
69960        531      Ballard      7.0              14.565966
792            7    Calafiori      3.0              13.097381
78672        597  Richarlison      6.0              12.906957
69168        525         Wood     17.0              12.679049
56628        430      Haaland      7.0              12.291391
87120        661      Ekitik√©     14.0              10.895704
53988        410        Lewis      7.0              10.799732
76692        582        Kudus      6.0              10.163502
56232        427    Reijnders      7.0              10.098228


In [16]:
import unicodedata

# --- Helper: Normalize text (remove accents, lowercase) ---
def normalize_text(text):
    if isinstance(text, str):
        text = unicodedata.normalize('NFKD', text)
        text = ''.join([c for c in text if not unicodedata.combining(c)])
        return text.lower().strip()
    return str(text).lower().strip()


# --- INTERACTIVE PREDICTION LOOKUP (accent-insensitive + full name support) ---
def get_player_prediction(df, player_input):
    """
    Search for player by web_name, first name, second name, full name, or player_id
    and return their predicted next GW points.
    Accent-insensitive and flexible for partial matches.
    """
    player_input_norm = normalize_text(player_input)

    # Create normalized versions of relevant columns for easy matching
    df_search = df.copy()
    df_search['web_name_norm'] = df_search['web_name_gw'].apply(normalize_text)
    df_search['first_name_norm'] = df_search['first_name_gw'].apply(normalize_text)
    df_search['second_name_norm'] = df_search['second_name_gw'].apply(normalize_text)
    df_search['full_name_norm'] = (  # combine first + second name for full name search
        df_search['first_name_norm'].fillna('') + ' ' + df_search['second_name_norm'].fillna('')
    ).str.strip()
    df_search['player_id_str'] = df_search['player_id'].astype(str)

    # Flexible search mask
    mask = (
        df_search['web_name_norm'].str.contains(player_input_norm, na=False)
        | df_search['first_name_norm'].str.contains(player_input_norm, na=False)
        | df_search['second_name_norm'].str.contains(player_input_norm, na=False)
        | df_search['full_name_norm'].str.contains(player_input_norm, na=False)
        | df_search['player_id_str'].str.contains(player_input_norm, na=False)
    )

    results = df_search.loc[
        mask,
        ['player_id', 'first_name_gw', 'second_name_gw', 'web_name_gw', 'team_id', 'predicted_next_points']
    ]

    if results.empty:
        print("‚ö†Ô∏è No matching player found. Try a different name or player ID.")
    else:
        print(f"‚úÖ Predicted Next GW Points for players matching '{player_input}':")
        display(results.drop_duplicates('player_id').sort_values('predicted_next_points', ascending=False))


# üîπ Ask user for input
user_query = input("Enter player web name, first/second name, full name, or player ID: ")
get_player_prediction(df_clean, user_query)



Enter player web name, first/second name, full name, or player ID:  haaland


‚úÖ Predicted Next GW Points for players matching 'haaland':


Unnamed: 0,player_id,first_name_gw,second_name_gw,web_name_gw,team_id,predicted_next_points
56628,430,Erling,Haaland,Haaland,7.0,12.291391


In [17]:
# ===============================================
# üß† Logistic Regression: FPL Player Recommendation
# ===============================================

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

# 1Ô∏è‚É£ Ensure necessary columns exist
required_cols = [
    'predicted_next_points', 'now_cost_gw', 'position',
    'form_gw', 'team_strength_avg', 'opp_difficulty_proxy'
]
missing = [c for c in required_cols if c not in df_clean.columns]
if missing:
    print(f"‚ö†Ô∏è Missing columns: {missing}. Filling with defaults.")
    for c in missing:
        df_clean[c] = 0

# 2Ô∏è‚É£ Value-for-money feature
df_clean['value_for_money'] = df_clean['predicted_next_points'] / df_clean['now_cost_gw']

# 3Ô∏è‚É£ Create target labels (FPL logic-based)
def categorize_player(row):
    if row['predicted_next_points'] >= 10 or row['value_for_money'] >= 1.5:
        return 'Start'
    elif row['predicted_next_points'] >= 6 or row['value_for_money'] >= 1.0:
        return 'Bench'
    else:
        return 'Sell'

df_clean['label'] = df_clean.apply(categorize_player, axis=1)

# 4Ô∏è‚É£ Prepare features and target
features = [
    'predicted_next_points', 'now_cost_gw', 'value_for_money',
    'form_gw', 'team_strength_avg', 'opp_difficulty_proxy', 'position'
]
X = df_clean[features]
y = df_clean['label']

# 5Ô∏è‚É£ Preprocessing: scale numeric & one-hot encode position
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), [
        'predicted_next_points', 'now_cost_gw', 'value_for_money',
        'form_gw', 'team_strength_avg', 'opp_difficulty_proxy'
    ]),
    ('cat', OneHotEncoder(), ['position'])
])

# 6Ô∏è‚É£ Build pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(multi_class='multinomial', max_iter=1000, random_state=42))
])

# 7Ô∏è‚É£ Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 8Ô∏è‚É£ Train model
model.fit(X_train, y_train)
print("‚úÖ Logistic Regression model trained successfully!")

# 9Ô∏è‚É£ Evaluate model
acc = model.score(X_test, y_test)
print(f"üìä Model accuracy on test set: {acc:.2f}")

# üîü Predict recommendations for all players
df_clean['recommendation'] = model.predict(X)

import re

# üß© Interactive Recommendation Function
def get_fpl_recommendation():
    user_input = input("Enter player name or ID: ").strip().lower()

    # Try to match player by ID or any part of their name
    if user_input.isdigit():
        player = df_clean[df_clean['player_id'] == int(user_input)]
    else:
        # Normalize names like Jo√£o ‚Üí joao for easier matching
        normalized_input = re.sub(r'[^a-z0-9 ]', '', user_input)
        df_clean['normalized_name'] = (
            df_clean['full_name'].str.lower().replace({r'[^a-z0-9 ]': ''}, regex=True)
        )
        df_clean['normalized_webname'] = (
            df_clean['web_name_gw'].astype(str).str.lower().replace({r'[^a-z0-9 ]': ''}, regex=True)
        )
        player = df_clean[
            df_clean['normalized_name'].str.contains(normalized_input, na=False)
            | df_clean['normalized_webname'].str.contains(normalized_input, na=False)
        ]

    if player.empty:
        print("‚ùå Player not found. Try again with different spelling or ID.")
        return

    # Predict using the logistic regression model
    X_player = player[[
        'predicted_next_points', 'now_cost_gw', 'value_for_money',
        'form_gw', 'team_strength_avg', 'opp_difficulty_proxy', 'position'
    ]]
    recommendation = model.predict(X_player)[0]

    # Display clean output
    print("\nüéØ Player Recommendation:")
    print("‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")
    print(f"üßç‚Äç‚ôÇÔ∏è Name: {player['full_name'].iloc[0]}")

    # Handle team name gracefully (avoiding .iloc on strings)
    if 'team_name_final' in player.columns and not player['team_name_final'].isna().all():
        team_name = player['team_name_final'].iloc[0]
    elif 'team_name' in player.columns and not player['team_name'].isna().all():
        team_name = player['team_name'].iloc[0]
    else:
        team_name = "Unknown"

    print(f"üèüÔ∏è Team: {team_name}")
    print(f"üéØ Predicted Next GW Points: {player['predicted_next_points'].iloc[0]:.2f}")
    print(f"üí∞ Cost: ¬£{player['now_cost_gw'].iloc[0]:.1f}m")
    print(f"‚öñÔ∏è Value for Money: {player['value_for_money'].iloc[0]:.2f}")
    print(f"üìä Position: {player['position'].iloc[0]}")
    print(f"üß© Recommendation: {recommendation.upper()}")
    print("‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")


# ‚úÖ Ensure full_name exists before recommendation lookup
if 'full_name' not in df_clean.columns:
    first = df_clean.get('first_name_gw', pd.Series('', index=df_clean.index)).fillna('')
    second = df_clean.get('second_name_gw', pd.Series('', index=df_clean.index)).fillna('')
    df_clean['full_name'] = (first + ' ' + second).str.strip()

# üîÅ Run interactively
get_fpl_recommendation()




‚úÖ Logistic Regression model trained successfully!
üìä Model accuracy on test set: 0.99


Enter player name or ID:  haaland



üéØ Player Recommendation:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üßç‚Äç‚ôÇÔ∏è Name: Erling Haaland
üèüÔ∏è Team: Unknown
üéØ Predicted Next GW Points: 12.29
üí∞ Cost: ¬£14.0m
‚öñÔ∏è Value for Money: 0.88
üìä Position: Forward
üß© Recommendation: START
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


In [23]:
# ===============================================
# üîÅ Interactive Replacement Suggestion Function
# ===============================================

def suggest_replacements_interactive(df=df_clean, top_n=3):
    """
    Ask user for player name and suggest top replacements
    based on predicted points, value for money, and form.
    Ensures no duplicate players appear.
    """
    import re

    user_input = input("Enter player name to find replacements: ").strip().lower()
    normalized_input = re.sub(r'[^a-z0-9 ]', '', user_input)

    # Normalize names for matching
    df['full_name_norm'] = (
        (df['first_name_gw'].astype(str) + ' ' + df['second_name_gw'].astype(str))
        .str.lower().replace({r'[^a-z0-9 ]': ''}, regex=True)
    )
    df['web_name_norm'] = (
        df['web_name_gw'].astype(str).str.lower().replace({r'[^a-z0-9 ]': ''}, regex=True)
    )

    # Find matching player(s)
    target = df[
        df['full_name_norm'].str.contains(normalized_input, na=False)
        | df['web_name_norm'].str.contains(normalized_input, na=False)
    ]

    if target.empty:
        print("‚ùå Player not found. Try again.")
        return

    # Pick first match if multiple
    target = target.iloc[0]
    position = target['position']
    team_name = target.get('team_name_final', target.get('team_name', 'Unknown'))
    print(f"\nüí° Finding replacements for {target['first_name_gw']} {target['second_name_gw']} ({position}) from {team_name}...\n")

    # Filter same-position players
    candidates = df[df['position'] == position].copy()

    # Remove duplicates (keep one row per player)
    candidates = candidates.drop_duplicates(subset=['player_id'])

    # Remove the same player
    candidates = candidates[candidates['player_id'] != target['player_id']]

    # Compute overall score
    candidates['score'] = (
        candidates['predicted_next_points'] * 0.5 +
        candidates['value_for_money'] * 0.3 +
        candidates['form_gw'] * 0.2
    )

    # Get top N
    top_replacements = candidates.sort_values(by='score', ascending=False).head(top_n)

    print("‚ú® Top 3 Recommended Replacements:")
    print("‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")

    for _, row in top_replacements.iterrows():
        print(f"üßç‚Äç‚ôÇÔ∏è {row['first_name_gw']} {row['second_name_gw']} ({row['position']})")
        print(f"üí´ Predicted Pts: {row['predicted_next_points']:.2f}")
        print(f"üí∞ Value for Money: {row['value_for_money']:.2f}")
        print(f"üî• Form: {row['form_gw']:.2f}")
        print("‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ")

# üîπ Run interactively
suggest_replacements_interactive()


Enter player name to find replacements:  haaland



üí° Finding replacements for Erling Haaland (Forward) from Crystal Palace...

‚ú® Top 3 Recommended Replacements:
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üßç‚Äç‚ôÇÔ∏è Richarlison de Andrade (Forward)
üí´ Predicted Pts: 12.91
üí∞ Value for Money: 1.99
üî• Form: 13.00
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üßç‚Äç‚ôÇÔ∏è Chris Wood (Forward)
üí´ Predicted Pts: 12.68
üí∞ Value for Money: 1.69
üî• Form: 13.00
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
üßç‚Äç‚ôÇÔ∏è Hugo Ekitik√© (Forward)
üí´ Predicted Pts: 10.90
üí∞ Value for Money: 1.27
üî• Form: 11.00
‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ


In [24]:
# ===========================================
# üß† AUTO-FIX TEAM IDS & NAMES (robust mapping)
# ===========================================

import numpy as np
import pandas as pd

# ‚úÖ Reference map (official FPL-like ID ‚Üî Name)
team_map_df = pd.DataFrame({
    'team_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
    'team_name': [
        'Arsenal', 'Aston Villa', 'Bournemouth', 'Brentford', 'Brighton',
        'Chelsea', 'Crystal Palace', 'Everton', 'Fulham', 'Ipswich Town',
        'Leicester City', 'Liverpool', 'Manchester City', 'Manchester United',
        'Newcastle United', 'Nottingham Forest', 'Southampton',
        'Tottenham Hotspur', 'West Ham United', 'Wolverhampton Wanderers'
    ]
})

# üß© Create normalized versions for matching
team_map_df['team_name_norm'] = team_map_df['team_name'].str.lower().str.replace(r'[^a-z0-9 ]', '', regex=True)

# üß† Step 1: Try to normalize and map if `team_id` isn‚Äôt 1‚Äì20 yet
if 'team_id' in df_clean.columns:
    # Convert team_id to numeric safely
    df_clean['team_id'] = pd.to_numeric(df_clean['team_id'], errors='coerce')

    # If IDs look invalid (not in 1‚Äì20 range), fallback to team name mapping
    invalid_mask = ~df_clean['team_id'].isin(range(1, 21))
    if invalid_mask.sum() > 0 and 'team_name_final' in df_clean.columns:
        df_clean['team_name_final_norm'] = df_clean['team_name_final'].astype(str).str.lower().str.replace(r'[^a-z0-9 ]', '', regex=True)
        df_clean = df_clean.merge(
            team_map_df[['team_id', 'team_name_norm']],
            left_on='team_name_final_norm',
            right_on='team_name_norm',
            how='left',
            suffixes=('', '_mapped')
        )
        # Fill missing numeric IDs with mapped ones
        df_clean['team_id'] = df_clean['team_id'].fillna(df_clean['team_id_mapped'])
        df_clean.drop(columns=['team_name_norm', 'team_name_final_norm', 'team_id_mapped'], inplace=True, errors='ignore')

# üß† Step 2: Map team names for both home and opponent teams
df_clean['team_name_final'] = df_clean['team_id'].map(dict(zip(team_map_df['team_id'], team_map_df['team_name'])))
if 'opponent_team' in df_clean.columns:
    df_clean['opp_name_final'] = df_clean['opponent_team'].map(dict(zip(team_map_df['team_id'], team_map_df['team_name'])))
else:
    df_clean['opp_name_final'] = np.nan

# üß† Step 3: Fill missing values cleanly
df_clean['team_name_final'] = df_clean['team_name_final'].fillna("Unknown")
df_clean['opp_name_final'] = df_clean['opp_name_final'].fillna("Unknown")




In [25]:
# ===============================================
# üèüÔ∏è Detect Fixture Data + Show Fixtures for Latest Gameweek (Safe version)
# ===============================================

# üß© Check team code ‚Üî name mapping

# Select only relevant columns from your teams dataset
team_mapping = teams[['team_id', 'name', 'short_name']].sort_values('team_id').reset_index(drop=True)

# Display it nicely
print("üìò Team Code ‚Üî Name Mapping:")
display(team_mapping)

def detect_fixtures_df():
    candidates = []
    for name, obj in globals().items():
        if isinstance(obj, pd.DataFrame):
            # Convert all column names to strings safely
            cols = set(map(str.lower, map(str, obj.columns)))
            if {'home_team', 'away_team', 'gameweek'}.issubset(cols):
                candidates.append(name)
    return candidates[0] if candidates else None

# 1Ô∏è‚É£ Detect fixture DataFrame
fixtures_var_name = detect_fixtures_df()

if fixtures_var_name:
    fixtures_df = globals()[fixtures_var_name]
    print(f"‚úÖ Detected fixtures dataframe: {fixtures_var_name}")
else:
    print("‚ö†Ô∏è Could not detect any fixture dataset ‚Äî please load one containing columns ['home_team', 'away_team', 'gameweek']")
    fixtures_df = None

# 2Ô∏è‚É£ Get latest gameweek
if 'gameweek' in df_clean.columns:
    latest_gw = df_clean['gameweek'].max()
    print(f"üìÖ Latest Gameweek in Data: GW{latest_gw}")
else:
    print("‚ö†Ô∏è 'gameweek' column not found in df_clean.")
    latest_gw = None

# 3Ô∏è‚É£ Prepare team map
if 'teams' in globals() and not teams.empty:
    team_map = teams[['team_id', 'name', 'short_name']].rename(columns={'team_id': 'team_code'})
else:
    print("‚ö†Ô∏è 'teams' DataFrame not found.")
    team_map = pd.DataFrame()

# 4Ô∏è‚É£ Show fixtures
if fixtures_df is not None and latest_gw and not team_map.empty:
    gw_fixtures = (
        fixtures_df[fixtures_df['gameweek'] == latest_gw][['home_team', 'away_team']]
        .merge(team_map, left_on='home_team', right_on='team_code', how='left')
        .rename(columns={'name': 'Home Team Name', 'short_name': 'Home Short Name'})
        .merge(team_map, left_on='away_team', right_on='team_code', how='left')
        .rename(columns={'name': 'Away Team Name', 'short_name': 'Away Short Name'})
        [['home_team', 'Home Team Name', 'Home Short Name', 'away_team', 'Away Team Name', 'Away Short Name']]
    )

    print(f"\nüìã Fixtures for Gameweek {latest_gw}:")
    display(gw_fixtures)
else:
    print("‚ö†Ô∏è Could not generate fixtures ‚Äî fixture or team mapping missing.")
for name, obj in globals().items():
    if isinstance(obj, pd.DataFrame):
        print(name, list(obj.columns)[:10])


üìò Team Code ‚Üî Name Mapping:


Unnamed: 0,team_id,name,short_name
0,1,Arsenal,ARS
1,2,Aston Villa,AVL
2,3,Burnley,BUR
3,4,Bournemouth,BOU
4,5,Brentford,BRE
5,6,Brighton,BHA
6,7,Chelsea,CHE
7,8,Crystal Palace,CRY
8,9,Everton,EVE
9,10,Fulham,FUL


‚ö†Ô∏è Could not detect any fixture dataset ‚Äî please load one containing columns ['home_team', 'away_team', 'gameweek']
üìÖ Latest Gameweek in Data: GW11
‚ö†Ô∏è Could not generate fixtures ‚Äî fixture or team mapping missing.
teams ['code', 'team_id', 'name', 'short_name', 'strength', 'strength_overall_home', 'strength_overall_away', 'strength_attack_home', 'strength_attack_away', 'strength_defence_home']
playerstats ['player_id', 'status', 'chance_of_playing_next_round', 'chance_of_playing_this_round', 'now_cost', 'now_cost_rank', 'now_cost_rank_type', 'cost_change_event', 'cost_change_event_fall', 'cost_change_start']
gameweek_summaries ['id', 'name', 'deadline_time', 'average_entry_score', 'finished', 'data_checked', 'highest_scoring_entry', 'deadline_time_epoch', 'highest_score', 'is_previous']
players ['player_code', 'player_id', 'first_name', 'second_name', 'web_name', 'team_code', 'position']
df ['id', 'first_name', 'second_name', 'web_name', 'status', 'news', 'news_added', '

In [27]:
# ======================================================
# üßπ Clean duplicates before analysis
# ======================================================

# Keep only the most recent gameweek entry per player
df_latest = (
    df_clean.sort_values(['player_id', 'gameweek'], ascending=[True, False])
    .drop_duplicates(subset='player_id', keep='first')
)

# Compute value for money again (in case we dropped some columns)
df_latest['value_for_money'] = df_latest['predicted_next_points'] / (df_latest['now_cost_gw'] / 10)

# ======================================================
# 1Ô∏è‚É£ Top 5 Players by Predicted Points per Position
# ======================================================
top5_predicted = (
    df_latest[['player_id', 'full_name', 'position', 'team_name_final', 'predicted_next_points']]
    .dropna(subset=['predicted_next_points'])
    .sort_values(['position', 'predicted_next_points'], ascending=[True, False])
    .groupby('position', group_keys=False)
    .head(5)
)

print("‚ö° Top 5 Players by Predicted Points per Position:")
print(top5_predicted)

# ======================================================
# 2Ô∏è‚É£ Top 5 Players by Value for Money
# ======================================================
top5_vfm = (
    df_latest[['player_id', 'full_name', 'position', 'team_name_final', 'value_for_money']]
    .dropna(subset=['value_for_money'])
    .sort_values(['position', 'value_for_money'], ascending=[True, False])
    .groupby('position', group_keys=False)
    .head(5)
)

print("\nüí∏ Top 5 Players by Value-for-Money per Position:")
print(top5_vfm)

# ======================================================
# 3Ô∏è‚É£ All Teams by Difficulty (Average Opponent Difficulty)
# ======================================================
team_difficulty = (
    df_latest.groupby('team_name_final')['opp_difficulty_proxy']
    .mean()
    .reset_index(name='avg_opponent_difficulty')
    .sort_values('avg_opponent_difficulty', ascending=True)
)

print("\nüß± All Teams by Opponent Difficulty (lower = easier):")
print(team_difficulty)

# ======================================================
# 4Ô∏è‚É£ All Teams by Total Predicted Points
# ======================================================
team_predicted_points = (
    df_latest.groupby('team_name_final')['predicted_next_points']
    .sum()
    .reset_index(name='total_predicted_points')
    .sort_values('total_predicted_points', ascending=False)
)

print("\n‚öΩ All Teams by Total Predicted Points:")
print(team_predicted_points)


‚ö° Top 5 Players by Predicted Points per Position:
       player_id                      full_name    position  \
648            5   Gabriel dos Santos Magalh√£es    Defender   
34308        260                     Marc Gu√©hi    Defender   
33912        257                Maxence Lacroix    Defender   
54240        411                  Nico O'Reilly    Defender   
49356        374                Ibrahima Konat√©    Defender   
56748        430                 Erling Haaland     Forward   
37344        283           Jean-Philippe Mateta     Forward   
32856        249  Jo√£o Pedro Junqueira de Jesus     Forward   
87900        666                Viktor Gy√∂keres     Forward   
23484        178                  Danny Welbeck     Forward   
41436        314                     Bernd Leno  Goalkeeper   
48432        367           Giorgi Mamardashvili  Goalkeeper   
120            1              David Raya Mart√≠n  Goalkeeper   
29028        220           Robert Lynch S√°nchez  Goalkeeper

In [26]:
# Find all unique gameweeks present
unique_gws = sorted(df_clean['gameweek'].dropna().unique())

print(f"üìÖ Gameweeks in dataset: {unique_gws}")

# Find the latest (most recent) gameweek
latest_gw = df_clean['gameweek'].max()
print(f"‚úÖ Latest gameweek in your dataset: GW{int(latest_gw)}")

# If you kept df_latest from the last code:
print(f"üß† Predictions are based on latest available data ‚Äî most likely for GW{int(latest_gw + 1)} (the next gameweek).")


üìÖ Gameweeks in dataset: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10), np.int64(11)]
‚úÖ Latest gameweek in your dataset: GW11
üß† Predictions are based on latest available data ‚Äî most likely for GW12 (the next gameweek).
