## LIBRARY IMPORTS

In [None]:
from main import *
from feature import *
from pre_process_utils import *
import pandas as pd
from pandas import Period
import matplotlib.pyplot as plt
import pickle
import random
import numpy as np
from tqdm import tqdm
from collections import Counter
import chess
from stockfish import Stockfish
import re
from datetime import datetime
import ast
from scipy.stats import linregress
from scipy.stats import norm
from tqdm.autonotebook import tqdm
from collections import defaultdict
from functools import reduce
import chess


## DATASET IMPORTS & PRE PROCESSING

In [2]:
# Load the sampled players, reset to a clean 0..n-1 index and discard the
# 'Unnamed: 0' column that the CSV carries.
df = (
    pd.read_csv('player_sample.csv')
    .reset_index(drop=True)
    .drop(columns=['Unnamed: 0'])
)

In [3]:
# Run every 'monthly_played_games' entry through convert_monthly_games_to_dict
# (project helper brought in by the star imports; presumably turns the
# serialized CSV value into a dict -- TODO confirm against the helper).
df['monthly_played_games'] = df['monthly_played_games'].apply(convert_monthly_games_to_dict)

In [4]:
# Keep only the players that reach the last month of the observation window.
# The filtering rule itself lives in filter_last_month_players (project helper).
df = filter_last_month_players(df) # only player reaching the last month

In [5]:
def _count_active_months(monthly_games):
    """Return the number of entries in a player's monthly-games record.

    String records are evaluated into Python objects first; anything else is
    measured with len() as-is.
    """
    # NOTE(review): eval() executes whatever text is stored in the column.
    # Presumably it is needed because the strings contain Period(...) reprs
    # (hence the `from pandas import Period` import) -- confirm before
    # swapping in a safer parser.
    if isinstance(monthly_games, str):
        monthly_games = eval(monthly_games)
    return len(monthly_games)


df['active_months'] = df['monthly_played_games'].apply(_count_active_months)

In [6]:
# Monthly periods from January 2014 through December 2015 (both inclusive).
final_months = pd.period_range('2014-01', '2015-12', freq='M')

In [7]:
# Flag each player with has_enough_recent_months (project helper) and keep only
# the flagged ones. The flag column is stored on df because a later cell drops
# it by name. Per the column name, the rule is "data in the last 18 months
# counting from the last month" -- the exact rule lives in the helper.
has_recent_history = df['monthly_played_games'].apply(has_enough_recent_months)
df['has_18_recent_months'] = has_recent_history
df = df[has_recent_history]

In [8]:
# Remove the temporary filter flag and correct two misspelled column names
# coming from the source data.
df = (
    df
    .drop(columns='has_18_recent_months')
    .rename(columns={'abolsute_delta_elo': 'absolute_delta_elo', 'played_id': 'player_id'})
)

## FEATURE ENGINEERING

In [9]:
# Summary statistics of the final Elo rating across the sampled players.
# mu and sigma parameterize the normal distribution used for the percentile
# feature; the median is only reported.
mu = df['final_elo_rating'].mean()
media_fina_elo = df['final_elo_rating'].median()
sigma = df['final_elo_rating'].std()
print(f'The final Elo rating median is {media_fina_elo}')
print(f'The final Elo rating average is {mu}')
print(f'The std deviation for the final Elo rating average is {sigma}')

# Offset added to the rating percentile when scaling the growth rate.
k = 0.5
# Number of active months per player (Series aligned with df's index).
months = df['active_months']
# Stockfish binary, relative to the notebook's working directory. This is the
# same location the Stockfish(...) call in this notebook uses; a relative path
# keeps the notebook runnable on machines other than the author's.
stockfish_path = 'stockfish/stockfish-macos-m1-apple-silicon'

The final Elo rating median is 1662.5
The final Elo rating average is 1666.3740920096852
The std deviation for the final Elo rating average is 184.1142247745298


# Adjusted Growth Rate

In [10]:
# Adjusted growth rate.
# growth_rate: total Elo change divided by the number of active months. The
# column is read from df directly so the cell does not depend on an alias
# captured in another cell.
df['growth_rate'] = df['absolute_delta_elo'] / df['active_months']
# percentile: position of the final rating under Normal(mu, sigma), using the
# sample mean/std computed in the setup cell. `norm` is scipy.stats.norm.
df['percentile'] = norm.cdf((df['final_elo_rating'] - mu) / sigma)
# The multiplier (k + percentile) lies in [k, k + 1], so growth achieved at a
# higher final rating is weighted more.
df['adjusted_growth_rate'] = df['growth_rate'] * (k + df['percentile'])

# Flattening the data into Long Format/Panel

In [11]:
def _dict_column_to_records(frame, column, value_name, month_format):
    """Flatten a column of {month: value} dicts into one record per player-month.

    Parameters
    ----------
    frame : DataFrame with a 'player_id' column and the dict-valued `column`.
    column : name of the dict-valued column to flatten.
    value_name : key under which each dict value is stored in the record.
    month_format : strptime format of the dict keys; every month is re-emitted
        as a 'YYYY-MM' string.

    Rows whose entry is not a dict are skipped.
    """
    records = []
    for player_id, monthly_values in zip(frame['player_id'], frame[column]):
        if not isinstance(monthly_values, dict):
            continue
        # The loop variable is deliberately NOT called `k`: that name holds the
        # growth-rate offset defined earlier in the notebook and must survive.
        for month_key, value in monthly_values.items():
            records.append({
                'player_id': player_id,
                'month': pd.to_datetime(month_key, format=month_format).strftime('%Y-%m'),
                value_name: value,
            })
    return records


# Apply safe conversion (only if needed)
df['monthly_played_games'] = df['monthly_played_games'].apply(safe_parse_dict)
df['max_elo_per_month'] = df['max_elo_per_month'].apply(safe_parse_dict)

# 1. Normalize monthly_played_games into long format (keys parsed as MM-YYYY)
games_records = _dict_column_to_records(df, 'monthly_played_games', 'games_played', '%m-%Y')
games_df = pd.DataFrame(games_records)

# 2. Normalize max_elo_per_month into long format (keys parsed as YYYY-MM)
elo_records = _dict_column_to_records(df, 'max_elo_per_month', 'monthly_max_elo', '%Y-%m')
elo_df = pd.DataFrame(elo_records)

# 3. Merge on player_id and month -- only keep rows where both values exist
panel_df = pd.merge(games_df, elo_df, on=['player_id', 'month'], how='inner')

# 4. Sort chronologically ('YYYY-MM' strings sort in calendar order)
panel_df = panel_df.sort_values(by=['player_id', 'month']).reset_index(drop=True)

In [12]:
# Plain Python list with the ids of the players that survived the filters above.
players_list = df['player_id'].to_list()

# Importing relevant Game Info For The Selected Players // Filter by the List of Players // Preprocessing Dates

In [13]:
# Load the full game table. NOTE: this rebinds `games_df`, which earlier held
# the long-format games-per-month frame.
games_df = pd.read_feather('df_final.feather')

# Keep the games in which a sampled player sits on either side of the board.
involves_sampled_player = (
    games_df['White'].isin(players_list) | games_df['Black'].isin(players_list)
)
player_games = games_df.loc[involves_sampled_player].copy()

In [14]:
# Row-wise call of assign_player_id_and_color (project helper), passing
# players_list as its extra argument; the result fills the two new columns.
# Assumes the helper returns a two-element Series/sequence per row -- TODO confirm.
player_games[['player_id', 'player_color']] = player_games.apply(assign_player_id_and_color, axis=1, args=(players_list,))

In [15]:
# Hand the filtered games to preprocess_games (project helper) and keep its result.
player_games = preprocess_games(player_games)

In [16]:
# Derive a monthly Period from the game date; unparseable dates become NaT.
# assign() returns a new frame, so the previous object is left untouched.
game_dates = pd.to_datetime(player_games['UTCDate'], errors='coerce')
player_games = player_games.assign(month_year=game_dates.dt.to_period("M"))

In [17]:
# Monthly n-gram entropy with n = 2, computed by a project helper. The merge
# further down reads the 'player_id', 'month' and 'entropy' columns of the result.
entropy_2gram_df = compute_monthly_ngram_entropy_df(player_games, n_gram_size=2)


In [18]:
# Switch the panel's month key from 'YYYY-MM' strings to monthly Periods so it
# can be joined with the Period-keyed feature tables.
panel_df['month'] = pd.to_datetime(panel_df['month']).dt.to_period("M")

# Attach the 2-gram entropy under its final column name; player-months with no
# entropy value stay in the panel with NaN (left join).
entropy_2gram_cols = (
    entropy_2gram_df[['player_id', 'month', 'entropy']]
    .rename(columns={'entropy': 'entropy_2gram'})
)
panel_df = panel_df.merge(entropy_2gram_cols, on=['player_id', 'month'], how='left')

# Trend Features (slopes, etc.) for Bigram Entropy

In [19]:
# One row of trend features per player for each entropy column listed here.
entropy_features = ["entropy_2gram"]  # or add more like 'entropy_3gram'

trend_rows = []
for player_id, player_panel in panel_df.groupby("player_id"):
    monthly = player_panel.set_index("month")
    row = {"player_id": player_id}
    for feature in entropy_features:
        # Monthly game counts are passed as the weights argument.
        row.update(
            compute_trend_features(monthly[feature], monthly["games_played"], prefix=feature)
        )
    trend_rows.append(row)

trend_df = pd.DataFrame(trend_rows)

# Trend Features (slopes, etc.) for Engagement

In [20]:
# Engagement trend: the monthly game count is both the series being analysed
# and (as an independent copy) its own weights.
engagement_rows = []
for player_id, player_panel in panel_df.groupby("player_id"):
    games_per_month = player_panel.set_index("month")["games_played"]
    row = {"player_id": player_id}
    row.update(
        compute_trend_features(
            games_per_month,
            games_per_month.copy(),
            prefix="engagement",
            include_consistency=True,
        )
    )
    engagement_rows.append(row)

engagement_df = pd.DataFrame(engagement_rows)

In [21]:
# Session-spacing metrics from the full game table, via a project helper.
# Slow: the recorded run shows 826 iterations taking about 12 minutes.
# The result is later used with Series.map keyed by player_id.
spacing_data = compute_monthly_spacing(games_df, players_list)


100%|██████████| 826/826 [12:00<00:00,  1.15it/s]


In [22]:
# Look up each player's spacing entry by player_id; ids missing from
# spacing_data get NaN.
df["monthly_spacing"] = df["player_id"].map(spacing_data)

In [23]:
# Flatten spacing_data with a project helper.
# NOTE(review): panel_spacing_df is not referenced again in the visible part of
# this notebook; the spacing panel merged later is rebuilt by hand from
# df['monthly_spacing'].
panel_spacing_df = flatten_spacing_all_metrics(spacing_data)

In [24]:
# Re-express every game from the sampled player's point of view:
# player_id / player_elo / opponent_elo.

# A set gives fast membership tests in isin().
players_set = set(players_list)

white_mask = games_df['White'].isin(players_set)
black_mask = games_df['Black'].isin(players_set)

# Coerce ratings to numbers up front. Writing the raw Elo values into the
# float columns below is what the warning recorded for this cell points at
# (presumably pandas' incompatible-dtype FutureWarning -- the values look
# non-float); non-numeric entries become NaN.
white_elo = pd.to_numeric(games_df['WhiteElo'], errors='coerce')
black_elo = pd.to_numeric(games_df['BlackElo'], errors='coerce')

# Create empty columns
games_df['player_id'] = None
games_df['player_elo'] = np.nan
games_df['opponent_elo'] = np.nan

# Fill where the sampled player is White
games_df.loc[white_mask, 'player_id'] = games_df.loc[white_mask, 'White']
games_df.loc[white_mask, 'player_elo'] = white_elo[white_mask]
games_df.loc[white_mask, 'opponent_elo'] = black_elo[white_mask]

# Fill where the sampled player is Black. When both sides are sampled players
# this second pass wins, so the game is attributed to the Black player (same
# precedence as before).
games_df.loc[black_mask, 'player_id'] = games_df.loc[black_mask, 'Black']
games_df.loc[black_mask, 'player_elo'] = black_elo[black_mask]
games_df.loc[black_mask, 'opponent_elo'] = white_elo[black_mask]

# Drop games without a sampled player. .copy() makes the result an independent
# frame so the column assignments in later cells do not target a slice.
games_df = games_df.dropna(subset=['player_id']).copy()


  games_df.loc[white_mask, 'player_elo'] = games_df.loc[white_mask, 'WhiteElo']
  games_df.loc[white_mask, 'opponent_elo'] = games_df.loc[white_mask, 'BlackElo']


# Trend Features (slopes, etc.) for Challenge

In [25]:
# Parse game dates (bad values -> NaT) and bucket them into monthly Periods.
games_df['UTCDate'] = pd.to_datetime(games_df['UTCDate'], errors='coerce')
games_df['month'] = games_df['UTCDate'].dt.to_period('M')

# Rating gap from the sampled player's side; non-numeric ratings become NaN.
player_rating = pd.to_numeric(games_df['player_elo'], errors='coerce')
opponent_rating = pd.to_numeric(games_df['opponent_elo'], errors='coerce')
games_df['elo_diff'] = player_rating - opponent_rating


In [27]:
# Map each game's rating gap through elo_win_probability (project helper),
# one value at a time.
games_df['win_prob'] = games_df['elo_diff'].apply(elo_win_probability)


In [29]:
# Label every game with its challenge zone FIRST. The aggregation below groups
# by 'challenge_zone', so this assignment must run before it; previously it
# lived in a later cell and a fresh Restart & Run All failed with a KeyError.
games_df['challenge_zone'] = games_df['win_prob'].apply(categorize_by_win_prob)

# Games per (player, month, zone), pivoted to one column per zone.
zone_counts = (
    games_df
    .groupby(['player_id', 'month', 'challenge_zone'])
    .size()
    .unstack(fill_value=0)
    .reset_index()
)

zone_cols = ['optimal', 'underchallenged', 'overchallenged']
# A zone that never occurs in the data produces no column after unstack();
# add it as all-zero so the normalization below cannot fail on a missing key.
for zone in zone_cols:
    if zone not in zone_counts.columns:
        zone_counts[zone] = 0

# Turn the counts into shares of the player-month's games across the three zones.
zone_counts[zone_cols] = zone_counts[zone_cols].div(
    zone_counts[zone_cols].sum(axis=1), axis=0
)

In [31]:
# Left join: every panel row is kept; player-months without zone data get NaN
# in the zone columns.
panel_df = panel_df.merge(zone_counts, on=["player_id", "month"], how="left")


In [32]:
# Trend of the share of games in the 'optimal' zone, with monthly game counts
# passed as weights.
challenge_rows = []
for player_id, player_panel in panel_df.groupby("player_id"):
    monthly = player_panel.set_index("month")
    row = {"player_id": player_id}
    row.update(
        compute_trend_features(
            monthly["optimal"], monthly["games_played"], prefix="optimal_challenge"
        )
    )
    challenge_rows.append(row)

challenge_df = pd.DataFrame(challenge_rows)

## Computing Tactical Efficiency from Stockfish

In [43]:
# Independent copy of the games whose assigned player_id is one of the sampled players.
is_sampled_player = player_games['player_id'].isin(players_list)
target_games = player_games[is_sampled_player].copy()


In [46]:
# Start the Stockfish engine from the binary inside the project's stockfish/
# folder (path relative to the working directory) and set its search depth to 10.
engine_parameters = {"Threads": 2, "Minimum Thinking Time": 30}
stockfish = Stockfish(
    path="stockfish/stockfish-macos-m1-apple-silicon",
    parameters=engine_parameters,
)
stockfish.set_depth(10)

In [None]:
# Uncomment the line below to re-run the Stockfish efficiency sampling.
#sampled_efficiency = compute_avg_normalized_efficiency_sampled(target_games, sample_size=8)

Sampling per player-month:   0%|          | 0/19473 [00:00<?, ?it/s]

KeyboardInterrupt: 

## Importing Data From Stockfish Prediction rounds until p(win) > .70

In [33]:
# Read the tactical-efficiency table from CSV. The next cell uses its
# 'month_year', 'Unnamed: 0' and 'avg_normalized_efficiency' columns.
tactical_df=pd.read_csv('sampled_efficiency_norm.csv')

In [34]:
# Build the monthly Period key, drop the columns that are no longer needed and
# give the metric its panel name.
tactical_df = (
    tactical_df
    .assign(month=pd.to_datetime(tactical_df["month_year"]).dt.to_period("M"))
    .drop(columns=["month_year", "Unnamed: 0"])
    .rename(columns={"avg_normalized_efficiency": "tactical_efficiency"})
)

# Merge on player_id and month; panel rows without a tactical value keep NaN.
panel_df = panel_df.merge(tactical_df, on=["player_id", "month"], how="left")


In [35]:
# Trend of monthly tactical efficiency, with monthly game counts passed as weights.
tactical_rows = []
for player_id, player_panel in panel_df.groupby("player_id"):
    monthly = player_panel.set_index("month")
    row = {"player_id": player_id}
    row.update(
        compute_trend_features(
            monthly["tactical_efficiency"], monthly["games_played"], prefix="tactical"
        )
    )
    tactical_rows.append(row)

tactical_trend_df = pd.DataFrame(tactical_rows)

In [36]:
# Flatten df['monthly_spacing'] ({month: {metric: value}} per player) into one
# row per player-month. Players whose entry is not a dict are skipped.
spacing_rows = []
for player_id, spacing_dict in zip(df["player_id"], df["monthly_spacing"]):
    if isinstance(spacing_dict, dict):
        for month, metrics in spacing_dict.items():
            flat_row = {"player_id": player_id, "month": month}
            flat_row.update(metrics)
            spacing_rows.append(flat_row)

spacing_df = pd.DataFrame(spacing_rows)

# Adjust date format: 'YYYY-MM' values -> monthly Periods, matching panel_df.
parsed_months = pd.to_datetime(spacing_df["month"], format="%Y-%m")
spacing_df["month"] = parsed_months.dt.to_period("M")


In [37]:
# Left join of the spacing metrics: every panel row is kept; player-months
# without spacing data get NaN.
panel_df = panel_df.merge(spacing_df, on=["player_id", "month"], how="left")


In [38]:
# Persist the panel. With to_csv defaults the row index is written as an
# unnamed first column.
panel_df.to_csv('panel_data.csv')

In [39]:
# Trend features for every spacing metric, one row per player.
spacing_features = [
    "weighted_mean_days_between_sessions",
    "std_days_between_sessions",
    "max_days_between_sessions",
    "percent_sessions_within_2_days",
    "num_sessions_last_14_days",
]

spacing_trend_rows = []
for player_id, player_spacing in spacing_df.groupby("player_id"):
    monthly = player_spacing.set_index("month")
    # Game counts are used as weights only when spacing_df carries that
    # column; otherwise None is passed to compute_trend_features.
    weights = monthly["games_played"] if "games_played" in player_spacing else None

    row = {"player_id": player_id}
    for feature in spacing_features:
        row.update(
            compute_trend_features(monthly[feature], weights, prefix=f"spacing_{feature}")
        )
    spacing_trend_rows.append(row)

spacing_trend_df = pd.DataFrame(spacing_trend_rows)

In [None]:
def _outer_join_on_player(left, right):
    """Outer-join two per-player feature tables on 'player_id'."""
    return pd.merge(left, right, on="player_id", how="outer")


# Fold all per-player trend tables into one wide frame, left to right.
# NOTE: `trend_df` is rebound to the combined result, so re-running this cell
# without re-running the entropy-trend cell merges the combined frame again.
dfs = [trend_df, engagement_df, spacing_trend_df, tactical_trend_df, challenge_df]
trend_df = reduce(_outer_join_on_player, dfs)


In [41]:
# Persist the combined per-player trend features (row index included, per
# to_csv defaults).
trend_df.to_csv('trend_df.csv')

In [42]:
# Persist the player-level table with the columns added in this notebook (row
# index included, per to_csv defaults).
df.to_csv('sampled_players_df_2.csv')