In [129]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt

In [130]:
years = range(2018, 2024)
dfs = []

base_path = '../../data/clean/complex/final/player_data_{}.csv'
for year in years:
    df = pd.read_csv(base_path.format(year))
    df['Year'] = year
    df['Score'] = 101 - df['Rank']  # Higher score = better player
    dfs.append(df)

train_df = pd.concat(dfs, ignore_index=True)
print(f"Combined training shape: {train_df.shape}")
train_df.head()


Combined training shape: (140, 80)


Unnamed: 0,Player,adr,assists__per__round,b_adr,b_assists__per__round,b_damage__per__round,b_deaths__per__round,b_dpr,b_grenade_dmg__per__round,b_headshot_%,...,vs_top20,vs_top30,vs_top5,MVPs,EVPs,Rank,Year,Score,b_rating_1.0,m_rating_1.0
0,s1mple,87.2,0.1,87.4,0.11,87.4,0.59,0.59,2.7,41.7,...,1.34,1.34,1.35,6,4,1,2018,100,,
1,device,81.4,0.12,80.8,0.12,80.8,0.59,0.59,5.1,32.3,...,1.24,1.24,1.23,7,1,2,2018,99,,
2,NiKo,84.8,0.13,86.6,0.13,86.6,0.63,0.63,3.6,48.3,...,1.2,1.21,1.21,2,7,3,2018,98,,
3,electronic,84.5,0.14,82.2,0.13,82.2,0.63,0.63,4.6,48.7,...,1.16,1.18,1.2,0,7,4,2018,97,,
4,dupreeh,79.8,0.14,78.7,0.14,78.7,0.63,0.63,3.2,49.7,...,1.17,1.16,1.17,1,8,5,2018,96,,


In [131]:
# Load 2024 test data
test_df = pd.read_csv('../../data/clean/complex/final/player_data_2024.csv')
test_df['Year'] = 2024
test_df['Score'] = 0  # Placeholder


In [132]:
# Cell 3 - Normalize and align rating columns
def unify_rating_columns(df):
    if 'rating_2.1' in df.columns:
        df['rating'] = df['rating_2.1']
    elif 'rating_2.0' in df.columns:
        df['rating'] = df['rating_2.0']
    elif 'rating_1.0' in df.columns:
        df['rating'] = df['rating_1.0']
    else:
        df['rating'] = np.nan
    for col in ['rating_2.1', 'rating_2.0', 'rating_1.0']:
        if col in df.columns:
            df.drop(columns=col, inplace=True)
    return df

train_df = unify_rating_columns(train_df)
test_df = unify_rating_columns(test_df)

id_cols = ['Player', 'HLTV_ID', 'Rank', 'Year']
feature_cols = [col for col in train_df.columns if col not in id_cols + ['Score']]
shared_cols = list(set(feature_cols) & set(test_df.columns))

train_df[shared_cols] = train_df[shared_cols].replace(-1.0, np.nan).fillna(train_df[shared_cols].mean())
test_df[shared_cols] = test_df[shared_cols].replace(-1.0, np.nan).fillna(train_df[shared_cols].mean())


In [None]:
# Preprocess

# Cell 4 - Scaling and SelectKBest
scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[shared_cols])
y_train = train_df['Score'].values
X_test = scaler.transform(test_df[shared_cols])

# Select top 50 correlated features
selector = SelectKBest(score_func=f_regression, k=50)
X_train_selected = selector.fit_transform(X_train, y_train)
X_test_selected = selector.transform(X_test)

selected_feature_names = [shared_cols[i] for i in selector.get_support(indices=True)]
print(f"Top features: {selected_feature_names}")



Top features: ['b_vs_top30', 'k_per_d_ratio', 'MVPs', 'b_dpr', 'b_damage__per__round', 'kast', 'b_kast', 'b_vs_top10', 'vs_top30', 'rating', 'b_adr', 'b_rating', 'vs_top10', 'EVPs', 'b_kpr', 'impact', 'b_vs_top20', 'dpr', 'deaths__per__round', 'damage__per__round', 'm_damage__per__round', 'b_vs_top5', 'total_kills', 'b_impact', 'kills__per__round', 'b_k_per_d_ratio', 'vs_top20', 'vs_top5', 'adr', 'kpr']


In [134]:
# Cell 5 - Grid search for best hyperparameters
param_grid = {
    'C': [0.1, 1, 10],
    'epsilon': [0.01, 0.1, 0.5],
    'kernel': ['rbf', 'poly'],
    'degree': [2, 3],  # Only used for 'poly' kernel
    'gamma': ['scale']
}

svr = SVR()
grid = GridSearchCV(svr, param_grid, scoring='neg_mean_squared_error', cv=5, verbose=2, n_jobs=-1)
grid.fit(X_train_selected, y_train)

print("🔍 Best params:", grid.best_params_)
print("📉 Best CV MSE:", -grid.best_score_)


Fitting 5 folds for each of 36 candidates, totalling 180 fits
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=poly; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=poly; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=poly; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=poly; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.01, gamma=scale, kernel=poly; total time=   0.0s
[CV] END C=0.1, degree=2, epsilon=0.1, gamma=scale, kernel=rbf; total time=   0.0s
[CV] END C

In [135]:
# Cell 6 - Final training using best model
best_model = grid.best_estimator_
best_model.fit(X_train_selected, y_train)


Training Loop

In [136]:
# Cell 7 - Predict and rank 2024
preds_2024 = best_model.predict(X_test_selected)
test_df['PredictedScore'] = preds_2024
top_20_2024 = test_df.sort_values(by='PredictedScore', ascending=False).head(20)

print("\n🏆 Top 20 Predicted Players for 2024:")
print(top_20_2024[['Player', 'PredictedScore']])



🏆 Top 20 Predicted Players for 2024:
       Player  PredictedScore
1      m0NESY      101.047493
2       ZywOo       97.129423
3        NiKo       94.322286
14   XANTARES       94.150569
0        donk       93.356889
4          jL       91.482947
5       sh1ro       91.397976
6      flameZ       90.666063
11    malbsMd       90.378172
18      EliGE       90.326273
7       broky       87.259314
8         b1t       86.044320
9      frozen       85.991624
12      Spinx       85.786595
13    xertioN       84.915578
16   Jimpphat       84.739577
20      NertZ       84.288081
10  w0nderful       82.866296
24     zont1x       82.708681
27      Senzu       82.518967


In [137]:
# Load actual HLTV 2024 rankings
actual_df = pd.read_csv('../../rankings/ranking_2024.csv')

# Normalize nicknames in both DataFrames (for easier comparison)
def normalize(name):
    return name.strip().lower().replace("⁠", "").replace("’", "'").replace("`", "'")

actual_df['Nickname'] = actual_df['Nickname'].apply(normalize)
top_20_2024['Player'] = top_20_2024['Player'].apply(normalize)

# Map: nickname -> actual rank
actual_ranks = {row['Nickname']: row['Rank'] for _, row in actual_df.iterrows()}

# Evaluation function
def score_ranking(pred_df, actual_rank_dict):
    score = 0
    graded = []

    for pred_rank, row in enumerate(pred_df['Player'].values, 1):
        actual_rank = actual_rank_dict.get(row)

        if actual_rank:
            diff = abs(actual_rank - pred_rank)
            if diff == 0:
                pts = 5
            elif diff == 1:
                pts = 4
            elif diff == 2:
                pts = 3
            elif diff == 3:
                pts = 2
            elif diff <= 5:
                pts = 1
            else:
                pts = 0
        else:
            pts = 0

        graded.append((pred_rank, row, actual_rank, pts))
        score += pts

    return score, graded


In [138]:
# Run scoring
total_score, breakdown = score_ranking(top_20_2024, actual_ranks)

# Display summary
print(f"🏆 Total Ranking Score: {total_score}/100\n")
print("🔍 Breakdown:")
for pred_rank, nickname, actual_rank, pts in breakdown:
    print(f"Predicted #{pred_rank:>2}: {nickname:<15} | Actual: {actual_rank if actual_rank else 'N/A':<2} | +{pts} pts")

🏆 Total Ranking Score: 46/100

🔍 Breakdown:
Predicted # 1: m0nesy          | Actual: 2  | +4 pts
Predicted # 2: zywoo           | Actual: 3  | +4 pts
Predicted # 3: niko            | Actual: 4  | +4 pts
Predicted # 4: xantares        | Actual: 15 | +0 pts
Predicted # 5: donk            | Actual: 1  | +1 pts
Predicted # 6: jl              | Actual: 5  | +4 pts
Predicted # 7: sh1ro           | Actual: 6  | +4 pts
Predicted # 8: flamez          | Actual: 7  | +4 pts
Predicted # 9: malbsmd         | Actual: 12 | +2 pts
Predicted #10: elige           | Actual: 19 | +0 pts
Predicted #11: broky           | Actual: 8  | +2 pts
Predicted #12: b1t             | Actual: 9  | +2 pts
Predicted #13: frozen          | Actual: 10 | +2 pts
Predicted #14: spinx           | Actual: 13 | +4 pts
Predicted #15: xertion         | Actual: 14 | +4 pts
Predicted #16: jimpphat        | Actual: 17 | +4 pts
Predicted #17: nertz           | Actual: 21 | +1 pts
Predicted #18: w0nderful       | Actual: 11 | +0 pts
Pr