In [45]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error


In [46]:
# Cell 2 - TreeNode definition for regression
class TreeNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, *, value=None):
        self.feature_index = feature_index  # which feature to split on
        self.threshold = threshold  # value to compare against
        self.left = left  # left subtree
        self.right = right  # right subtree
        self.value = value  # predicted value if leaf

    def is_leaf(self):
        return self.value is not None


In [47]:
# Cell 3 - Enhanced variance reduction with midpoint splits + gain
def variance_mse(y):
    return np.var(y) * len(y)

def best_split(X, y):
    m, n = X.shape
    best_feat, best_thresh, best_gain = None, None, -np.inf
    base_mse = variance_mse(y)

    for feature_index in range(n):
        values = np.sort(np.unique(X[:, feature_index]))
        thresholds = (values[:-1] + values[1:]) / 2  # Use midpoints

        for threshold in thresholds:
            left_idx = X[:, feature_index] <= threshold
            right_idx = ~left_idx

            if len(y[left_idx]) < 2 or len(y[right_idx]) < 2:
                continue

            left_mse = variance_mse(y[left_idx])
            right_mse = variance_mse(y[right_idx])
            weighted_mse = (left_mse + right_mse) / m
            gain = base_mse - weighted_mse

            if gain > best_gain:
                best_gain = gain
                best_feat = feature_index
                best_thresh = threshold

    return best_feat, best_thresh


In [48]:
# Cell 4 - TreeRegressor class with build logic
class DecisionTreeRegressor:
    def __init__(self, max_depth=12, min_samples_split=3):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        if depth >= self.max_depth or num_samples < self.min_samples_split:
            noise = np.random.normal(0, 0.005)
            return TreeNode(value=np.mean(y) + noise)

        feat_idx, threshold = best_split(X, y)
        if feat_idx is None:
            return TreeNode(value=np.mean(y))

        left_idx = X[:, feat_idx] <= threshold
        right_idx = ~left_idx

        left = self._build_tree(X[left_idx], y[left_idx], depth + 1)
        right = self._build_tree(X[right_idx], y[right_idx], depth + 1)

        return TreeNode(feature_index=feat_idx, threshold=threshold, left=left, right=right)

    def predict_one(self, x, node):
        if node.is_leaf():
            return node.value
        if x[node.feature_index] <= node.threshold:
            return self.predict_one(x, node.left)
        else:
            return self.predict_one(x, node.right)

    def predict(self, X):
        return np.array([self.predict_one(row, self.root) for row in X])


In [49]:
# Cell 5 - Load training data (2018-2023), test on 2024
years = range(2018, 2024)
dfs = []
path = '../../data/clean/complex/final/player_data_{}.csv'
for year in years:
    df = pd.read_csv(path.format(year))
    df['Year'] = year
    df['Score'] = 101 - df['Rank']
    dfs.append(df)
train_df = pd.concat(dfs, ignore_index=True)

test_df = pd.read_csv('../../data/clean/complex/final/player_data_2024.csv')
test_df['Year'] = 2024
test_df['Score'] = 0

# Rating fix
def unify_rating_columns(df):
    if 'rating_2.1' in df.columns:
        df['rating'] = df['rating_2.1']
    elif 'rating_2.0' in df.columns:
        df['rating'] = df['rating_2.0']
    elif 'rating_1.0' in df.columns:
        df['rating'] = df['rating_1.0']
    else:
        df['rating'] = np.nan
    for col in ['rating_2.1', 'rating_2.0', 'rating_1.0']:
        if col in df.columns:
            df.drop(columns=col, inplace=True)
    return df

train_df = unify_rating_columns(train_df)
test_df = unify_rating_columns(test_df)

id_cols = ['Player', 'HLTV_ID', 'Rank', 'Year']
feature_cols = [col for col in train_df.columns if col not in id_cols + ['Score']]
shared_cols = list(set(feature_cols) & set(test_df.columns))

train_df[shared_cols] = train_df[shared_cols].replace(-1.0, np.nan).fillna(train_df[shared_cols].mean())
test_df[shared_cols] = test_df[shared_cols].replace(-1.0, np.nan).fillna(train_df[shared_cols].mean())

scaler = StandardScaler()
X_train = scaler.fit_transform(train_df[shared_cols])
y_train = train_df['Score'].values
X_test = scaler.transform(test_df[shared_cols])


In [50]:
# Cell 6 - Cross-validation to evaluate custom tree
kf = KFold(n_splits=8, shuffle=True, random_state=42)
mse_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_train)):
    X_tr, X_val = X_train[train_idx], X_train[val_idx]
    y_tr, y_val = y_train[train_idx], y_train[val_idx]

    model = DecisionTreeRegressor(max_depth=8, min_samples_split=10)
    model.fit(X_tr, y_tr)
    preds = model.predict(X_val)

    mse = mean_squared_error(y_val, preds)
    mse_scores.append(mse)
    print(f"Fold {fold+1}: MSE = {mse:.4f}")

print(f"Average CV MSE: {np.mean(mse_scores):.4f}")


Fold 1: MSE = 24.7680
Fold 2: MSE = 42.4748
Fold 3: MSE = 32.3203
Fold 4: MSE = 25.0050
Fold 5: MSE = 24.1403
Fold 6: MSE = 30.4860
Fold 7: MSE = 69.0568
Fold 8: MSE = 51.5854
Average CV MSE: 37.4796


In [51]:
# Cell 7 - Train on all data and predict 2024
final_model = DecisionTreeRegressor(max_depth=12, min_samples_split=8)
final_model.fit(X_train, y_train)
preds_2024 = final_model.predict(X_test)

test_df['PredictedScore'] = preds_2024
top_20 = test_df.sort_values(by='PredictedScore', ascending=False).head(20)
top_20[['Player', 'PredictedScore']]


Unnamed: 0,Player,PredictedScore
0,donk,100.003545
2,ZywOo,99.251324
1,m0NESY,99.251324
3,NiKo,97.003255
14,XANTARES,97.003255
11,malbsMd,94.288371
5,sh1ro,91.848263
20,NertZ,90.8346
7,broky,88.99936
10,w0nderful,87.498411


In [52]:
# Load actual HLTV 2024 rankings
actual_df = pd.read_csv('../../rankings/ranking_2024.csv')

# Normalize nicknames in both DataFrames (for easier comparison)
def normalize(name):
    return name.strip().lower().replace("⁠", "").replace("’", "'").replace("`", "'")

actual_df['Nickname'] = actual_df['Nickname'].apply(normalize)
top_20['Player'] = top_20['Player'].apply(normalize)

# Map: nickname -> actual rank
actual_ranks = {row['Nickname']: row['Rank'] for _, row in actual_df.iterrows()}

# Evaluation function
def score_ranking(pred_df, actual_rank_dict):
    score = 0
    graded = []

    for pred_rank, row in enumerate(pred_df['Player'].values, 1):
        actual_rank = actual_rank_dict.get(row)

        if actual_rank:
            diff = abs(actual_rank - pred_rank)
            if diff == 0:
                pts = 5
            elif diff == 1:
                pts = 4
            elif diff == 2:
                pts = 3
            elif diff == 3:
                pts = 2
            elif diff <= 5:
                pts = 1
            else:
                pts = 0
        else:
            pts = 0

        graded.append((pred_rank, row, actual_rank, pts))
        score += pts

    return score, graded


In [53]:
# Run scoring
total_score, breakdown = score_ranking(top_20, actual_ranks)

# Display summary
print(f"🏆 Total Ranking Score: {total_score}/100\n")
print("🔍 Breakdown:")
for pred_rank, nickname, actual_rank, pts in breakdown:
    print(f"Predicted #{pred_rank:>2}: {nickname:<15} | Actual: {actual_rank if actual_rank else 'N/A':<2} | +{pts} pts")

🏆 Total Ranking Score: 44/100

🔍 Breakdown:
Predicted # 1: donk            | Actual: 1  | +5 pts
Predicted # 2: zywoo           | Actual: 3  | +4 pts
Predicted # 3: m0nesy          | Actual: 2  | +4 pts
Predicted # 4: niko            | Actual: 4  | +5 pts
Predicted # 5: xantares        | Actual: 15 | +0 pts
Predicted # 6: malbsmd         | Actual: 12 | +0 pts
Predicted # 7: sh1ro           | Actual: 6  | +4 pts
Predicted # 8: nertz           | Actual: 21 | +0 pts
Predicted # 9: broky           | Actual: 8  | +4 pts
Predicted #10: w0nderful       | Actual: 11 | +4 pts
Predicted #11: b1t             | Actual: 9  | +3 pts
Predicted #12: jl              | Actual: 5  | +0 pts
Predicted #13: flamez          | Actual: 7  | +0 pts
Predicted #14: elige           | Actual: 19 | +1 pts
Predicted #15: frozen          | Actual: 10 | +1 pts
Predicted #16: ropz            | Actual: 18 | +3 pts
Predicted #17: brollan         | Actual: 22 | +1 pts
Predicted #18: im              | Actual: 16 | +3 pts
Pr