# <span style="font-weight:bold; font-size: 3rem; color:#333;">Training Pipeline</span>

## 🗒️ This notebook is divided into the following sections:

1. Select features for the model and create a Feature View with the selected features
2. Create training data using the feature view
3. Train model
4. Evaluate model performance
5. Save model to model registry

### <span style='color:#ff5f27'> 📝 Imports </span>

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import os
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from xgboost import XGBClassifier, plot_importance
import xgboost as xgb
from util.helper import * 
import util.helper as helpfun
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

## <span style="color:#ff5f27;"> Prepare training features</span>

In [2]:
# All model/data paths in the cells below are joined onto the process's working directory.
current_dir = os.getcwd()
print("Current directory: " + current_dir)

Current directory: C:\Users\marth\OneDrive - KTH\[Y1] Period 2\ID2223 Scalable Machine Learning and Deep Learning\Projects\Final Pj\ID2223-Final-Project


In [3]:
# Locations of the saved XGBoost model (JSON) and the label encoder, both under <cwd>/model.
model_path = os.path.join(current_dir, 'model', 'champion_predictor.json')
label_encoder_path = os.path.join(current_dir, 'model', 'label_encoder.joblib')

# Load the label encoder first: the length of its classes_ gives the number of classes
# the classifier is configured with below.
# NOTE(security): joblib.load is pickle-based and can execute arbitrary code while loading;
# only load artifacts produced by a trusted source.
label_encoder = joblib.load(label_encoder_path)
n_classes = len(label_encoder.classes_)

# Build an unfitted sklearn-style wrapper configured for multi-class probability output.
# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases — confirm it is
# still accepted by the installed version.
model = xgb.XGBClassifier(
    use_label_encoder=False,
    objective='multi:softprob',
    num_class=n_classes
)

# Attach a Booster loaded from disk directly to the wrapper.
# NOTE(review): `_Booster` is a private attribute of the wrapper; the public
# `XGBClassifier.load_model(model_path)` may be the supported way to do this — confirm
# against the installed xgboost version before changing.
model._Booster = xgb.Booster()
model._Booster.load_model(model_path)

# The wrapper is never fitted here, so n_classes_ is assigned by hand.
# Presumably predict_proba relies on it — TODO confirm.
model.n_classes_ = n_classes

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [4]:
# Read util/data/meta_stats.csv (one row per champion/role with tier, win/pick/ban rates and
# counters, per the printed frame) and print it for inspection.
meta_stats_path =  os.path.join(current_dir, 'util', 'data', 'meta_stats.csv')
meta_stats = pd.read_csv(meta_stats_path)
print(meta_stats)

     rank    champion  tier     role  win_rate  pick_rate  ban_rate  \
0       1       Riven     1      top    0.5279     0.0502    0.0300   
1       2      Irelia     1      top    0.5177     0.0610    0.1392   
2       3      Darius     1      top    0.5148     0.0763    0.1494   
3       4  Cassiopeia     1      top    0.5511     0.0118    0.0119   
4       5      Aatrox     2      top    0.5011     0.0837    0.0939   
..    ...         ...   ...      ...       ...        ...       ...   
230    40     LeBlanc     5  support    0.4784     0.0070    0.0240   
231    41        Shen     5  support    0.4901     0.0050    0.0006   
232    42     Camille     5  support    0.4799     0.0056    0.0026   
233    43        Hwei     5  support    0.4720     0.0077    0.0067   
234    44       Sylas     5  support    0.4609     0.0118    0.0102   

         counter1    counter2 counter3  
0      Cassiopeia       Urgot     Olaf  
1            Olaf       Riven   Wukong  
2            Udyr       

In [5]:
# Read util/data/weekly_meta_stats.csv (per-champion games, KDA, WR, pick/ban, cs and gold,
# per the printed frame) and print it for inspection.
weekly_meta_stats_path =  os.path.join(current_dir, 'util', 'data', 'weekly_meta_stats.csv')
weekly_meta_stats = pd.read_csv(weekly_meta_stats_path)
print(weekly_meta_stats)

     rank champion  games   KDA     WR   pick    ban      cs   gold
0       1    Viego    602  3.22  52.66  27.44  21.73  178.51  11737
1       2   Viktor    581  2.52  51.81  26.48  36.53  206.93  11327
2       3    Corki    493  2.65  52.13  22.47  13.07  215.03  11859
3       4    Jayce    484  2.13  51.03  22.06   6.11  189.65  10715
4       5   Ezreal    404  2.60  46.29  18.41   7.74  199.91  11054
..    ...      ...    ...   ...    ...    ...    ...     ...    ...
164   165    Nasus     14  1.80   0.50   0.64   1.96  172.43  10215
165   166  Naafiri     12  1.84  33.33   0.55   0.26  176.83  10013
166   167    Quinn     11  1.45  36.36   0.51   0.77  148.82  10563
167   168   Illaoi     11  1.46  27.27   0.05   0.13  225.18  12160
168   169   Yorick      9  1.02  44.44   0.41   0.53  178.00   9785

[169 rows x 9 columns]


In [6]:
# Read util/data/feature_eng_stats.csv; this raw frame is the input to convert_df and
# apply_feature_engineering in the following cells.
player_data_path = os.path.join(current_dir, 'util', 'data', 'feature_eng_stats.csv')
player_data = pd.read_csv(player_data_path)

In [7]:
# Convert the raw player frame with convert_df (not defined in this notebook; presumably
# provided by the `util.helper` star import). Its log output lists the steps it applies:
# dropping rows with NA champions, then the convert_* and remove_match_stats steps.
pdata_df = convert_df(player_data)

Dropped 0 rows with NA champion values
Applying convert_team_colors...
Applying convert_region...
Applying convert_champion_columns...
Applying convert_date_column...
Applying convert_role_columns...
Applying convert_id_columns...
Applying remove_match_stats...


In [8]:
# Run the feature-engineering steps (see the log output for the list of steps applied).
# NOTE(review): this re-binds pdata_df to its own transformed version, so re-running this
# cell on its own feeds already-engineered data back in — re-run from the convert_df cell.
pdata_df = apply_feature_engineering(pdata_df)

Applying calculate_champ_variety_score...
Applying calculate_playstyle...
Applying get_most_role_3...
Applying calculate_role_specialization...
Applying calculate_champion_loyalty...
Applying <lambda>...
Applying remove_unwanted_columns...
Removed 306 columns
Remaining columns: 39
Applying optimize_feature_dtypes...


In [9]:
# Sanity check before prediction: the rendered output reports each column's dtype and
# number of unique values.
check_datatypes(pdata_df)

                               dtype  unique_values
champion                       UInt8             37
region                      category              4
team_champ1                    UInt8             49
team_champ2                    UInt8             53
team_champ3                    UInt8             43
team_champ4                    UInt8             40
opp_champ1                     UInt8             41
opp_champ2                     UInt8             33
opp_champ3                     UInt8             41
opp_champ4                     UInt8             31
opp_champ5                     UInt8             34
avg_kills                    float32              5
avg_deaths                   float32              5
avg_assists                  float32              5
kda_ratio_profile            float32              5
kill_participation_profile   float32              5
most_champ_1                   UInt8              5
most_champ_2                   UInt8              5
most_role_1 

Unnamed: 0,dtype,unique_values
champion,UInt8,37
region,category,4
team_champ1,UInt8,49
team_champ2,UInt8,53
team_champ3,UInt8,43
team_champ4,UInt8,40
opp_champ1,UInt8,41
opp_champ2,UInt8,33
opp_champ3,UInt8,41
opp_champ4,UInt8,31


In [10]:
# Ground truth: the numeric champion id of each row, later passed as y_true.
label_column = pdata_df['champion']

In [11]:
# Model input: every column except the target ('champion') and 'region'.
# NOTE(review): 'region' is dropped as well — presumably the saved model was trained
# without it; confirm against the training code.
predict_column = pdata_df.drop(columns=['champion', 'region'])

In [12]:
def predict_top_5_with_confidence(model, X, y_true, label_encoder, champion_converter, top_k=5):
    """
    Predict the most likely champions with confidence scores and compare with the true label.

    Parameters
    ----------
    model : object
        Classifier exposing ``predict_proba(X)`` that returns a 2-D
        ``(n_samples, n_classes)`` array of probabilities.
    X : array-like or DataFrame
        Feature rows; passed unchanged to ``model.predict_proba``.
    y_true : iterable
        Numeric champion id of each row, in the same order as ``X``.
    label_encoder : object
        Provides ``classes_``; ``classes_[j]`` is the champion id that probability
        column ``j`` refers to.
    champion_converter : object
        Provides ``num_to_champion(int)`` mapping a champion id to its name.
    top_k : int, default 5
        Number of ranked predictions to report. It is clamped to the number of classes,
        so a model with fewer classes yields fewer ``Rank_*`` columns instead of failing.

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``True_Champion``, then
        ``Rank_i_Champion`` / ``Rank_i_Confidence`` (confidence rounded to 4 decimals)
        for each reported rank, and ``Prediction_Rank`` holding ``'Rank_i'`` for the
        first rank whose champion equals the true one, or ``'Not in Top <top_k>'``.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1, if ``predict_proba`` does not return a 2-D
        array, or if ``y_true`` and the predictions differ in length.
    """
    if top_k < 1:
        raise ValueError(f"top_k must be at least 1, got {top_k}")

    # Get probabilities
    proba = np.asarray(model.predict_proba(X))
    if proba.ndim != 2:
        raise ValueError(
            f"predict_proba must return a 2-D array, got shape {proba.shape}"
        )
    n_samples, n_classes = proba.shape

    # Never ask for more ranks than there are classes (the original hard-coded 5
    # raised IndexError for models with fewer than five classes).
    k = min(top_k, n_classes)

    # Indices of the k largest probabilities per row, best first, and their values
    top_idx = np.argsort(proba, axis=1)[:, -k:][:, ::-1]
    top_proba = np.take_along_axis(proba, top_idx, axis=1)

    # True champion - convert numeric label to champion name
    true_names = [champion_converter.num_to_champion(int(num)) for num in y_true]
    if len(true_names) != n_samples:
        raise ValueError(
            f"y_true has {len(true_names)} labels but the model returned "
            f"{n_samples} predictions"
        )

    # Each class index is translated to a name at most once; only classes that
    # actually appear in the top-k are ever converted.
    name_cache = {}

    def class_name(class_idx):
        if class_idx not in name_cache:
            champion_id = int(label_encoder.classes_[class_idx])
            name_cache[class_idx] = champion_converter.num_to_champion(champion_id)
        return name_cache[class_idx]

    # Collect all columns first and build the DataFrame once
    columns = {'True_Champion': true_names}
    names_by_rank = []
    for rank in range(k):
        names = [class_name(int(class_idx)) for class_idx in top_idx[:, rank]]
        names_by_rank.append(names)
        columns[f'Rank_{rank + 1}_Champion'] = names
        columns[f'Rank_{rank + 1}_Confidence'] = top_proba[:, rank].round(4)

    # Find which rank the true champion appeared in (if any); first match wins
    not_found = f'Not in Top {top_k}'
    prediction_rank = []
    for row, true_name in enumerate(true_names):
        label = not_found
        for rank in range(k):
            if names_by_rank[rank][row] == true_name:
                label = f'Rank_{rank + 1}'
                break
        prediction_rank.append(label)
    columns['Prediction_Rank'] = prediction_rank

    return pd.DataFrame(columns)

In [13]:
# One converter instance is enough; it is only used to turn champion ids into names.
champion_converter = ChampionConverter()

# Score every feature row against the loaded model and keep the ranked predictions.
results = predict_top_5_with_confidence(
    model, predict_column, label_column, label_encoder, champion_converter
)

In [14]:
# Print results in a nice format
print("\nPrediction Results:")
print("-" * 100)
print(results.to_string(index=False))
print("-" * 100)

# Print summary statistics
rank_distribution = results['Prediction_Rank'].value_counts().sort_index()
print("\nPrediction Rank Distribution:")
for rank, count in rank_distribution.items():
    print(f"{rank}: {count} ({count/len(results):.1%})")

# Calculate accuracy metrics
in_top_1 = (results['Prediction_Rank'] == 'Rank_1').mean()
in_top_3 = results['Prediction_Rank'].isin(['Rank_1', 'Rank_2', 'Rank_3']).mean()
in_top_5 = results['Prediction_Rank'].isin(['Rank_1', 'Rank_2', 'Rank_3', 'Rank_4', 'Rank_5']).mean()

print(f"\nAccuracy Metrics:")
print(f"Top 1 Accuracy: {in_top_1:.2%}")
print(f"Top 3 Accuracy: {in_top_3:.2%}")
print(f"Top 5 Accuracy: {in_top_5:.2%}")

# Print detailed predictions for each instance
print("\nDetailed Predictions:")
for idx, row in results.iterrows():
    print(f"\nInstance {idx + 1}:")
    print(f"True Champion: {row['True_Champion']}")
    print(f"Found in: {row['Prediction_Rank']}")
    for i in range(1, 6):
        print(f"Rank {i}: {row[f'Rank_{i}_Champion']} ({row[f'Rank_{i}_Confidence']:.1%})")


Prediction Results:
----------------------------------------------------------------------------------------------------
True_Champion Rank_1_Champion  Rank_1_Confidence Rank_2_Champion  Rank_2_Confidence Rank_3_Champion  Rank_3_Confidence Rank_4_Champion  Rank_4_Confidence Rank_5_Champion  Rank_5_Confidence Prediction_Rank
    Jarvan IV            Jinx             0.0977           Varus             0.0803            Ashe             0.0608           Elise             0.0436            Jhin             0.0435    Not in Top 5
        Elise            Jinx             0.1284           Varus             0.0843           Elise             0.0736            Jhin             0.0333          Kai'Sa             0.0329          Rank_3
       Wukong            Ashe             0.0748           Varus             0.0659           Elise             0.0494            Jinx             0.0493            Jhin             0.0411    Not in Top 5
        Elise            Jinx             0.1452          