In [2]:
import pandas as pd

In [6]:
# Load the VCT 2024 per-player stats CSV and preview its first rows.
stats_csv = 'vctchamps/vct_2024/players_stats/players_stats.csv'
df = pd.read_csv(stats_csv)
df.head()

Unnamed: 0,Tournament,Stage,Match Type,Player,Teams,Agents,Rounds Played,Rating,Average Combat Score,Kills:Deaths,...,First Deaths Per Round,Headshot %,Clutch Success %,Clutches (won/played),Maximum Kills in a Single Map,Kills,Deaths,Assists,First Kills,First Deaths
0,Valorant Champions 2024,Playoffs,Upper Quarterfinals,Boo,Team Heretics,astra,26,0.75,138,0.58,...,0.08,21%,22%,2/9,11,11,19,14,1,2
1,Valorant Champions 2024,Playoffs,Upper Quarterfinals,Boo,Team Heretics,fade,16,1.1,185,1.38,...,0.0,21%,,,11,11,8,3,2,0
2,Valorant Champions 2024,Playoffs,Upper Quarterfinals,Boo,Team Heretics,"astra, fade",42,0.88,162,0.81,...,0.05,21%,22%,2/9,11,22,27,17,3,2
3,Valorant Champions 2024,Playoffs,Upper Quarterfinals,benjyfishy,Team Heretics,cypher,26,0.85,203,0.86,...,0.15,34%,,,18,18,21,3,2,4
4,Valorant Champions 2024,Playoffs,Upper Quarterfinals,benjyfishy,Team Heretics,killjoy,16,1.36,169,2.4,...,0.0,28%,50%,1/2,12,12,5,2,1,0


In [4]:
def standardize_col_names(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Return a copy of ``df`` whose column labels are normalized to snake_case.

    Every label is stripped of surrounding whitespace, lower-cased, and then
    rewritten with literal substitutions applied in this order:
    " " -> "_", "%" -> "percent", "," -> "_", "average" -> "avg".
    Two of the resulting labels are then shortened:
    "kill__assist__trade__survive_percent" -> "kast_percent" and
    "kills:deaths" -> "kd_ratio".

    The input frame is not modified, so the raw frame keeps its original
    headers and the function can be re-run safely.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose column labels are strings.

    Returns
    -------
    pd.DataFrame
        A new frame with the same data and the normalized labels.
    '''
    aliases = {
        'kill__assist__trade__survive_percent': 'kast_percent',
        'kills:deaths': 'kd_ratio',
    }
    normalized = (
        df.columns
        .str.strip()
        .str.lower()
        # regex=False: every pattern is a literal, and the default for
        # `regex` differs between pandas versions.
        .str.replace(" ", "_", regex=False)
        .str.replace("%", "percent", regex=False)
        .str.replace(",", "_", regex=False)
        .str.replace("average", "avg", regex=False)
    )
    # Relabel a copy instead of assigning to df.columns, which would rename
    # the caller's frame as a side effect.
    result = df.copy()
    result.columns = [aliases.get(label, label) for label in normalized]
    return result

def map_agents_to_roles(agent: str) -> "str | None":
    '''
    Map an agent name to its role.

    The lookup ignores letter case and surrounding whitespace.

    Parameters
    ----------
    agent : str
        Agent name, e.g. "jett". Non-string values (such as NaN) are accepted
        and treated as unknown.

    Returns
    -------
    str or None
        One of "duelist", "initiator", "controller", "sentinel", or None when
        the agent is not in the table.
    '''
    roles = {
        "duelist": ["jett", "reyna", "phoenix", "raze", "yoru", "neon", "iso"],
        # The agent's name is spelled "gekko"; the former "gecko" spelling is
        # kept so inputs that matched before still match.
        "initiator": ["sova", "skye", "kayo", "breach", "fade", "gekko", "gecko"],
        "controller": ["brimstone", "viper", "omen", "astra", "harbor", "clove"],
        "sentinel": ["cypher", "killjoy", "sage", "chamber", "deadlock", "vyse"]
    }
    if not isinstance(agent, str):
        return None
    key = agent.strip().lower()
    for role, agents in roles.items():
        if key in agents:
            return role
    return None
        
def fix_percentage_cols(df: pd.DataFrame) -> pd.DataFrame:
    '''
    Convert, in place, every column that can be read as numbers to float.

    Each column is rendered as text, a trailing "%" is stripped from every
    value (so "21%" reads as 21.0), and the result is cast to float. The cast
    is all-or-nothing per column: if any value cannot be parsed, the column
    is left exactly as it was, so text columns keep their original values and
    their missing-value markers.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to convert. It is modified in place.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, for convenience.
    '''
    for col in df.columns:
        # Work on a temporary so a failed conversion cannot leave a
        # stringified column (e.g. NaN turned into the text "nan") behind.
        as_text = df[col].astype(str).str.rstrip('%')
        try:
            numeric = as_text.astype(float)
        except (ValueError, TypeError):
            continue
        df[col] = numeric
    return df

def fill_nan(df: pd.DataFrame, col: str, tendency: str) -> pd.Series:
    '''
    Replace the NaN values of ``df[col]`` with a measure of central tendency.

    The column is overwritten in ``df`` and also returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column. It is modified in place.
    col : str
        Name of the column to fill.
    tendency : str
        'mean', 'median' or 'mode'. For 'mode' the first value returned by
        ``Series.mode()`` is used.

    Returns
    -------
    pd.Series
        The filled column, ``df[col]``.

    Raises
    ------
    ValueError
        If ``tendency`` is not one of the supported names.
    '''
    if tendency == 'mean':
        fill_value = df[col].mean()
    elif tendency == 'median':
        fill_value = df[col].median()
    elif tendency == 'mode':
        modes = df[col].mode()
        if modes.empty:
            # No non-missing value to take a mode from: nothing to fill with.
            return df[col]
        fill_value = modes.iloc[0]
    else:
        raise ValueError(
            f"tendency must be 'mean', 'median' or 'mode', got {tendency!r}"
        )
    df[col] = df[col].fillna(fill_value)
    return df[col]

def encode_cat(df: pd.DataFrame, col: str) -> pd.Series:
    '''
    Replace the values of ``df[col]`` with integer codes.

    Codes are assigned in order of first appearance: the first distinct value
    becomes 0, the next 1, and so on. The column is overwritten in ``df`` and
    also returned.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column. It is modified in place.
    col : str
        Name of the column to encode.

    Returns
    -------
    pd.Series
        The encoded column, ``df[col]``.
    '''
    # unique() already yields each distinct value once, in order of appearance.
    codes = {value: code for code, value in enumerate(df[col].unique())}
    df[col] = df[col].map(codes)
    return df[col]


def clean(df: pd.DataFrame, max_rounds: int = 25) -> pd.DataFrame:
    '''
    Turn the raw player-stats frame into an all-numeric frame.

    Steps, in order:
    1. Normalize column labels with ``standardize_col_names``.
    2. Run ``fix_percentage_cols`` over the frame.
    3. Drop the identifying and clutch-related columns listed below
       (columns that are absent are ignored).
    4. Drop rows whose ``agents`` value contains a comma.
    5. Keep only rows with ``rounds_played`` strictly below ``max_rounds``.
    6. Replace ``agents`` by its role via ``map_agents_to_roles``, drop rows
       whose agent has no role, and integer-encode the roles.
    7. Integer-encode every remaining non-numeric column with ``encode_cat``.
    8. Rename ``agents`` to ``role``, drop rows with any missing value and
       reset the index.

    Parameters
    ----------
    df : pd.DataFrame
        Raw stats frame as loaded from the CSV.
    max_rounds : int, default 25
        Exclusive upper bound on ``rounds_played``.

    Returns
    -------
    pd.DataFrame
        The cleaned frame with a fresh 0..n-1 index.
    '''
    df = standardize_col_names(df)
    # Use the return value rather than relying on in-place mutation.
    df = fix_percentage_cols(df)

    df = df.drop(['tournament', 'stage', 'match_type', 'player', 'teams',
                  'clutches_(won/played)', 'clutch_success_percent',
                  'maximum_kills_in_a_single_map'], axis=1, errors='ignore')

    if "agents" in df.columns:
        df = df[~df["agents"].str.contains(",", na=False)]

    if "rounds_played" in df.columns:
        df = df[df["rounds_played"] < max_rounds]

    # The boolean filters above may return a slice of the previous frame;
    # take ownership of the data before assigning columns on it.
    df = df.copy()

    if "agents" in df.columns:
        df["agents"] = df["agents"].map(map_agents_to_roles)
        df = df.dropna(subset=["agents"]).copy()
        encode_cat(df, "agents")

    for col in df.columns:
        if col == "agents":
            continue
        dtype = df[col].dtype
        # Comparing a dtype against the builtins int/float depends on the
        # platform's default integer width and misses other numeric widths.
        # Booleans are still encoded, as before.
        already_numeric = (
            pd.api.types.is_numeric_dtype(dtype)
            and not pd.api.types.is_bool_dtype(dtype)
        )
        if not already_numeric:
            encode_cat(df, col)

    return (
        df.rename(columns={"agents": "role"})
        .dropna()
        .reset_index(drop=True)
    )
# Clean the loaded stats and preview up to the first 60 rows of the result.
# Note: this rebinds `df` to the cleaned frame, so re-running this cell
# without re-running the load cell passes already-cleaned data to clean().
df = clean(df)
df.head(60)

Unnamed: 0,role,rounds_played,rating,avg_combat_score,kd_ratio,kast_percent,avg_damage_per_round,kills_per_round,assists_per_round,first_kills_per_round,first_deaths_per_round,headshot_percent,kills,deaths,assists,first_kills,first_deaths
0,0,16.0,1.1,185.0,1.38,81.0,122.0,0.69,0.19,0.13,0.0,21.0,11.0,8.0,3.0,2.0,0.0
1,1,16.0,1.36,169.0,2.4,81.0,120.0,0.75,0.13,0.06,0.0,28.0,12.0,5.0,2.0,1.0,0.0
2,2,16.0,1.84,289.0,3.6,94.0,188.0,1.13,0.25,0.13,0.0,56.0,18.0,5.0,4.0,2.0,0.0
3,3,16.0,1.0,204.0,1.0,88.0,140.0,0.69,0.25,0.13,0.19,16.0,11.0,11.0,4.0,2.0,3.0
4,2,16.0,1.34,242.0,1.75,88.0,166.0,0.88,0.31,0.25,0.13,32.0,14.0,8.0,5.0,4.0,2.0
5,2,18.0,0.42,126.0,0.47,61.0,69.0,0.44,0.11,0.0,0.11,14.0,8.0,17.0,2.0,0.0,2.0
6,3,18.0,0.67,150.0,0.56,67.0,96.0,0.5,0.22,0.11,0.17,42.0,9.0,16.0,4.0,2.0,3.0
7,2,16.0,0.37,81.0,0.25,69.0,54.0,0.19,0.38,0.06,0.25,22.0,3.0,12.0,6.0,1.0,4.0
8,0,16.0,0.73,135.0,0.64,69.0,92.0,0.44,0.25,0.0,0.19,19.0,7.0,11.0,4.0,0.0,3.0
9,3,16.0,0.92,246.0,0.87,56.0,164.0,0.81,0.13,0.13,0.19,18.0,13.0,15.0,2.0,2.0,3.0


In [5]:
# Uncomment to write the cleaned frame to 'processed_player_stats.csv' (without the index column):
#df.to_csv('processed_player_stats.csv', index=False)
