In [1]:
import sys
print(sys.executable)

c:\Users\pengu\OneDrive\Documents\LoL-Cleaning\.venv\Scripts\python.exe


In [2]:
# LoL Data Cleaning â€” Simplified Starter

# 0) Imports & Paths â€” edit this to match your folder
import pandas as pd
from pathlib import Path
import json

# ðŸ‘‰ EDIT THIS to where your files live
DATA_DIR = Path('./data/archive')
PATH_MATCHES = DATA_DIR / 'matchData.csv'
PATH_PLAYERS = DATA_DIR / 'players_8-14-25.csv'
PATH_MATCH_IDS = DATA_DIR / 'match_ids.csv'
PATH_JSONL = DATA_DIR / 'match_data.jsonl'

for p in [PATH_MATCHES, PATH_PLAYERS, PATH_MATCH_IDS, PATH_JSONL]:
    print(f'{p} exists? ', p.exists())

data\archive\matchData.csv exists?  True
data\archive\players_8-14-25.csv exists?  True
data\archive\match_ids.csv exists?  True
data\archive\match_data.jsonl exists?  True


In [14]:
# 1) Load CSVs
df_matches = pd.read_csv(PATH_MATCHES)
df_players = pd.read_csv(PATH_PLAYERS)
df_match_ids = pd.read_csv(PATH_MATCH_IDS)

print('Matches shape:', df_matches.shape)
print('Players shape:', df_players.shape)
print('Match IDs shape:', df_match_ids.shape)
df_matches.head(3)
print(df_matches.columns.tolist())


Matches shape: (101843, 1770)
Players shape: (3560, 9)
Match IDs shape: (101843, 3)
['dataVersion', 'matchId', 'endOfGameResult', 'gameCreation', 'gameDuration', 'gameEndTimestamp', 'gameId', 'gameMode', 'gameName', 'gameType', 'gameVersion', 'mapId', 'participant0PlayerScore0', 'participant0PlayerScore1', 'participant0PlayerScore10', 'participant0PlayerScore11', 'participant0PlayerScore2', 'participant0PlayerScore3', 'participant0PlayerScore4', 'participant0PlayerScore5', 'participant0PlayerScore6', 'participant0PlayerScore7', 'participant0PlayerScore8', 'participant0PlayerScore9', 'participant0AllInPings', 'participant0AssistMePings', 'participant0Assists', 'participant0BaronKills', 'participant0BasicPings', 'participant0ChampExperience', 'participant0ChampLevel', 'participant0ChampionId', 'participant0ChampionName', 'participant0ChampionTransform', 'participant0CommandPings', 'participant0ConsumablesPurchased', 'participant0DamageDealtToBuildings', 'participant0DamageDealtToObjectiv

In [None]:
# Keep only a few numeric columns for a simple model
num_cols = [c for c in df_matches.columns if df_matches[c].dtype in ('int64', 'float64')]
print("Numeric columns:", num_cols[:10])  # peek at the first 10

# Allow some missing data â€” fill with 0 or median where appropriate
df_model = df_matches[num_cols].copy()

# Fill numeric NaNs with column medians (more realistic)
df_model = df_model.fillna(df_model.median(numeric_only=True))

print("âœ… After filling NaNs:", df_model.shape)
df_model.head()

print("Data ready for modeling:", df_model.shape)
df_model.head()


Numeric columns: ['dataVersion', 'gameCreation', 'gameDuration', 'gameEndTimestamp', 'gameId', 'mapId', 'participant0PlayerScore0', 'participant0PlayerScore1', 'participant0PlayerScore10', 'participant0PlayerScore11']
âœ… After filling NaNs: (101843, 1568)
Data ready for modeling: (101843, 1568)


Unnamed: 0,dataVersion,gameCreation,gameDuration,gameEndTimestamp,gameId,mapId,participant0PlayerScore0,participant0PlayerScore1,participant0PlayerScore10,participant0PlayerScore11,...,team1FeatsEPIC_MONSTER_KILLFeatState,team1FeatsFIRST_BLOODFeatState,team1FeatsFIRST_TURRETFeatState,team1BaronKills,team1ChampionKills,team1DragonKills,team1HordeKills,team1InhibitorKills,team1RiftHeraldKills,team1TowerKills
0,2,1755372956560,1682,1755374669877,5348438296,11,0,0,0,0,...,2,1001,1001,0,25,2,0,0,0,2
1,2,1755370753217,1759,1755372544375,5348419072,11,0,0,0,0,...,1,3,1,0,27,0,3,0,0,2
2,2,1755055472135,2491,1755057984148,5345908214,11,0,0,0,0,...,3,1001,1,0,24,4,0,3,1,11
3,2,1754532281603,1973,1754534283418,5341292117,11,0,0,0,0,...,2,1001,1001,0,28,3,0,0,1,5
4,2,1754529263025,2376,1754531661796,5341241370,11,0,0,0,0,...,1,3,1,0,27,3,1,1,1,7


In [16]:
df_matches[['team0Win', 'team1Win']].dtypes


team0Win    bool
team1Win    bool
dtype: object

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Choose team-level features
team_features = [
    "BaronKills", "DragonKills", "TowerKills",
    "ChampionKills", "InhibitorKills", "RiftHeraldKills"
]

# Build team0 and team1 DataFrames
team0 = df_matches[[f"team0{f}" for f in team_features] + ["team0Win"]].copy()
team1 = df_matches[[f"team1{f}" for f in team_features] + ["team1Win"]].copy()

# Rename columns to standard form
team0.columns = [c.replace("team0", "") for c in team0.columns]
team1.columns = [c.replace("team1", "") for c in team1.columns]
team0["teamId"] = 0
team1["teamId"] = 1

# Combine both into a single DataFrame
df_team = pd.concat([team0, team1], ignore_index=True)

# Ensure target is numeric (0/1)
df_team["Win"] = df_team["Win"].astype(int)

# Separate features/labels
X = df_team.drop(columns=["Win"])
y = df_team["Win"]

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
print(f"âœ… Model accuracy: {acc:.3f}")

# Show which features matter most
import pandas as pd
importance = pd.Series(model.coef_[0], index=X.columns).sort_values(ascending=False)
print("\nFeature importance:")
print(importance)

âœ… Model accuracy: 0.862

Feature importance:
TowerKills         0.538147
DragonKills        0.488593
InhibitorKills     0.346817
RiftHeraldKills    0.104759
ChampionKills     -0.038816
BaronKills        -0.075229
teamId            -0.129379
dtype: float64


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 1. load your file
df = pd.read_csv(r"C:\Users\pengu\OneDrive\Documents\LoL-Cleaning\data\archive\matchData.csv", low_memory=False, on_bad_lines="skip")

# 2. columns we will need (they all exist in your printout)
damage_cols = [f"participant{i}TotalDamageDealtToChampions" for i in range(10)]
gold_cols   = [f"participant{i}GoldEarned" for i in range(10)]
vision_cols = [f"participant{i}VisionScore" for i in range(10)]
turret_cols = [f"participant{i}DamageDealtToTurrets" for i in range(10)]
team_cols   = ["team0DragonKills", "team1DragonKills",
               "team0BaronKills", "team1BaronKills",
               "team0Win", "team1Win"]

# 3. make sure these are numeric / usable
for c in damage_cols + gold_cols + vision_cols + turret_cols + team_cols[:-2]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# 4. build team 0 rows
team0 = pd.DataFrame({
    "Win": df["team0Win"].astype(str).str.lower().map({"true":1, "false":0}),
    "Gold":   df[[f"participant{i}GoldEarned" for i in range(0,5)]].sum(axis=1),
    "Vision": df[[f"participant{i}VisionScore" for i in range(0,5)]].sum(axis=1),
    "Damage": df[[f"participant{i}TotalDamageDealtToChampions" for i in range(0,5)]].sum(axis=1),
    "TurretDamage": df[[f"participant{i}DamageDealtToTurrets" for i in range(0,5)]].sum(axis=1),
    "Dragons": df["team0DragonKills"],
    "Barons":  df["team0BaronKills"],
})

# 5. build team 1 rows
team1 = pd.DataFrame({
    "Win": df["team1Win"].astype(str).str.lower().map({"true":1, "false":0}),
    "Gold":   df[[f"participant{i}GoldEarned" for i in range(5,10)]].sum(axis=1),
    "Vision": df[[f"participant{i}VisionScore" for i in range(5,10)]].sum(axis=1),
    "Damage": df[[f"participant{i}TotalDamageDealtToChampions" for i in range(5,10)]].sum(axis=1),
    "TurretDamage": df[[f"participant{i}DamageDealtToTurrets" for i in range(5,10)]].sum(axis=1),
    "Dragons": df["team1DragonKills"],
    "Barons":  df["team1BaronKills"],
})

# 6. stack them
team_data = pd.concat([team0, team1], ignore_index=True)

# 7. drop rows with missing values (logistic regression canâ€™t take NaN)
team_data = team_data.dropna()

X = team_data[["Gold","Vision","Damage","TurretDamage","Dragons","Barons"]]
y = team_data["Win"]

# 8. scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 9. fit logistic regression
clf = LogisticRegression(max_iter=500)
clf.fit(X_scaled, y)

# 10. look at coefficients
coef_df = pd.DataFrame({
    "Feature": ["Gold","Vision","Damage","TurretDamage","Dragons","Barons"],
    "Coefficient": clf.coef_[0]
}).sort_values("Coefficient", ascending=False)

print("Model accuracy:", clf.score(X_scaled, y))
print(coef_df)

SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (2126570297.py, line 7)

In [None]:
import numpy as np

# If no column like 'win' or 'winner' exists, create a dummy one
if not any('win' in c.lower() for c in df_model.columns):
    np.random.seed(42)
    df_model['win'] = np.random.choice([0, 1], size=len(df_model))

print(df_model['win'].value_counts())

KeyError: "['team0GoldEarned', 'team0ObjectivesStolen'] not in index"

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Separate features and target
X = df_model.drop(columns=['win'])
y = df_model['win']

# ðŸ©¹ FIX: fill NaNs
X = X.fillna(0)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LogisticRegression(max_iter=500)
model.fit(X_train, y_train)

# Evaluate
preds = model.predict(X_test)
acc = accuracy_score(y_test, preds)
print(f"âœ… Model accuracy: {acc:.2f}")

âœ… Model accuracy: 0.50
