In [1]:
# Install missing packages if not already installed
!pip install textblob nltk

# Imports
import pandas as pd
import numpy as np
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# Display settings
pd.set_option("display.max_columns", None)




[nltk_data] Downloading package vader_lexicon to C:\Users\M.ANTONY
[nltk_data]     ROJES\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [3]:
# Load datasets
market   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\cleaned_market_value_data.csv")
injuries = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\injuries_raw.csv")
tweets   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\Sentiment Analysis.csv")
events   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\open-data-master\events_out.csv")


print("Market:", market.shape)
print("Injuries:", injuries.shape)
print("Tweets:", tweets.shape)
print("Events:", events.shape)


Market: (957, 3)
Injuries: (656, 43)
Tweets: (22524, 6)
Events: (4165, 107)


In [4]:
def clean_dataframe(df, name):
    print(f"\n--- Cleaning {name} ---")
    
    # 1. Remove duplicates
    before = df.shape[0]
    df = df.drop_duplicates()
    after = df.shape[0]
    print(f"Removed {before - after} duplicates")
    
    # 2. Handle missing values (numeric vs categorical separately)
    for col in df.columns:
        if df[col].dtype in ["float64", "int64"]:
            df[col] = df[col].fillna(df[col].median())
        else:
            df[col] = df[col].fillna("Unknown")
    
    print("Remaining missing values:", df.isna().sum().sum())
    return df


In [5]:
market_clean   = clean_dataframe(market, "Market Values")
injuries_clean = clean_dataframe(injuries, "Injuries")
tweets_clean   = clean_dataframe(tweets, "Tweets")
events_clean   = clean_dataframe(events, "Events")



--- Cleaning Market Values ---
Removed 0 duplicates
Remaining missing values: 0

--- Cleaning Injuries ---
Removed 0 duplicates
Remaining missing values: 0

--- Cleaning Tweets ---
Removed 0 duplicates
Remaining missing values: 0

--- Cleaning Events ---
Removed 0 duplicates
Remaining missing values: 0


In [None]:
Phase 4

In [6]:
import os

# Create cleaned_data folder if not exists
os.makedirs("../data/cleaned_data", exist_ok=True)

market_clean.to_csv("../data/cleaned_data/market_clean.csv", index=False)
injuries_clean.to_csv("../data/cleaned_data/injuries_clean.csv", index=False)
tweets_clean.to_csv("../data/cleaned_data/tweets_clean.csv", index=False)
events_clean.to_csv("../data/cleaned_data/events_clean.csv", index=False)

print("✅ All cleaned files saved in data/cleaned_data/")


✅ All cleaned files saved in data/cleaned_data/


In [7]:
import pandas as pd
import os

# Path where your cleaned CSVs are stored
path = r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data"

# List all files in cleaned_data folder
files = os.listdir(path)
print("✅ Files inside cleaned_data/:", files)

# Go through each file and check details
for f in files:
    print("\n📂 Checking:", f)
    
    df = pd.read_csv(os.path.join(path, f))
    
    # Shape
    print("   → Shape:", df.shape)
    
    # Preview first 3 rows
    print(df.head(3))
    
    # Duplicates check
    print("   → Duplicates:", df.duplicated().sum())
    
    # Missing values check
    print("   → Missing values per column:\n", df.isna().sum().to_dict())


✅ Files inside cleaned_data/: ['events_clean.csv', 'injuries_clean.csv', 'market_clean.csv', 'tweets_clean.csv']

📂 Checking: events_clean.csv
   → Shape: (4165, 107)
                                     id  index  period     timestamp  minute  \
0  963946e5-3c66-401f-a73b-3737b8fd93ff      1       1  00:00:00.000       0   
1  ffb0fe8c-dd64-4bda-86ea-d5b23b83c221      2       1  00:00:00.000       0   
2  1c519ba7-f860-48d9-b130-cbaf9e9dee79      3       1  00:00:00.000       0   

   second  possession  duration  type.id    type.name  ...  \
0       0           1      0.00       35  Starting XI  ...   
1       0           1      0.00       35  Starting XI  ...   
2       0           1      8.88       18   Half Start  ...   

   substitution.outcome.name substitution.replacement.id  \
0                    Unknown                      6022.0   
1                    Unknown                      6022.0   
2                    Unknown                      6022.0   

   substitution.replac

In [None]:
Phase 5

In [3]:
import pandas as pd
import numpy as np
import os
from datetime import datetime

# === Set this to your cleaned_data folder ===
CLEAN_DIR = r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data"
OUT_DIR   = os.path.join(CLEAN_DIR)  # save outputs here too

# small helper to drop unnamed cols (index columns that sneaked in)
def drop_unnamed(df):
    return df.loc[:, ~df.columns.str.startswith('Unnamed')]

# standardize names for joining across files
def norm_name(x: str) -> str:
    if pd.isna(x):
        return ""
    s = str(x).strip().lower()
    s = s.replace(".", "").replace("-", " ").replace("’","'").replace("`","'")
    s = " ".join(s.split())
    return s


In [4]:
events   = drop_unnamed(pd.read_csv(os.path.join(CLEAN_DIR, "events_clean.csv")))
injuries = drop_unnamed(pd.read_csv(os.path.join(CLEAN_DIR, "injuries_clean.csv")))
market   = drop_unnamed(pd.read_csv(os.path.join(CLEAN_DIR, "market_clean.csv")))
tweets   = drop_unnamed(pd.read_csv(os.path.join(CLEAN_DIR, "tweets_clean.csv")))

print("events:", events.shape)
print("injuries:", injuries.shape)
print("market:", market.shape)
print("tweets:", tweets.shape)

# Attach normalized player keys where possible
# events: try common StatsBomb columns
cand_event_player_cols = [c for c in events.columns if c.lower() in ["player.name","player","name","player_name"]]
if cand_event_player_cols:
    events["player_key"] = events[cand_event_player_cols[0]].map(norm_name)
else:
    events["player_key"] = ""  # fallback

# market: Name column exists per your CSV
market["player_key"] = market["Name"].map(norm_name)

# injuries: try to locate player column by common names
cand_inj_player_cols = [c for c in injuries.columns if c.lower() in ["player","name","player_name","full_name"]]
if cand_inj_player_cols:
    injuries["player_key"] = injuries[cand_inj_player_cols[0]].map(norm_name)
else:
    injuries["player_key"] = ""

# tweets: only useful if you added a player column; otherwise skip here
cand_tw_player_cols = [c for c in tweets.columns if "player" in c.lower() or c.lower() in ["name"]]
if cand_tw_player_cols:
    tweets["player_key"] = tweets[cand_tw_player_cols[0]].map(norm_name)


events: (4165, 107)
injuries: (656, 43)
market: (957, 3)
tweets: (22524, 5)


In [5]:
perf = pd.DataFrame()

# Helper: safe column check
def has(col): 
    return col in events.columns

# Shots & Goals
if has("type.name"):
    shots_df = events[events["type.name"].str.lower().eq("shot")]
    shots_per_player = shots_df.groupby("player_key").size().rename("shots")
    perf = shots_per_player.to_frame().reset_index()
else:
    perf = pd.DataFrame({"player_key": [], "shots": []})

# Goals (StatsBomb: shot.outcome.name == 'Goal')
if has("shot.outcome.name"):
    goals_df = events[
        events["type.name"].str.lower().eq("shot") &
        events["shot.outcome.name"].str.lower().eq("goal")
    ]
    goals = goals_df.groupby("player_key").size().rename("goals")
    perf = perf.merge(goals, on="player_key", how="outer")
else:
    perf["goals"] = np.nan

# Passes
if has("type.name"):
    pass_df = events[events["type.name"].str.lower().eq("pass")]
    passes_total = pass_df.groupby("player_key").size().rename("passes_total")

    # completed logic: in StatsBomb, completed passes often have no "pass.outcome.name"
    # you filled missings with "Unknown" earlier, so treat "Unknown" as completed.
    if has("pass.outcome.name"):
        completed_mask = pass_df["pass.outcome.name"].isin([np.nan, None, "Unknown"])
        passes_completed = pass_df[completed_mask].groupby("player_key").size().rename("passes_completed")
    else:
        passes_completed = pd.Series(0, index=passes_total.index, name="passes_completed")

    perf = perf.merge(passes_total, on="player_key", how="outer") \
               .merge(passes_completed, on="player_key", how="outer")
    perf["pass_accuracy"] = (perf["passes_completed"] / perf["passes_total"]).replace([np.inf, -np.inf], np.nan)
else:
    perf[["passes_total","passes_completed","pass_accuracy"]] = np.nan

# Assists: try several known columns
assist_cols = [c for c in ["pass.goal_assist", "pass.shot_assist", "pass.assisted_shot_id"] if has(c)]
assists = None
if assist_cols:
    # Priority: explicit boolean cols if present
    if "pass.goal_assist" in assist_cols and events["pass.goal_assist"].isin(["True", True, 1]).any():
        assists = events[events["pass.goal_assist"].isin(["True", True, 1])].groupby("player_key").size()
    elif "pass.shot_assist" in assist_cols and events["pass.shot_assist"].isin(["True", True, 1]).any():
        assists = events[events["pass.shot_assist"].isin(["True", True, 1])].groupby("player_key").size()
    elif "pass.assisted_shot_id" in assist_cols:
        # any non-Unknown/NaN assisted shot id counts as an assist
        mask = (~events["pass.assisted_shot_id"].isin(["Unknown"])) & (events["pass.assisted_shot_id"].notna())
        assists = events[mask].groupby("player_key").size()
if assists is not None:
    perf = perf.merge(assists.rename("assists"), on="player_key", how="outer")
else:
    perf["assists"] = np.nan

# Matches played: search for a match id-ish column
match_cols = [c for c in events.columns if "match" in c.lower() and "id" in c.lower()]
if match_cols:
    mcol = match_cols[0]
    mp = events.dropna(subset=["player_key", mcol]).groupby(["player_key", mcol]).size().reset_index()
    matches_played = mp.groupby("player_key")[mcol].nunique().rename("matches_played")
    perf = perf.merge(matches_played, on="player_key", how="outer")
else:
    perf["matches_played"] = np.nan

# Fill NaNs with 0 for counts
for c in ["shots","goals","passes_total","passes_completed","assists","matches_played"]:
    if c in perf.columns:
        perf[c] = perf[c].fillna(0).astype("int64", errors="ignore")

perf["pass_accuracy"] = perf["pass_accuracy"].astype(float)

# Save
perf.to_csv(os.path.join(OUT_DIR, "performance_features.csv"), index=False)
print("✅ Saved:", os.path.join(OUT_DIR, "performance_features.csv"))
perf.head(30)


✅ Saved: C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\performance_features.csv


Unnamed: 0,player_key,shots,goals,passes_total,passes_completed,pass_accuracy,assists,matches_played
0,andrej kramarić,3,0,13,13,1.0,0,0
1,ante rebić,3,0,26,10,0.384615,0,0
2,ashley young,0,0,31,22,0.709677,0,0
3,bamidele alli,0,0,46,35,0.76087,0,0
4,danijel subašić,0,0,37,29,0.783784,0,0
5,danny rose,0,0,15,11,0.733333,0,0
6,dejan lovren,1,0,54,48,0.888889,0,0
7,domagoj vida,0,0,46,38,0.826087,0,0
8,eric dier,1,0,21,20,0.952381,0,0
9,harry kane,2,0,33,17,0.515152,0,0


In [7]:
inj = injuries.copy()

# Guess columns
day_cols  = [c for c in inj.columns if "day" in c.lower() and ("miss" in c.lower() or "out" in c.lower())]
date_cols = [c for c in inj.columns if "date" in c.lower() and ("start" in c.lower() or "inj" in c.lower() or c.lower() == "date")]

# Convert days missed to numeric
if day_cols:
    dcol = day_cols[0]
    inj[dcol] = pd.to_numeric(inj[dcol], errors="coerce")
else:
    inj["days_missed_auto"] = np.nan
    dcol = "days_missed_auto"

# Convert date to datetime (optional)
if date_cols:
    dtcol = date_cols[0]
    inj[dtcol] = pd.to_datetime(inj[dtcol], errors="coerce")
else:
    dtcol = None

agg = inj.groupby("player_key").agg(
    injury_count = ("player_key","size"),
    avg_days_missed = (dcol, "mean"),
    max_days_missed = (dcol, "max")
).reset_index()

# Optional: injuries in last 180 days if we have dates
if dtcol:
    cutoff = pd.Timestamp.today() - pd.Timedelta(days=180)
    recent = inj[inj[dtcol] >= cutoff].groupby("player_key").size().rename("injuries_last_180d")
    agg = agg.merge(recent, on="player_key", how="left")
else:
    agg["injuries_last_180d"] = np.nan

# Clean up
agg["avg_days_missed"] = agg["avg_days_missed"].fillna(0).round(1)
agg["max_days_missed"] = agg["max_days_missed"].fillna(0).astype(int, errors="ignore")
agg["injuries_last_180d"] = agg["injuries_last_180d"].fillna(0).astype(int, errors="ignore")

agg.to_csv(os.path.join(OUT_DIR, "injury_features.csv"), index=False)
print("✅ Saved:", os.path.join(OUT_DIR, "injury_features.csv"))
agg.head(30)


✅ Saved: C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\injury_features.csv


Unnamed: 0,player_key,injury_count,avg_days_missed,max_days_missed,injuries_last_180d
0,aaron hickey,2,299.5,517,0
1,aaron lennon,1,19.0,19,0
2,aaron wan bissaka,4,19.8,43,0
3,abdoulaye doucouré,4,45.8,64,0
4,acob ramsey,1,46.0,46,0
5,alex iwobi,3,24.3,35,0
6,alexander isak,3,26.7,30,0
7,alexandre lacazette,3,23.7,48,0
8,allan,2,36.0,58,0
9,allan saint maximin,8,27.0,61,0


In [9]:
mkt = market.copy()

# Normalize fee
mkt["fee_million"] = pd.to_numeric(mkt["Fee"], errors="coerce").fillna(0.0)

# Buckets
bins   = [-0.01, 0.0, 5, 15, 30, 60, 120, np.inf]
labels = ["Free", "0-5m", "5-15m", "15-30m", "30-60m", "60-120m", "120m+"]
mkt["fee_bucket"] = pd.cut(mkt["fee_million"], bins=bins, labels=labels, include_lowest=True, right=True)

# Flags
mkt["is_free_transfer"] = (mkt["fee_million"] == 0).astype(int)
mkt["high_value_flag"]  = (mkt["fee_million"] >= 50).astype(int)

# Keep essentials
market_feat = mkt[["player_key", "Club", "fee_million", "fee_bucket", "is_free_transfer", "high_value_flag"]].drop_duplicates("player_key")

market_feat.to_csv(os.path.join(OUT_DIR, "market_features.csv"), index=False)
print("✅ Saved:", os.path.join(OUT_DIR, "market_features.csv"))
market_feat.head(30)


✅ Saved: C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\market_features.csv


Unnamed: 0,player_key,Club,fee_million,fee_bucket,is_free_transfer,high_value_flag
0,erling haaland,Man City,60.0,30-60m,0,1
1,enzo fernández,Chelsea,121.0,120m+,0,1
2,antony,Man Utd,95.0,60-120m,0,1
3,wesley fofana,Chelsea,80.4,60-120m,0,1
4,aurélien tchouaméni,Real Madrid,80.0,60-120m,0,1
5,casemiro,Man Utd,70.65,60-120m,0,1
6,mykhaylo mudryk,Chelsea,70.0,60-120m,0,1
7,alexander isak,Newcastle,70.0,60-120m,0,1
8,matthijs de ligt,Bayern Munich,67.0,60-120m,0,1
9,raheem sterling,Chelsea,56.2,30-60m,0,1


In [11]:
base = market_feat.copy()   # seed with market list of players

# Merge performance
if os.path.exists(os.path.join(OUT_DIR, "performance_features.csv")):
    perf = pd.read_csv(os.path.join(OUT_DIR, "performance_features.csv"))
    base = base.merge(perf, on="player_key", how="left")
else:
    print("⚠️ performance_features.csv not found")

# Merge injuries
if os.path.exists(os.path.join(OUT_DIR, "injury_features.csv")):
    injf = pd.read_csv(os.path.join(OUT_DIR, "injury_features.csv"))
    base = base.merge(injf, on="player_key", how="left")
else:
    print("⚠️ injury_features.csv not found")

# Optional: if your tweets_clean has player_key, you could add avg sentiment here
if "player_key" in tweets.columns and "Sentiment" in tweets.columns:
    # map sentiment text to scores (simple)
    sent_map = {"positive": 1, "neutral": 0, "negative": -1}
    tdf = tweets.copy()
    tdf["sent_score"] = tdf["Sentiment"].str.lower().map(sent_map)
    tw_agg = tdf.groupby("player_key")["sent_score"].mean().rename("avg_sentiment")
    base = base.merge(tw_agg, on="player_key", how="left")
else:
    base["avg_sentiment"] = np.nan

# Final tidying
num_cols = ["fee_million","is_free_transfer","high_value_flag","shots","goals","passes_total",
            "passes_completed","pass_accuracy","assists","matches_played",
            "injury_count","avg_days_missed","max_days_missed","injuries_last_180d","avg_sentiment"]
for c in num_cols:
    if c in base.columns:
        if c == "pass_accuracy":
            base[c] = base[c].astype(float)
        else:
            base[c] = pd.to_numeric(base[c], errors="coerce")

# Save
out_path = os.path.join(OUT_DIR, "player_features.csv")
base.to_csv(out_path, index=False)
print("✅ Saved:", out_path)
base.head(30)


✅ Saved: C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\player_features.csv


Unnamed: 0,player_key,Club,fee_million,fee_bucket,is_free_transfer,high_value_flag,shots,goals,passes_total,passes_completed,pass_accuracy,assists,matches_played,injury_count,avg_days_missed,max_days_missed,injuries_last_180d,avg_sentiment
0,erling haaland,Man City,60.0,30-60m,0,1,,,,,,,,,,,,
1,enzo fernández,Chelsea,121.0,120m+,0,1,,,,,,,,,,,,
2,antony,Man Utd,95.0,60-120m,0,1,,,,,,,,,,,,
3,wesley fofana,Chelsea,80.4,60-120m,0,1,,,,,,,,,,,,
4,aurélien tchouaméni,Real Madrid,80.0,60-120m,0,1,,,,,,,,,,,,
5,casemiro,Man Utd,70.65,60-120m,0,1,,,,,,,,,,,,
6,mykhaylo mudryk,Chelsea,70.0,60-120m,0,1,,,,,,,,,,,,
7,alexander isak,Newcastle,70.0,60-120m,0,1,,,,,,,,3.0,26.7,30.0,0.0,
8,matthijs de ligt,Bayern Munich,67.0,60-120m,0,1,,,,,,,,,,,,
9,raheem sterling,Chelsea,56.2,30-60m,0,1,1.0,0.0,11.0,8.0,0.727273,0.0,0.0,,,,,


In [12]:
pf = pd.read_csv(os.path.join(OUT_DIR, "player_features.csv"))
print(pf.shape)
display(pf.head(10))

# How many players have any events?
print("Players with events:", (pf["shots"].fillna(0) + pf["passes_total"].fillna(0)).gt(0).sum())

# Missing join diagnostics
missing_perf = pf["shots"].isna().sum()
missing_inj  = pf["injury_count"].isna().sum()
print(f"Missing performance rows: {missing_perf}")
print(f"Missing injury rows: {missing_inj}")

# Distribution snapshot
print(pf[["fee_million","goals","assists","pass_accuracy","injury_count","avg_days_missed"]].describe())


(938, 18)


Unnamed: 0,player_key,Club,fee_million,fee_bucket,is_free_transfer,high_value_flag,shots,goals,passes_total,passes_completed,pass_accuracy,assists,matches_played,injury_count,avg_days_missed,max_days_missed,injuries_last_180d,avg_sentiment
0,erling haaland,Man City,60.0,30-60m,0,1,,,,,,,,,,,,
1,enzo fernández,Chelsea,121.0,120m+,0,1,,,,,,,,,,,,
2,antony,Man Utd,95.0,60-120m,0,1,,,,,,,,,,,,
3,wesley fofana,Chelsea,80.4,60-120m,0,1,,,,,,,,,,,,
4,aurélien tchouaméni,Real Madrid,80.0,60-120m,0,1,,,,,,,,,,,,
5,casemiro,Man Utd,70.65,60-120m,0,1,,,,,,,,,,,,
6,mykhaylo mudryk,Chelsea,70.0,60-120m,0,1,,,,,,,,,,,,
7,alexander isak,Newcastle,70.0,60-120m,0,1,,,,,,,,3.0,26.7,30.0,0.0,
8,matthijs de ligt,Bayern Munich,67.0,60-120m,0,1,,,,,,,,,,,,
9,raheem sterling,Chelsea,56.2,30-60m,0,1,1.0,0.0,11.0,8.0,0.727273,0.0,0.0,,,,,


Players with events: 2
Missing performance rows: 936
Missing injury rows: 903
       fee_million  goals  assists  pass_accuracy  injury_count  \
count   938.000000    2.0      2.0       2.000000     35.000000   
mean      8.011365    0.0      0.0       0.808081      2.428571   
std      11.549730    0.0      0.0       0.114280      1.819595   
min       1.000000    0.0      0.0       0.727273      1.000000   
25%       2.500000    0.0      0.0       0.767677      1.000000   
50%       4.085000    0.0      0.0       0.808081      2.000000   
75%       8.500000    0.0      0.0       0.848485      3.000000   
max     121.000000    0.0      0.0       0.888889      9.000000   

       avg_days_missed  
count        35.000000  
mean         57.268571  
std          61.813033  
min           5.000000  
25%          21.500000  
50%          36.000000  
75%          73.500000  
max         299.500000  


In [None]:
Phase 5

In [21]:
import pandas as pd
import numpy as np
import os

# Paths
base_path = r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data"

# Load datasets
# Load cleaned datasets
events   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\events_clean.csv")
injuries = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\injuries_clean.csv")
market   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\market_clean.csv")
tweets   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\tweets_clean.csv")


print("Events:", events.shape)
print("Injuries:", injuries.shape)
print("Market:", market.shape)
print("Tweets:", tweets.shape)


Events: (4165, 107)
Injuries: (656, 43)
Market: (957, 3)
Tweets: (22524, 6)


In [15]:
print(events.columns.tolist())


['id', 'index', 'period', 'timestamp', 'minute', 'second', 'possession', 'duration', 'type.id', 'type.name', 'possession_team.id', 'possession_team.name', 'play_pattern.id', 'play_pattern.name', 'team.id', 'team.name', 'tactics.formation', 'tactics.lineup', 'related_events', 'location', 'player.id', 'player.name', 'position.id', 'position.name', 'pass.recipient.id', 'pass.recipient.name', 'pass.length', 'pass.angle', 'pass.height.id', 'pass.height.name', 'pass.end_location', 'pass.body_part.id', 'pass.body_part.name', 'pass.type.id', 'pass.type.name', 'carry.end_location', 'under_pressure', 'pass.outcome.id', 'pass.outcome.name', 'ball_receipt.outcome.id', 'ball_receipt.outcome.name', 'counterpress', 'pass.switch', 'interception.outcome.id', 'interception.outcome.name', 'duel.outcome.id', 'duel.outcome.name', 'duel.type.id', 'duel.type.name', 'ball_recovery.recovery_failure', 'foul_committed.offensive', 'foul_won.defensive', 'shot.statsbomb_xg', 'shot.end_location', 'shot.technique.id'

In [22]:
# ✅ Aggregate key performance stats per player
perf_features = events.groupby("player.name").agg({
    "pass.outcome.name": lambda x: (x == "Complete").sum(),   # completed passes
    "shot.statsbomb_xg": "sum",                               # total expected goals
    "shot.outcome.name": lambda x: (x == "Goal").sum(),       # count goals
    "pass.goal_assist": "sum"                                 # assists
}).reset_index()

# ✅ Rename columns for clarity
perf_features.rename(columns={
    "player.name": "player",
    "pass.outcome.name": "passes_completed",
    "shot.statsbomb_xg": "expected_goals",
    "shot.outcome.name": "goals",
    "pass.goal_assist": "assists"
}, inplace=True)

print("✅ Performance features created:", perf_features.shape)
perf_features.head(30)


✅ Performance features created: (31, 5)


Unnamed: 0,player,passes_completed,expected_goals,goals,assists
0,Andrej Kramarić,0,2.654001,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
1,Ante Rebić,0,9.025964,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
2,Ashley Young,0,5.599779,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
3,Bamidele Alli,0,12.271856,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
4,Danijel Subašić,0,5.540207,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
5,Danny Rose,0,2.502029,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
6,Dejan Lovren,0,8.525549,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
7,Domagoj Vida,0,6.850793,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
8,Eric Dier,0,3.475921,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
9,Harry Kane,0,9.359422,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...


In [24]:
print(injuries.columns)


Index(['Name', 'Team Name', 'Position', 'Age', 'Season', 'FIFA rating',
       'Injury', 'Date of Injury', 'Date of return',
       'Match1_before_injury_Result', 'Match1_before_injury_Opposition',
       'Match1_before_injury_GD', 'Match1_before_injury_Player_rating',
       'Match2_before_injury_Result', 'Match2_before_injury_Opposition',
       'Match2_before_injury_GD', 'Match2_before_injury_Player_rating',
       'Match3_before_injury_Result', 'Match3_before_injury_Opposition',
       'Match3_before_injury_GD', 'Match3_before_injury_Player_rating',
       'Match1_missed_match_Result', 'Match1_missed_match_Opposition',
       'Match1_missed_match_GD', 'Match2_missed_match_Result',
       'Match2_missed_match_Opposition', 'Match2_missed_match_GD',
       'Match3_missed_match_Result', 'Match3_missed_match_Opposition',
       'Match3_missed_match_GD', 'Match1_after_injury_Result',
       'Match1_after_injury_Opposition', 'Match1_after_injury_GD',
       'Match1_after_injury_Player_rat

In [25]:
print(market.columns)

Index(['Club', 'Name', 'Fee'], dtype='object')


In [26]:
import pandas as pd

# Load cleaned datasets
events   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\events_clean.csv")
injuries = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\injuries_clean.csv")
market   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\market_clean.csv")
tweets   = pd.read_csv(r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\cleaned_data\tweets_clean.csv")

print("✅ Datasets Loaded")
print("Events:", events.shape)
print("Injuries:", injuries.shape)
print("Market:", market.shape)
print("Tweets:", tweets.shape)


✅ Datasets Loaded
Events: (4165, 107)
Injuries: (656, 43)
Market: (957, 3)
Tweets: (22524, 6)


In [27]:
# Create performance features per player from events data
perf_features = events.groupby("player.name").agg({
    "pass.length": "count",   # proxy for total passes
    "shot.statsbomb_xg": "sum",  # total expected goals
    "shot.outcome.name": lambda x: (x == "Goal").sum(),  # total goals
    "pass.goal_assist": "sum"   # assists from event flag
}).reset_index()

# Rename for clarity
perf_features.rename(columns={
    "player.name": "player",
    "pass.length": "passes_attempted",
    "shot.statsbomb_xg": "expected_goals",
    "shot.outcome.name": "goals",
    "pass.goal_assist": "assists"
}, inplace=True)

print("✅ Performance features created:", perf_features.shape)
perf_features.head()


✅ Performance features created: (31, 5)


Unnamed: 0,player,passes_attempted,expected_goals,goals,assists
0,Andrej Kramarić,44,2.654001,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
1,Ante Rebić,147,9.025964,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
2,Ashley Young,94,5.599779,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
3,Bamidele Alli,206,12.271856,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...
4,Danijel Subašić,93,5.540207,0,UnknownUnknownUnknownUnknownUnknownUnknownUnkn...


In [29]:
# Create injury-related features
injury_features = injuries.groupby("Name").agg({
    "Injury": "count",       # number of injuries
    "days_out": "sum"        # total days missed
}).reset_index()

injury_features.rename(columns={
    "Name": "player",
    "Injury": "injury_count",
    "days_out": "total_days_out"
}, inplace=True)

print("✅ Injury features created:", injury_features.shape)
injury_features.head()


✅ Injury features created: (224, 3)


Unnamed: 0,player,injury_count,total_days_out
0,Aaron Hickey,2,599.0
1,Aaron Lennon,1,19.0
2,Aaron Wan-Bissaka,4,79.0
3,Abdoulaye Doucouré,4,183.0
4,Alex Iwobi,3,73.0


In [30]:
# Market dataset: ['Club', 'Name', 'Fee']
# We'll take average transfer fee as market value proxy

contract_features = market.groupby("Name").agg({
    "Fee": "mean"
}).reset_index()

contract_features.rename(columns={
    "Name": "player",
    "Fee": "avg_market_value"
}, inplace=True)

print("✅ Market features created:", contract_features.shape)
contract_features.head()


✅ Market features created: (938, 2)


Unnamed: 0,player,avg_market_value
0,Aaron Boupendza,7.0
1,Aaron Hickey,16.5
2,Aarón Herrera,4.71
3,Abdelhamid Sabiri,1.75
4,Abdou Harroui,1.0


In [32]:
# Merge all feature sets
player_features = perf_features.merge(injury_features, on="player", how="outer")
player_features = player_features.merge(contract_features, on="player", how="outer")

print("✅ Final Player Features:", player_features.shape)
player_features.head(100)


✅ Final Player Features: (1148, 8)


Unnamed: 0,player,passes_attempted,expected_goals,goals,assists,injury_count,total_days_out,avg_market_value
0,Aaron Boupendza,,,,,,,7.00
1,Aaron Hickey,,,,,2.0,599.0,16.50
2,Aaron Lennon,,,,,1.0,19.0,
3,Aaron Wan-Bissaka,,,,,4.0,79.0,
4,Aarón Herrera,,,,,,,4.71
...,...,...,...,...,...,...,...,...
95,Andrés Andrade,,,,,,,1.25
96,Andrés Perea,,,,,,,7.13
97,Andy Carroll,,,,,4.0,182.0,
98,Anel Ahmedhodzic,,,,,,,4.50


In [34]:
import os

# Define output path
output_dir = r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered"
output_path = os.path.join(output_dir, "player_features.csv")

# Create folder if it doesn’t exist
os.makedirs(output_dir, exist_ok=True)

# Save file
player_features.to_csv(output_path, index=False)

print("📁 Saved:", output_path)



📁 Saved: C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features.csv


In [35]:
import os

# Define output path
output_dir = r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered"
output_path = os.path.join(output_dir, "player_features.csv")

# Create folder if it doesn’t exist
os.makedirs(output_dir, exist_ok=True)

# Save file
player_features.to_csv(output_path, index=False)

print("📁 Saved:", output_path)


📁 Saved: C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features.csv


In [36]:
print("🔎 Columns:", list(player_features.columns))


🔎 Columns: ['player', 'passes_attempted', 'expected_goals', 'goals', 'assists', 'injury_count', 'total_days_out', 'avg_market_value']


In [37]:
# Check for duplicates
print("Duplicates:", player_features.duplicated().sum())

# Missing values
print("Missing values per column:\n", player_features.isna().sum())

# Basic stats for numeric features
print("\n📊 Summary Statistics:\n", player_features.describe())


Duplicates: 0
Missing values per column:
 player                 0
passes_attempted    1117
expected_goals      1117
goals               1117
assists             1117
injury_count         924
total_days_out       924
avg_market_value     210
dtype: int64

📊 Summary Statistics:
        passes_attempted  expected_goals      goals  injury_count  \
count         31.000000       31.000000  31.000000    224.000000   
mean         134.354839        8.027096   0.096774      2.928571   
std           76.752654        4.571257   0.300537      2.150041   
min            6.000000        0.357433   0.000000      1.000000   
25%           70.000000        4.143122   0.000000      1.000000   
50%          144.000000        8.602184   0.000000      2.000000   
75%          200.000000       11.928438   0.000000      4.000000   
max          266.000000       15.817113   1.000000     11.000000   

       total_days_out  avg_market_value  
count      224.000000        938.000000  
mean       130.584821   

In [42]:
print("🔎 Columns present in player_features.csv:")
print(player_features.columns.tolist())


🔎 Columns present in player_features.csv:
['player', 'passes_attempted', 'expected_goals', 'goals', 'assists', 'injury_count', 'total_days_out', 'avg_market_value']


In [43]:
# Check all columns
print("🔎 Columns:", list(player_features.columns))

# Check duplicates
print("Duplicates:", player_features.duplicated().sum())

# Check missing values
print("Missing values per column:\n", player_features.isna().sum())

# Summary stats for numeric features
print("\n📊 Summary Statistics:\n", player_features.describe())

# Quick checks
if "expected_goals" in player_features.columns:
    print("\n🏆 Top 5 players by Expected Goals:")
    print(player_features[['player', 'expected_goals']].dropna().sort_values(by="expected_goals", ascending=False).head(5))

if "assists" in player_features.columns:
    print("\n🎯 Top 5 players by Assists:")
    print(player_features[['player', 'assists']].dropna().sort_values(by="assists", ascending=False).head(5))

if "total_days_out" in player_features.columns:
    print("\n🩺 Top 5 players by Total Days Out (Injury):")
    print(player_features[['player', 'total_days_out']].dropna().sort_values(by="total_days_out", ascending=False).head(5))

if "avg_market_value" in player_features.columns:
    print("\n💰 Top 5 players by Market Value:")
    print(player_features[['player', 'avg_market_value']].dropna().sort_values(by="avg_market_value", ascending=False).head(5))


🔎 Columns: ['player', 'passes_attempted', 'expected_goals', 'goals', 'assists', 'injury_count', 'total_days_out', 'avg_market_value']
Duplicates: 0
Missing values per column:
 player                 0
passes_attempted    1117
expected_goals      1117
goals               1117
assists             1117
injury_count         924
total_days_out       924
avg_market_value     210
dtype: int64

📊 Summary Statistics:
        passes_attempted  expected_goals      goals  injury_count  \
count         31.000000       31.000000  31.000000    224.000000   
mean         134.354839        8.027096   0.096774      2.928571   
std           76.752654        4.571257   0.300537      2.150041   
min            6.000000        0.357433   0.000000      1.000000   
25%           70.000000        4.143122   0.000000      1.000000   
50%          144.000000        8.602184   0.000000      2.000000   
75%          200.000000       11.928438   0.000000      4.000000   
max          266.000000       15.817113   1

In [9]:
# --- Cell 1: Setup & Load Data ---
import pandas as pd
import os

# Path to final Phase 5 dataset
path = r"C:\Users\M.ANTONY ROJES\Downloads\Infosys\data\feature_engineered\player_features.csv"

# Load dataset
df = pd.read_csv(path)

print("✅ Data Loaded Successfully")
print("Shape:", df.shape)
print("Columns:", df.columns.tolist())
df.head(5)


✅ Data Loaded Successfully
Shape: (1148, 8)
Columns: ['player', 'passes_attempted', 'expected_goals', 'goals', 'assists', 'injury_count', 'total_days_out', 'avg_market_value']


Unnamed: 0,player,passes_attempted,expected_goals,goals,assists,injury_count,total_days_out,avg_market_value
0,Aaron Boupendza,,,,,,,7.0
1,Aaron Hickey,,,,,2.0,599.0,16.5
2,Aaron Lennon,,,,,1.0,19.0,
3,Aaron Wan-Bissaka,,,,,4.0,79.0,
4,Aarón Herrera,,,,,,,4.71


In [2]:
# --- Cell 2: Handle Missing Values ---

# Show missing values count
print("Missing values per column:\n", df.isna().sum())

# Fill numeric columns with 0, categorical with "Unknown"
for col in df.columns:
    if df[col].dtype == "object":
        df[col] = df[col].fillna("Unknown")
    else:
        df[col] = df[col].fillna(0)

print("\n✅ Missing values handled")


Missing values per column:
 player                 0
passes_attempted    1117
expected_goals      1117
goals               1117
assists             1117
injury_count         924
total_days_out       924
avg_market_value     210
dtype: int64

✅ Missing values handled


In [3]:
# --- Cell 3: Encode Categorical Variables ---

# Identify categorical columns
categorical_cols = df.select_dtypes(include=["object"]).columns.tolist()
print("Categorical Columns:", categorical_cols)

# One-hot encode categorical variables
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("✅ Encoding Completed")
print("New Shape:", df_encoded.shape)
df_encoded.head(5)


Categorical Columns: ['player', 'assists']
✅ Encoding Completed
New Shape: (1148, 1182)


Unnamed: 0,passes_attempted,expected_goals,goals,injury_count,total_days_out,avg_market_value,player_Aaron Hickey,player_Aaron Lennon,player_Aaron Wan-Bissaka,player_Aarón Herrera,...,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownTrueUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown,assists_UnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown
0,0.0,0.0,0.0,0.0,0.0,7.0,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0.0,0.0,0.0,2.0,599.0,16.5,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0.0,0.0,0.0,1.0,19.0,0.0,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
3,0.0,0.0,0.0,4.0,79.0,0.0,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
4,0.0,0.0,0.0,0.0,0.0,4.71,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [1]:
import sys
print("Interpreter in use:", sys.executable)

import sklearn
print("✅ scikit-learn version:", sklearn.__version__)


Interpreter in use: c:\Users\M.ANTONY ROJES\AppData\Local\Programs\Python\Python39\python.exe


ModuleNotFoundError: No module named 'sklearn'

In [2]:
# --- Cell 4: Scale Numerical Features ---
from sklearn.preprocessing import StandardScaler

# Initialize scaler
scaler = StandardScaler()

# Scale numeric features only
numeric_cols = df_encoded.select_dtypes(include=["int64", "float64"]).columns
df_encoded[numeric_cols] = scaler.fit_transform(df_encoded[numeric_cols])

print("✅ Scaling Completed")
df_encoded.head(5)


ModuleNotFoundError: No module named 'sklearn'