# Game Churn Prediction AI - Data Cleaning & Feature Engineering

This notebook loads the raw data, cleans it, and prepares it for machine learning.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's 'whitegrid' theme for the plots in this notebook.
sns.set_theme(style='whitegrid')


### 1. Load Raw Dataset


In [None]:
# Read the raw CSV into a DataFrame and preview its first rows.
raw_data_path = '../data/raw_data.csv'
df = pd.read_csv(raw_data_path)
df.head()


### 2. Handle missing values


In [None]:
# Report how many values are missing in each column before imputing.
print("Missing values:\n", df.isnull().sum())

# Numeric columns: fill gaps with the column median.
df = df.fillna(df.median(numeric_only=True))

# Text (object) columns: fill gaps with the most frequent value.
# A column that is entirely missing has no mode (Series.mode() returns an
# empty Series), so it is left untouched instead of failing on `mode()[0]`.
# NOTE(review): this loop also imputes 'EngagementLevel', which a later cell
# turns into the churn label -- consider dropping rows with a missing label
# rather than filling them in.
for col in df.select_dtypes(include=['object']).columns:
    modes = df[col].mode()
    if not modes.empty:
        df[col] = df[col].fillna(modes.iloc[0])


### 3. Remove duplicates


In [None]:
# Drop exact duplicate rows, printing the frame's shape before and after.
print(f"Shape before duplicates removed: {df.shape}")
df = df.drop_duplicates()
print(f"Shape after duplicates removed: {df.shape}")


### 4. Drop PlayerID column
PlayerID is a unique identifier and won't help the model learn patterns.


In [None]:
# Remove the identifier column when present; errors='ignore' makes this a
# no-op if 'PlayerID' is not in the frame.
df = df.drop(columns=['PlayerID'], errors='ignore')


### 5. Define Churn Label
We will use `EngagementLevel == 'Low'` as our churn label, meaning that players with low engagement are considered to have churned.


In [None]:
# Build the binary target: 1 where EngagementLevel is 'Low', 0 otherwise.
if 'EngagementLevel' in df.columns:
    df['Churn'] = (df['EngagementLevel'] == 'Low').astype(int)
    # Remove the source column so the label is not also present as a feature.
    df = df.drop(columns=['EngagementLevel'])
elif 'Churn' not in df.columns:
    # Neither the source column nor an already-built label exists: fail here
    # with a clear message instead of an opaque KeyError in the plot below.
    # (If 'Churn' already exists, e.g. the cell was re-run, we just continue.)
    raise KeyError(
        "Cannot build 'Churn': column 'EngagementLevel' not found in the dataset."
    )

# value_counts() orders by frequency, so sort by label to keep the bar order
# (and therefore the colour assigned to each class) stable across datasets.
churn_counts = df['Churn'].value_counts().sort_index()
ax = churn_counts.plot(kind='bar', color=['skyblue', 'salmon'])
ax.set_xlabel('Churn (0 = retained, 1 = churned)')
ax.set_ylabel('Number of players')
plt.title('Churn Distribution')
plt.show()


### 6. Encode categorical features using one-hot encoding


In [None]:
# Collect the object-typed columns and show which ones they are.
object_frame = df.select_dtypes(include=['object'])
categorical_cols = object_frame.columns
print("Categorical columns:", categorical_cols)

# One-hot encode the frame; drop_first=True omits the first level of each
# encoded column.
df = pd.get_dummies(data=df, drop_first=True)
df.head()


### 7. Save cleaned dataset


In [None]:
# Write the cleaned frame to disk without the index column, then confirm.
clean_data_path = '../data/clean_data.csv'
df.to_csv(clean_data_path, index=False)
print("Data cleaning complete. Cleaned dataset saved to '../data/clean_data.csv'")
