# ML Modeling Submission

This notebook builds two models:

1. **Model 1:** Full feature set
2. **Model 2:** Reduced feature set (using feature selection)

Both models will be saved as `.sav` files for future use.

# Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns

# Fix the seed of NumPy's global random number generator so that any
# NumPy-based randomness in this notebook is repeatable between runs.
np.random.seed(seed=42)

# Visible confirmation that the setup cell ran to completion.
print("Libraries imported successfully!")

# Load Preprocessed Data

In [None]:
# Load the preprocessed dataset.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv('data/processed/games_preprocessed.csv')

# `df` still contains the `popularity_class` target column at this point, so the
# second dimension is reported as a column count, not a feature count.
n_rows, n_cols = df.shape

print(f"Dataset shape: {df.shape}")
print(f"Total samples: {n_rows}")
print(f"Total columns (features + target): {n_cols}")
print("\nFirst few rows:")
df.head()

In [None]:
# Inspect how the samples are spread across the values of `popularity_class`:
# first as absolute counts, then as percentages of all rows.
print("Target variable distribution:")
class_counts = df['popularity_class'].value_counts()
print(class_counts)

print("\nPercentage distribution:")
class_percentages = df['popularity_class'].value_counts(normalize=True) * 100
print(class_percentages)

# Prepare Data for Modeling

Separate the features (`X`) from the target variable (`y`, the `popularity_class` column), then split the data into training and testing sets with a stratified 80/20 split.

In [None]:
# Model inputs: every column of `df` except the label column.
X = df.drop(columns=['popularity_class'])
# Model target: the label column on its own.
y = df['popularity_class']

# Summarise what was separated so the reader can confirm the shapes and the
# exact list of columns that will be fed to the model.
feature_names = X.columns.tolist()
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns ({len(feature_names)}):")
print(feature_names)

In [None]:
# Hold out 20% of the rows for testing (80/20 split).
# `stratify=y` asks scikit-learn to preserve the class proportions of `y` in both
# subsets, and the fixed `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Sanity check: the two subsets together account for every row of `X`.
assert len(X_train) + len(X_test) == len(X), "train/test split lost or duplicated rows"

print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")
# Plain strings here: these messages have no placeholders, so no f-prefix is needed.
print("\nTraining set class distribution:")
print(y_train.value_counts())
print("\nTesting set class distribution:")
print(y_test.value_counts())