# ML Model Building - Submission

This notebook demonstrates:
1. Loading preprocessed data with NLP embeddings
2. Feature selection to create a reduced feature set
3. Building two models:
   - **Model 1 (M1):** Full feature set
   - **Model 2 (M2):** Reduced feature set (selected features only)
4. Saving both models as `.sav` files for future use

**Note:** This submission focuses on model building and saving. Performance evaluation will be included in the next phase.

# Setup and Imports

In [None]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Set random seed for reproducibility.
# The seed lives in one constant so the value that is logged below can never
# drift from the value actually applied to NumPy's global random state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

print("âœ“ Libraries imported successfully!")
print(f"âœ“ Random seed set to: {RANDOM_SEED}")

# Load Preprocessed Data

In [None]:
# Read the preprocessed games table (NLP embeddings included) from disk.
df = pd.read_csv('data/processed/games_preprocessed.csv')

# Report the table dimensions; the target column is counted among the columns.
n_samples, n_columns = df.shape
print(
    f"Dataset shape: {df.shape}",
    f"Total samples: {n_samples:,}",
    f"Total features (including target): {n_columns}",
    "\nFirst few rows:",
    sep="\n",
)
display(df.head())

# Grand total of null cells across every column.
total_missing = df.isnull().sum().sum()
print(f"\nMissing values: {total_missing}")

# Convert Embedding Strings to Numeric Arrays

The 'About the game' column contains embedding vectors stored as strings. We need to convert them back to numeric arrays.

In [None]:
import ast  # NOTE(review): not used in this cell; kept in case other cells rely on it


def string_to_array(s):
    """Parse a stringified embedding vector into a 1-D float64 NumPy array.

    Accepts whitespace-separated (NumPy repr) or comma-separated (list repr)
    values, optionally wrapped in square brackets and surrounding whitespace.

    Parameters
    ----------
    s : str or any
        The cell value. Non-string values are returned unchanged.

    Returns
    -------
    numpy.ndarray or the original value
        A float64 array for string input; ``s`` itself otherwise.

    Raises
    ------
    ValueError
        If any token cannot be converted to float. Failing loudly is deliberate:
        ``np.fromstring`` in text mode stops at the first bad token and returns a
        truncated array, and the warning it emits is silenced by the notebook's
        global ``warnings.filterwarnings('ignore')``.
    """
    if not isinstance(s, str):
        return s
    # Trim outer whitespace first so the bracket strip actually reaches the brackets,
    # then treat commas as plain separators.
    tokens = s.strip().strip('[]').replace(',', ' ').split()
    return np.array(tokens, dtype=float)


# Check if 'About the game' column exists and contains string representations of arrays
if 'About the game' in df.columns:
    print("Converting 'About the game' embeddings from strings to numeric arrays...")

    # Convert string representations to actual arrays
    df['About the game'] = df['About the game'].apply(string_to_array)

    print(f"âœ“ Conversion complete!")
    print(f"âœ“ Sample embedding shape: {df['About the game'].iloc[0].shape}")
    print(f"âœ“ Sample embedding (first 10 values): {df['About the game'].iloc[0][:10]}")
else:
    print("'About the game' column not found in dataset")

In [None]:
# Expand embeddings into separate columns
# This is necessary because sklearn models need 2D numeric arrays, not arrays within cells

if 'About the game' in df.columns:
    print("Expanding embeddings into separate columns...")

    # Convert the 'About the game' column (which contains arrays) into separate columns.
    # np.vstack raises if the rows do not all have the same length (e.g. a value that
    # was never parsed into an array), instead of silently producing a malformed frame.
    embeddings_list = df['About the game'].tolist()
    embedding_matrix = np.vstack(embeddings_list)
    embedding_dim = embedding_matrix.shape[1]

    # Reuse df's index: pd.concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows whenever df's index is not 0..n-1.
    embeddings_df = pd.DataFrame(
        embedding_matrix,
        index=df.index,
        columns=[f'embedding_{i}' for i in range(embedding_dim)],
    )

    # Drop the original 'About the game' column
    df = df.drop('About the game', axis=1)

    # Insert embedding columns at the beginning
    df = pd.concat([embeddings_df, df], axis=1)

    print(f"âœ“ Expanded embeddings into {embedding_dim} numeric columns")
    print(f"âœ“ New dataset shape: {df.shape}")
    print(f"âœ“ First few column names: {df.columns[:5].tolist()}")
    print(f"âœ“ Last few column names: {df.columns[-5:].tolist()}")
else:
    print("No embedding expansion needed - 'About the game' not found")

In [None]:
# Show how the target classes are distributed: raw counts first, then percentages
popularity = df['popularity_class']
print("Target variable distribution:", popularity.value_counts(), sep="\n")
print("\nPercentage distribution:", popularity.value_counts(normalize=True) * 100, sep="\n")

# Prepare Data for Modeling

Separate the features (X) from the target variable (y). The split into training and testing sets happens after any non-numeric columns have been handled in the next section.

In [None]:
# Pull the label column out of the frame; everything else becomes the feature matrix
target_column = 'popularity_class'
y = df[target_column]
X = df.drop(columns=[target_column])

print(
    f"Features (X) shape: {X.shape}",
    f"Target (y) shape: {y.shape}",
    f"\nTotal number of features: {X.shape[1]}",
    "\nFeature data types:",
    sep="\n",
)
print(X.dtypes.value_counts())

# Handle Non-Numeric Columns

Check for any remaining non-numeric (categorical/text) columns. In this notebook such columns are dropped rather than encoded, so that the feature matrix passed to the model is fully numeric.

In [None]:
# Check for non-numeric columns in X
print("Checking for non-numeric columns...")
print(f"\nData types in features:")
print(X.dtypes.value_counts())

# Identify every column that is neither numeric nor boolean. Selecting only
# 'object' would let other non-numeric dtypes (category, datetime, dedicated
# string dtypes) slip through to model fitting. The variable keeps its original
# name so any later reference to `object_cols` still resolves.
object_cols = X.select_dtypes(exclude=['number', 'bool']).columns.tolist()

if object_cols:
    print(f"\nâš  Found {len(object_cols)} non-numeric columns:")
    for col in object_cols:
        print(f"  - {col}: {X[col].nunique()} unique values")
        print(f"    Sample values: {X[col].dropna().head(3).tolist()}")

    # These columns are removed, not encoded; their information is lost to the model.
    print(f"\nðŸ”§ Dropping non-numeric columns: {object_cols}")
    X = X.drop(columns=object_cols)
    print(f"âœ“ Remaining features: {X.shape[1]}")
else:
    print("âœ“ All columns are numeric!")

print(f"\nFinal feature set shape: {X.shape}")
print(f"Final feature types:\n{X.dtypes.value_counts()}")

In [None]:
# Hold out 20% of the rows for testing (80/20 split).
# Stratifying on y maintains the class distribution in both sets.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for split_name, split_features in (("Training", X_train), ("Testing", X_test)):
    print(f"{split_name} set size: {split_features.shape[0]} samples")

# Model 1: Random Forest with Full Feature Set

Train a Random Forest Classifier using all available features.

In [None]:
# Model 1 (full feature set): a 100-tree random forest with a fixed seed;
# n_jobs=-1 asks scikit-learn to parallelise across all available processors.
rf_settings = dict(n_estimators=100, random_state=42, n_jobs=-1)
model_M1 = RandomForestClassifier(**rf_settings)

# Fit on the training split only; the test split is held back
print("Training Model 1 with full feature set...")
model_M1.fit(X_train, y_train)
print("âœ“ Model 1 training complete!")

In [None]:
# Save Model 1 to disk
filename_M1 = 'finalized_model_M1.sav'
# The context manager guarantees the handle is flushed and closed before the file
# is read back below, instead of relying on garbage collection to close it.
with open(filename_M1, 'wb') as model_file:
    pickle.dump(model_M1, model_file)
print(f"âœ“ Model 1 saved as: {filename_M1}")

# Load the model from disk to verify the round trip.
# NOTE: pickle.load can execute arbitrary code; only load .sav files you created
# yourself or otherwise trust.
with open(filename_M1, 'rb') as model_file:
    loaded_model_M1 = pickle.load(model_file)
result_M1 = loaded_model_M1.score(X_test, y_test)
print(f"âœ“ Model 1 loaded and verified")
print(f"âœ“ Test score: {result_M1}")

# Feature Selection

Use SelectKBest with ANOVA F-statistic to select the most important features for Model 2.

# Model 2: Random Forest with Reduced Feature Set

Train a Random Forest Classifier using only the selected features.