# ML Model Building - Submission

This notebook demonstrates:
1. Loading preprocessed data with NLP embeddings
2. Feature selection to create a reduced feature set
3. Building two models:
   - **Model 1 (M1):** Full feature set
   - **Model 2 (M2):** Reduced feature set (selected features only)
4. Saving both models as `.sav` files for future use

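**Note:** This submission focuses on model building and saving. Each saved model is reloaded and given a quick accuracy and weighted-F1 check on the test split only to verify that it was saved correctly; full performance evaluation will be included in the next phase.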

# Setup and Imports

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
# import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Seed NumPy's global random number generator so repeated runs are reproducible
SEED = 42
np.random.seed(SEED)

# Emit the two setup confirmation lines
for status_line in ("âœ“ Libraries imported successfully!", "âœ“ Random seed set to: 42"):
    print(status_line)

âœ“ Libraries imported successfully!
âœ“ Random seed set to: 42


# Load Preprocessed Data

In [2]:
# Read the preprocessed games table (NLP embeddings included) from disk
df = pd.read_csv('data/processed/games_preprocessed.csv')

# Report the table dimensions: rows are samples, columns are features plus target
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Total samples: {n_rows:,}")
print(f"Total features (including target): {n_cols}")

print("\nFirst few rows:")
display(df.head())

# Total number of null cells across every column of the frame
total_missing = df.isnull().sum().sum()
print(f"\nMissing values: {total_missing}")

Dataset shape: (89302, 109)
Total samples: 89,302
Total features (including target): 109

First few rows:


Unnamed: 0,Price,DLC count,About the game,Windows,Mac,Linux,Achievements,Release_Year,Release_Month,Release_Day,...,Publishers_LTD,Publishers_EroticGamesClub,Publishers_Square_Enix,Publishers_Strategy_First,Publishers_HH_Games,Publishers_Choice_of_Games,Publishers_Sekai_Project,Publishers_Electronic_Arts,Publishers_Atomic_Fabrik,popularity_class
0,2.007898,-0.039159,[ 2.43655536e-02 -4.33482192e-02 -1.89679326e-...,True,False,False,0.057969,2008,10,21,...,0,0,0,0,0,0,0,0,0,Low
1,-0.963858,-0.039159,[-1.18375435e-01 6.85120896e-02 -8.45908746e-...,True,True,False,-0.04963,2017,10,12,...,0,0,0,0,0,0,0,0,0,Low
2,-0.338225,-0.039159,[-7.47546032e-02 -1.29440166e-02 2.83202082e-...,True,False,False,-0.121362,2021,11,17,...,0,0,0,0,0,0,0,0,0,Low
3,-0.181817,-0.039159,[ 2.98691001e-02 1.47273587e-02 5.98186301e-...,True,True,True,-0.121362,2020,7,23,...,0,0,0,0,0,0,0,0,0,Low
4,-1.118702,-0.039159,[-6.26481697e-02 7.47049646e-03 -5.16111143e-...,True,True,False,-0.019741,2020,2,3,...,0,0,0,0,0,0,0,0,0,Low



Missing values: 0


# Convert Embedding Strings to Numeric Arrays

The 'About the game' column contains embedding vectors stored as strings. We need to convert them back to numeric arrays.

In [3]:
import ast

# The 'About the game' column, when present, holds embedding vectors serialized
# as text; parse each cell back into a NumPy array.
if 'About the game' not in df.columns:
    print("'About the game' column not found in dataset")
else:
    print("Converting 'About the game' embeddings from strings to numeric arrays...")

    def string_to_array(s):
        """Parse a bracketed, space-separated string of numbers into a 1-D array.

        Values that are not strings are returned unchanged.
        """
        if not isinstance(s, str):
            return s
        # Strip the surrounding brackets, then split on whitespace
        return np.fromstring(s.strip('[]'), sep=' ')

    df['About the game'] = df['About the game'].apply(string_to_array)

    # Inspect the first row as a quick sanity check of the parsed result
    first_embedding = df['About the game'].iloc[0]
    print("âœ“ Conversion complete!")
    print(f"âœ“ Sample embedding shape: {first_embedding.shape}")
    print(f"âœ“ Sample embedding (first 10 values): {first_embedding[:10]}")

Converting 'About the game' embeddings from strings to numeric arrays...
âœ“ Conversion complete!
âœ“ Sample embedding shape: (384,)
âœ“ Sample embedding (first 10 values): [ 0.02436555 -0.04334822 -0.00189679 -0.03764986 -0.08963642  0.02961544
 -0.0579943   0.0187653   0.01877719  0.06303879]


In [4]:
# sklearn models need a flat 2-D numeric table rather than arrays stored inside
# cells, so each embedding vector is unpacked into one column per dimension.

if 'About the game' not in df.columns:
    print("No embedding expansion needed - 'About the game' not found")
else:
    print("Expanding embeddings into separate columns...")

    # One array per row; the first row's length sets the number of new columns
    embeddings_list = df['About the game'].tolist()
    n_dims = len(embeddings_list[0])
    embedding_cols = [f'embedding_{i}' for i in range(n_dims)]
    embeddings_df = pd.DataFrame(embeddings_list, columns=embedding_cols)

    # Replace the array-valued column with the expanded block, placed first
    remaining_df = df.drop(columns=['About the game'])
    df = pd.concat([embeddings_df, remaining_df], axis=1)

    print(f"âœ“ Expanded embeddings into {n_dims} numeric columns")
    print(f"âœ“ New dataset shape: {df.shape}")
    print(f"âœ“ First few column names: {df.columns[:5].tolist()}")
    print(f"âœ“ Last few column names: {df.columns[-5:].tolist()}")

Expanding embeddings into separate columns...
âœ“ Expanded embeddings into 384 numeric columns
âœ“ New dataset shape: (89302, 492)
âœ“ First few column names: ['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4']
âœ“ Last few column names: ['Publishers_Choice_of_Games', 'Publishers_Sekai_Project', 'Publishers_Electronic_Arts', 'Publishers_Atomic_Fabrik', 'popularity_class']


In [5]:
# Summarize how samples are spread across the popularity classes,
# first as absolute counts and then as percentages.
target_col = df['popularity_class']

class_counts = target_col.value_counts()
print("Target variable distribution:")
print(class_counts)

class_percentages = target_col.value_counts(normalize=True).mul(100)
print("\nPercentage distribution:")
print(class_percentages)

Target variable distribution:
popularity_class
Low       78429
Medium     8934
High       1939
Name: count, dtype: int64

Percentage distribution:
popularity_class
Low       87.824461
Medium    10.004255
High       2.171284
Name: proportion, dtype: float64


# Prepare Data for Modeling

Separate features (X) and target variable (y), then split into training and testing sets.

In [6]:
# The target is the 'popularity_class' column; every other column is a feature
y = df['popularity_class']
X = df.drop(columns=['popularity_class'])

n_features = X.shape[1]
print(f"Features (X) shape: {X.shape}")
print(f"Target (y) shape: {y.shape}")
print(f"\nTotal number of features: {n_features}")

# Count how many feature columns there are of each dtype
print("\nFeature data types:")
dtype_counts = X.dtypes.value_counts()
print(dtype_counts)

Features (X) shape: (89302, 491)
Target (y) shape: (89302,)

Total number of features: 491

Feature data types:
float64    387
int64      101
bool         3
Name: count, dtype: int64


# Handle Non-Numeric Columns

Check for and handle any remaining categorical/text columns that need to be encoded or removed.

In [7]:
# Report the dtype mix of X, then drop any object-dtype (string) columns
print("Checking for non-numeric columns...")
print("\nData types in features:")
print(X.dtypes.value_counts())

# Columns pandas stores with the generic object dtype
object_cols = list(X.select_dtypes(include='object').columns)

if not object_cols:
    print("âœ“ All columns are numeric!")
else:
    print(f"\nâš  Found {len(object_cols)} non-numeric columns:")
    for col in object_cols:
        # Show cardinality and a few non-null examples for each offending column
        n_unique = X[col].nunique()
        samples = X[col].dropna().head(3).tolist()
        print(f"  - {col}: {n_unique} unique values")
        print(f"    Sample values: {samples}")

    print(f"\nðŸ”§ Dropping non-numeric columns: {object_cols}")
    X = X.drop(columns=object_cols)
    print(f"âœ“ Remaining features: {X.shape[1]}")

print(f"\nFinal feature set shape: {X.shape}")
print(f"Final feature types:\n{X.dtypes.value_counts()}")

Checking for non-numeric columns...

Data types in features:
float64    387
int64      101
bool         3
Name: count, dtype: int64
âœ“ All columns are numeric!

Final feature set shape: (89302, 491)
Final feature types:
float64    387
int64      101
bool         3
Name: count, dtype: int64


In [8]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# distribution the same in both partitions; the fixed random_state makes the
# partition repeatable.
split_options = {'test_size': 0.2, 'random_state': 42, 'stratify': y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

n_train, n_test = X_train.shape[0], X_test.shape[0]
print(f"Training set size: {n_train} samples")
print(f"Testing set size: {n_test} samples")

Training set size: 71441 samples
Testing set size: 17861 samples


# Model 1: Random Forest with Full Feature Set

Train a Random Forest Classifier using all available features.

In [9]:
# Model 1 (full feature set): a 100-tree random forest with a fixed seed,
# using all available CPU cores (n_jobs=-1).
rf_params_M1 = {'n_estimators': 100, 'random_state': 42, 'n_jobs': -1}
model_M1 = RandomForestClassifier(**rf_params_M1)

print("Training Model 1 with full feature set...")
# Learn from the training partition only
model_M1.fit(X_train, y_train)
print("âœ“ Model 1 training complete!")

Training Model 1 with full feature set...
âœ“ Model 1 training complete!


In [10]:
# Save Model 1 to disk
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded.
import sklearn.metrics

filename_M1 = 'finalized_model_M1.sav'
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving that to garbage collection.
with open(filename_M1, 'wb') as model_file:
    pickle.dump(model_M1, model_file)
print(f"âœ“ Model 1 saved as: {filename_M1}")

# Load the model from disk to verify the round trip.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(filename_M1, 'rb') as model_file:
    loaded_model_M1 = pickle.load(model_file)
result_M1 = loaded_model_M1.score(X_test, y_test)
print(f"âœ“ Model 1 loaded and verified")
print(f"âœ“ Test score: {result_M1}")

# Use f1 score as well since accuracy may be insufficient for imbalanced classes
# f1 = 2 * [(precision*recall)/(precision+recall)]
# this way we can account for precision and recall as well as accuracy
y_pred = loaded_model_M1.predict(X_test)
modelF1Score = sklearn.metrics.f1_score(y_test, y_pred, average='weighted')
print(f"âœ“ F1score: {modelF1Score}")

âœ“ Model 1 saved as: finalized_model_M1.sav
âœ“ Model 1 loaded and verified
âœ“ Test score: 0.8839370695929679
âœ“ F1score: 0.8377274845027388


# Feature Selection

Use SelectKBest with the ANOVA F-statistic (`f_classif`) to keep the 20 highest-scoring features for Model 2.

In [11]:
print("Features before SelectKBest with ANOVA: ", X.shape)

# Fit the selector on the training partition only. Scoring features against
# all rows would let the held-out test rows influence which features are kept
# (data leakage), which inflates the reduced model's test score.
# The later re-split of X_new uses the same test_size, random_state and
# stratify arguments as the first split, so it yields the same train/test rows.
# The fitted selector is kept in a variable so the same 20 columns can be
# applied to new data later (selector.transform / selector.get_support()).
selector = SelectKBest(f_classif, k=20).fit(X_train, y_train)

# Apply the training-derived column choice to every row
X_new = selector.transform(X)

print("Features after SelectKBest with ANOVA:  ", X_new.shape)

Features before SelectKBest with ANOVA:  (89302, 491)
Features after SelectKBest with ANOVA:   (89302, 20)


# Model 2: Random Forest with Reduced Feature Set

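Re-split the data using `X_new`, the reduced feature matrix obtained from SelectKBest, with the same split parameters (80/20, `random_state=42`, stratified on `y`) used for Model 1.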

In [12]:
# Split the reduced feature matrix into training and testing sets (80/20 split)
# Use stratify to maintain class distribution in both sets
X_NewTrain, X_NewTest, y_NewTrain, y_NewTest = train_test_split(
    X_new, y, 
    test_size=0.2, 
    random_state=42, 
    stratify=y
)

# Report the sizes of the partitions created above, not those of Model 1's split
print(f"Training set size: {X_NewTrain.shape[0]} samples")
print(f"Testing set size: {X_NewTest.shape[0]} samples")

Training set size: 71441 samples
Testing set size: 17861 samples


Train a Random Forest Classifier using only the selected (20 best) features.

In [None]:
# Initialize the Random Forest Classifier for Model 2 (Best Features)
# Same hyperparameters as Model 1, so the two models differ only in their inputs
model_M2 = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

# Model 2 is trained on the reduced (SelectKBest) feature set, not the full one
print("Training Model 2 with reduced feature set...")
# Fit the model on training set
model_M2.fit(X_NewTrain, y_NewTrain)
print("âœ“ Model 2 training complete!")

Training Model 2 with full feature set...
âœ“ Model 2 training complete!


In [None]:
# Save Model 2 to disk
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded.
import sklearn.metrics

filename_M2 = 'finalized_model_M2.sav'
# The context manager closes the file handle as soon as the dump finishes,
# instead of leaving that to garbage collection.
with open(filename_M2, 'wb') as model_file:
    pickle.dump(model_M2, model_file)
print(f"âœ“ Model 2 saved as: {filename_M2}")

# Load the model from disk to verify the round trip.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote.
with open(filename_M2, 'rb') as model_file:
    loaded_model_M2 = pickle.load(model_file)
# Model 2 was trained on the reduced features, so it must be scored on X_NewTest
result_M2 = loaded_model_M2.score(X_NewTest, y_NewTest)
print(f"âœ“ Model 2 loaded and verified")
print(f"âœ“ Test score: {result_M2}")

# Use f1 score as well since accuracy may be insufficient for imbalanced classes
# f1 = 2 * [(precision*recall)/(precision+recall)]
# this way we can account for precision and recall as well as accuracy
y_NewPred = loaded_model_M2.predict(X_NewTest)
model2F1Score = sklearn.metrics.f1_score(y_NewTest, y_NewPred, average='weighted')
print(f"âœ“ F1score: {model2F1Score}")

âœ“ Model 1 saved as: finalized_model_M2.sav


ValueError: X has 491 features, but RandomForestClassifier is expecting 20 features as input.