# 4 or more ML Classifiers - Submission

This notebook reuses the setup and data-loading steps from the `first_model` notebook and then adds a section for each new model. Planned models: Logistic Regression, XGBoost, and SVM. Other models are open for consideration.

<h3>Setup and Imports</h3>

In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
import matplotlib.pyplot as plt
# import seaborn as sns
import warnings
# NOTE(review): this silences every warning category for the rest of the
# notebook; consider narrowing the filter to the specific warnings that are noisy.
warnings.filterwarnings('ignore')

# Single source of truth for the random seed, so the seeding call and the
# message printed below cannot drift apart.
SEED = 42

# Seed NumPy's global random number generator for reproducibility
np.random.seed(SEED)

print("✓ Libraries imported successfully!")
print(f"✓ Random seed set to: {SEED}")

✓ Libraries imported successfully!
✓ Random seed set to: 42


<h1>Load Preprocessed Data</h1>
<b>Load data and perform embedded vector conversion from string to numerical</b>

In [2]:
# Read the preprocessed games table (it includes the NLP embedding column).
# NOTE(review): the captured preview shows an 'Unnamed: 0' column, which looks like
# a row index written out with the CSV — confirm it is not used as a feature later.
df = pd.read_csv('data/processed/games_preprocessed.csv')

# Report the size of the loaded frame
n_rows, n_cols = df.shape
print(f"Dataset shape: {df.shape}")
print(f"Total samples: {n_rows:,}")
print(f"Total features (including target): {n_cols}")

# Preview the first rows using the notebook's rich display
print("\nFirst few rows:")
display(df.head())

# Total number of missing cells across the whole frame
missing_total = df.isna().sum().sum()
print(f"\nMissing values: {missing_total}")

Dataset shape: (89302, 109)
Total samples: 89,302
Total features (including target): 109

First few rows:


Unnamed: 0,Price,DLC count,About the game,Windows,Mac,Linux,Achievements,Release_Year,Release_Month,Release_Day,...,Publishers_LTD,Publishers_EroticGamesClub,Publishers_Square_Enix,Publishers_Strategy_First,Publishers_HH_Games,Publishers_Choice_of_Games,Publishers_Sekai_Project,Publishers_Electronic_Arts,Publishers_Atomic_Fabrik,popularity_class
0,2.007898,-0.039159,[ 2.43655536e-02 -4.33482192e-02 -1.89679326e-...,True,False,False,0.057969,2008,10,21,...,0,0,0,0,0,0,0,0,0,Low
1,-0.963858,-0.039159,[-1.18375435e-01 6.85120896e-02 -8.45908746e-...,True,True,False,-0.04963,2017,10,12,...,0,0,0,0,0,0,0,0,0,Low
2,-0.338225,-0.039159,[-7.47546032e-02 -1.29440166e-02 2.83202082e-...,True,False,False,-0.121362,2021,11,17,...,0,0,0,0,0,0,0,0,0,Low
3,-0.181817,-0.039159,[ 2.98691001e-02 1.47273587e-02 5.98186301e-...,True,True,True,-0.121362,2020,7,23,...,0,0,0,0,0,0,0,0,0,Low
4,-1.118702,-0.039159,[-6.26481697e-02 7.47049646e-03 -5.16111143e-...,True,True,False,-0.019741,2020,2,3,...,0,0,0,0,0,0,0,0,0,Low



Missing values: 0


<b>Conversion from string embedded vectors to numeric embedded vectors</b>

In [3]:
import ast

def string_to_array(s):
    """Parse a stringified 1-D numeric vector such as '[ 0.1 -0.2 3e-4]'.

    Parameters
    ----------
    s : str or any
        Text holding whitespace-separated numbers, optionally wrapped in square
        brackets (line breaks count as whitespace). Non-string values are
        returned unchanged, so re-running the conversion is harmless.

    Returns
    -------
    numpy.ndarray or the original object
        A 1-D float array for string input; ``s`` itself otherwise.

    Raises
    ------
    ValueError
        If any token is not a valid number (for example a '...' truncation marker).
    """
    if isinstance(s, str):
        # Tokenize explicitly rather than using np.fromstring(..., sep=' '):
        # fromstring can stop at the first unparsable token with only a
        # DeprecationWarning, which would yield a silently shortened vector.
        return np.array(s.strip('[]').split(), dtype=float)
    return s


# Convert the stringified embeddings only when the column is present
if 'About the game' in df.columns:
    print("Converting 'About the game' embeddings from strings to numeric arrays...")

    df['About the game'] = df['About the game'].apply(string_to_array)

    # Every row must carry a vector of the same length; a ragged column cannot be
    # turned into a clean table with one numeric column per embedding dimension.
    embedding_sizes = df['About the game'].map(np.size)
    if embedding_sizes.nunique() > 1:
        raise ValueError(
            "Inconsistent embedding lengths in 'About the game': "
            f"{sorted(embedding_sizes.unique().tolist())}"
        )

    print("✓ Conversion complete!")
    print(f"✓ Sample embedding shape: {df['About the game'].iloc[0].shape}")
    print(f"✓ Sample embedding (first 10 values): {df['About the game'].iloc[0][:10]}")
else:
    print("'About the game' column not found in dataset")

Converting 'About the game' embeddings from strings to numeric arrays...
✓ Conversion complete!
✓ Sample embedding shape: (384,)
✓ Sample embedding (first 10 values): [ 0.02436555 -0.04334822 -0.00189679 -0.03764986 -0.08963642  0.02961544
 -0.0579943   0.0187653   0.01877719  0.06303879]


<b>Expand the numeric embedded vectors into separate columns, replacing the 'About the game' column</b>

In [4]:
# Expand embeddings into separate columns
# This is necessary because sklearn models need 2D numeric arrays, not arrays within cells

if 'About the game' in df.columns:
    print("Expanding embeddings into separate columns...")

    # One array per row; the first row determines the number of embedding columns
    embeddings_list = df['About the game'].tolist()
    embedding_dim = len(embeddings_list[0])

    # Build the embedding frame on df's own index: pd.concat(axis=1) aligns on
    # index labels, so a fresh 0..n-1 index would misalign rows (and introduce
    # NaNs) whenever df does not have a default index.
    embeddings_df = pd.DataFrame(
        embeddings_list,
        columns=[f'embedding_{i}' for i in range(embedding_dim)],
        index=df.index,
    )

    # Drop the original array-valued 'About the game' column
    df = df.drop('About the game', axis=1)

    # Insert embedding columns at the beginning
    df = pd.concat([embeddings_df, df], axis=1)

    print(f"✓ Expanded embeddings into {embedding_dim} numeric columns")
    print(f"✓ New dataset shape: {df.shape}")
    print(f"✓ First few column names: {df.columns[:5].tolist()}")
    print(f"✓ Last few column names: {df.columns[-5:].tolist()}")
else:
    print("No embedding expansion needed - 'About the game' not found")

Expanding embeddings into separate columns...
✓ Expanded embeddings into 384 numeric columns
✓ New dataset shape: (89302, 492)
✓ First few column names: ['embedding_0', 'embedding_1', 'embedding_2', 'embedding_3', 'embedding_4']
✓ Last few column names: ['Publishers_Choice_of_Games', 'Publishers_Sekai_Project', 'Publishers_Electronic_Arts', 'Publishers_Atomic_Fabrik', 'popularity_class']


In [5]:
# Inspect the class balance of the target: absolute counts first, then percentages
popularity_labels = df['popularity_class']

print("Target variable distribution:")
print(popularity_labels.value_counts())

print("\nPercentage distribution:")
print(popularity_labels.value_counts(normalize=True).mul(100))

Target variable distribution:
popularity_class
Low       78429
Medium     8934
High       1939
Name: count, dtype: int64

Percentage distribution:
popularity_class
Low       87.824461
Medium    10.004255
High       2.171284
Name: proportion, dtype: float64


<h1>Prepared Raw and Processed Data for Models</h1>