In [41]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Load your dataset
df = pd.read_csv('vgsales_consolidated.csv')  # Replace with your actual dataset

# Select relevant columns
df = df[['Name', 'Genre', 'Publisher', 'Year', 'Global_Sales', 'Decade', 'FranchiseTag']].dropna()
df = df.reset_index(drop=True)

# summary of df columns
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11338 entries, 0 to 11337
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Name          11338 non-null  object 
 1   Genre         11338 non-null  object 
 2   Publisher     11338 non-null  object 
 3   Year          11338 non-null  int64  
 4   Global_Sales  11338 non-null  float64
 5   Decade        11338 non-null  int64  
 6   FranchiseTag  11338 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 620.2+ KB
None


In [42]:
# Encode categorical features
genre_le = LabelEncoder()
publisher_le = LabelEncoder()

df['Genre_enc'] = genre_le.fit_transform(df['Genre'])
df['Publisher_enc'] = publisher_le.fit_transform(df['Publisher'])

# Scale numerical features
scaler = StandardScaler()
num_features = scaler.fit_transform(df[['Year']])

In [43]:
df.head()

Unnamed: 0,Name,Genre,Publisher,Year,Global_Sales,Decade,FranchiseTag,Genre_enc,Publisher_enc
0,007 quantum of solace,action,activision,2008,3.92,2000,0,0,15
1,007 racing,racing,electronic arts,2000,0.53,2000,0,6,136
2,007 the world is not enough,action,electronic arts,2000,2.47,2000,0,0,136
3,007 tomorrow never dies,shooter,electronic arts,1999,3.21,1990,0,8,136
4,1 vs 100,misc,dsi games,2008,0.09,2000,0,3,129


In [44]:
from transformers import BertTokenizer, BertModel
import torch

# Load BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize titles
titles = df['Name'].tolist()
inputs = tokenizer(titles, padding=True, truncation=True, return_tensors="pt")

# Generate embeddings
with torch.no_grad():
    outputs = bert_model(**inputs)

# Use [CLS] token embedding
title_embeddings = outputs.last_hidden_state[:, 0, :].numpy()  # shape: (num_samples, 768)


  from .autonotebook import tqdm as notebook_tqdm


In [45]:
import numpy as np
import pyarrow as pa
import pyarrow.parquet as pq

# assume title_embeddings is a numpy array shape (N, D) dtype float32
X = title_embeddings.astype(np.float32)  # (N, D)
ids = [f"doc_{i}" for i in range(X.shape[0])]
meta = df['Name'].astype(str).tolist()    # or other metadata

# Build pandas DataFrame with embedding as Python list per row
df_parquet = pd.DataFrame({
    "id": ids,
    "title": meta,
    "embedding": X.tolist()   # list[float]
})

# Create explicit pyarrow schema using list<float32>
pa_schema = pa.schema([
    pa.field("id", pa.string()),
    pa.field("title", pa.string()),
    pa.field("embedding", pa.list_(pa.float32()))
])

table = pa.Table.from_pandas(df_parquet, schema=pa_schema, preserve_index=False)
pq.write_table(table, "title_embeddings.parquet", compression="zstd", row_group_size=100_000)

In [46]:
import pyarrow.dataset as ds
import numpy as np

dataset = ds.dataset("title_embeddings.parquet", format="parquet")

for batch in dataset.to_batches(batch_size=50_000, columns=["id","title","embedding"]):
    pdf = batch.to_pandas()
    emb_batch = np.vstack(pdf["embedding"].values).astype(np.float32)  # (B, D)
    # use emb_batch for indexing or inference

In [47]:
# extracting column from pandas DataFrame
ids = pdf['id'].tolist()
titles = pdf['title']
embeddings = np.vstack(pdf['embedding'].values).astype(np.float32)  # (N, D)

In [48]:
embeddings.shape  # (N, D)

(11338, 768)

In [49]:
import numpy as np

# Combine all features
X = np.concatenate([
    titles.values.reshape(-1, 1),
    embeddings,
    df[['Genre_enc', 'Publisher_enc', 'FranchiseTag']].values,
    num_features
], axis=1)

y = df['Global_Sales'].values
print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)

Feature matrix shape: (11338, 773)
Target vector shape: (11338,)


In [50]:
# Perform 80-20 train-test split
from sklearn.model_selection import train_test_split

X_train_name, X_test_name, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

X_train = X_train_name[:, 1:].astype(np.float32)  # Exclude title column for model training
X_test = X_test_name[:, 1:].astype(np.float32)    # Exclude title column for model testing

print("Training set shape:", X_train.shape, y_train.shape)
print("Test set shape:", X_test.shape, y_test.shape)

Training set shape: (9070, 772) (9070,)
Test set shape: (2268, 772) (2268,)


In [52]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(1, activation='linear')  # Regression output
])

model.compile(optimizer='adam', loss='mse', metrics=['mae'])
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)


Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 27.9874 - mae: 3.0827 - val_loss: 5.2270 - val_mae: 1.4799
Epoch 2/50
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 7.7737 - mae: 1.5197 - val_loss: 4.2591 - val_mae: 1.0753
Epoch 3/50
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 6.2699 - mae: 1.2112 - val_loss: 4.1778 - val_mae: 1.0332
Epoch 4/50
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 5.5871 - mae: 1.0853 - val_loss: 3.9746 - val_mae: 0.8213
Epoch 5/50
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 5.3853 - mae: 1.0343 - val_loss: 4.5934 - val_mae: 1.2618
Epoch 6/50
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 5.1430 - mae: 0.9875 - val_loss: 4.2071 - val_mae: 0.9014
Epoch 7/50
[1m227/227[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 4.90

<keras.src.callbacks.history.History at 0x23de2929bd0>

In [64]:
predictions = model.predict(X_test)

# Constructing dataframe of titles, global sales, and predictions
results_df = pd.DataFrame({
    "Title": X_test_name[:, 0],  # Titles from the name-included test set
    "Actual_Global_Sales": y_test,
    "Predicted_Global_Sales": predictions.flatten()
})

# viewing model predictions
results_df

[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  


Unnamed: 0,Title,Actual_Global_Sales,Predicted_Global_Sales
0,chronicles of mystery the secret tree of life,0.08,0.388166
1,back at the barnyard slop bucket games,0.14,0.375769
2,earth defense force 2025,0.51,0.402670
3,scoobydoo unmasked,0.75,0.471825
4,mlb 10 the show,1.32,0.549422
...,...,...,...
2263,ben 10 omniverse 2,0.03,1.269474
2264,medabots metabee,0.05,0.466737
2265,katekyoo hitman reborn kindan no yami no delta,0.03,0.085962
2266,gaia saver hero saidai no sakusen,0.03,0.245511


In [54]:
# Displaying top 10 highest global sales out of the test set
top_10 = results_df.sort_values(by="Actual_Global_Sales", ascending=False).head(10)
top_10

Unnamed: 0,Title,Actual_Global_Sales,Predicted_Global_Sales
1852,pokemon redpokemon blue,31.37,7.121071
743,new super mario bros wii,28.62,8.496074
1480,call of duty black ops 3,25.32,28.27309
982,nintendogs,24.76,6.165191
97,wii fit,22.72,0.559672
175,call of duty advanced warfare,21.9,21.513552
1576,fifa 16,16.44,6.566523
575,fifa soccer 13,16.16,17.964226
1395,call of duty world at war,15.87,1.731981
1970,pokemon blackpokemon white,15.32,3.90045


In [55]:
# Displaying top 10 predicted sales out of the test set
top_10_pred = results_df.sort_values(by="Predicted_Global_Sales", ascending=False).head(10)
top_10_pred

Unnamed: 0,Title,Actual_Global_Sales,Predicted_Global_Sales
1480,call of duty black ops 3,25.32,28.27309
175,call of duty advanced warfare,21.9,21.513552
575,fifa soccer 13,16.16,17.964226
1108,grand theft auto 2,3.42,15.451985
1643,the sims 4,2.97,12.058343
603,fifa soccer 07,6.38,11.409861
1660,assassins creed iii,13.1,11.178028
2025,super mario advance,5.49,10.861039
1532,madden nfl 09,7.14,8.524139
743,new super mario bros wii,28.62,8.496074


In [59]:
# Displaying bottom 10 predicted sales out of the test set
bottom_10_pred = results_df.sort_values(by="Predicted_Global_Sales", ascending=True).head(10)
bottom_10_pred

Unnamed: 0,Title,Actual_Global_Sales,Predicted_Global_Sales
1032,higurashi no nakukoru ni kizuna daiichikan ta...,0.07,0.001214
323,shitsuji ga aruji o erabu toki,0.01,0.010169
1598,oshiri kajiri mushi no rhythm lesson ds kawai ...,0.02,0.020956
413,kanshuu nippon joushikiryoku kentei kyoukai im...,1.71,0.039816
1341,sengoku musou 3 z special,0.04,0.044458
389,super run for money tousouchuu atsumare saikyo...,0.11,0.045531
2089,ds yamamura misa suspense maiko kogiku kisha ...,0.12,0.046416
25,sangoku koi senki omoide gaeshi cs edition,0.01,0.060601
1923,harukanaru toki no naka de yumenoukihashi,0.05,0.0609
2129,nobunagas ambition tenshouki with powerup kit ...,0.02,0.062068


In [60]:
# Displaying bottom 10 global sales out of the test set
bottom_10 = results_df.sort_values(by="Actual_Global_Sales", ascending=True).head(10)
bottom_10

Unnamed: 0,Title,Actual_Global_Sales,Predicted_Global_Sales
25,sangoku koi senki omoide gaeshi cs edition,0.01,0.060601
1038,thunder alley,0.01,0.520494
1039,satomi hakkenden hachi tamanoki,0.01,0.160552
1725,himawari pebble in the sky portable,0.01,0.48546
1731,monster rancher advance 2,0.01,0.281185
1734,legoland,0.01,3.335294
503,mario luigi paper jam mario kart 7 double pack,0.01,0.44202
1840,tengai makyo dai yon no mokushiroku,0.01,0.185748
425,pachitte chonmage tatsujin 16 pachinko hissats...,0.01,0.222706
1708,il2 sturmovik,0.01,0.457793


In [56]:
# calculating test MAE
from sklearn.metrics import mean_absolute_error
mae = mean_absolute_error(y_test, predictions)
print("Test MAE:", mae)

Test MAE: 0.6983064030552353
