In [None]:
# Exploratory Data Analysis (EDA) for Gaming Forecast Project

This notebook explores the cleaned datasets prepared for predicting breakout trends in video games.

We will:
- Inspect raw and cleaned data
- Understand structure and columns
- Visualize trends (genre, platform, yearly sales)
- Define the target variable (Breakout = top 10% of games by global sales)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Paths of the processed CSV files, relative to the notebook's working directory
vgsales_csv = "../data/processed/vgsales_cleaned.csv"
topgross_csv = "../data/processed/topgrossing_cleaned.csv"

# Read each cleaned dataset into its own DataFrame
vgsales = pd.read_csv(vgsales_csv)
topgross = pd.read_csv(topgross_csv)

# Preview the first rows of the video game sales table
vgsales.head()


In [None]:
# Overall dimensions as (rows, columns)
print(f"VGSales shape: {vgsales.shape}")

# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly; wrapping it in print() appended a stray "None".
vgsales.info()

# Summary statistics for every column (numeric and non-numeric), transposed so
# that each original column becomes one row of the summary
vgsales.describe(include='all').transpose()


In [None]:
# Show 10 random rows to visualize data variety.
# random_state pins the draw so that Restart & Run All displays the same rows.
vgsales.sample(10, random_state=42)


In [None]:
# Label the breakout target: a game is flagged 1 when its Global_Sales value
# reaches the 0.9 quantile of the column (values equal to the cutoff count),
# and 0 otherwise.
global_sales = vgsales['Global_Sales']
threshold = global_sales.quantile(0.9)
vgsales['Breakout'] = global_sales.ge(threshold).astype(int)

# Preview the new label next to the columns it was derived from
vgsales.loc[:, ['Name', 'Global_Sales', 'Breakout']].head(10)


In [None]:
# Genres ordered by how many rows they have (value_counts sorts descending)
genre_order = vgsales['Genre'].value_counts().index

# One bar per genre, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 5))
sns.countplot(data=vgsales, x='Genre', order=genre_order, ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the genre names on the x-axis
ax.set_title("Number of Games per Genre")
plt.show()


In [None]:
# Sum Global_Sales over all rows that share the same Year value
sales_by_year = vgsales.groupby('Year')['Global_Sales'].sum()

# Bar chart of the yearly totals on an explicitly created axes
fig, ax = plt.subplots(figsize=(12, 6))
sales_by_year.plot.bar(ax=ax)
ax.set_title("Total Global Sales by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Sales (Millions)")
plt.show()


In [None]:
# Total Global_Sales per platform, largest first, keeping the ten biggest
platform_totals = vgsales.groupby('Platform')['Global_Sales'].sum()
top_platforms = platform_totals.sort_values(ascending=False).head(10)

# Bar chart of those ten platforms on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 5))
top_platforms.plot.bar(ax=ax)
ax.set_title("Top 10 Platforms by Global Sales")
ax.set_ylabel("Total Sales (Millions)")
plt.show()
