In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Gamalytic reference tables live next to the notebook.
df_genres = pd.read_csv('./gamalytic_genres.csv')
df_publishers = pd.read_csv('./gamalytic_publishers.csv')
df_subgenres = pd.read_csv('./gamalytic_sub-genres_attributes.csv')
df_history = pd.read_csv('./gamalytic_years_game_revenue.csv')

# Top 100 table (one directory up) plus three derived review/revenue columns.
# Later keyword arguments of `assign` can read columns created by earlier
# ones, so `estimated_revenue` sees the fresh `total_reviews`.
df_top100 = pd.read_csv('../steam_top100_cleaned_data_corrected.csv').assign(
    # Zero denominators are swapped for 1 so games without reviews get a ratio of 0.
    positive_ratio=lambda d: d['positive'] / (d['positive'] + d['negative']).replace(0, 1),
    total_reviews=lambda d: d['positive'] + d['negative'],
    estimated_revenue=lambda d: d['price'] * d['total_reviews'],
)

In [None]:
# Compare the review volume each genre collects inside the Top 100 with the
# genre-level average revenue from the genre table.
df_top100['total_reviews'] = df_top100['positive'].add(df_top100['negative'])

# One row per genre: mean price and summed review count of its Top 100 games.
top100_genre_grouped = df_top100.groupby('genre', as_index=False).agg(
    price=('price', 'mean'),
    total_reviews=('total_reviews', 'sum'),
)

# Inner join: only genres present in both tables are plotted.
merged_genre = top100_genre_grouped.merge(df_genres, left_on='genre', right_on='label')

fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=merged_genre,
    x='averageRevenue',
    y='total_reviews',
    size='numberOfGames',
    hue='genre',
    legend=False,
    ax=ax,
)
ax.set_title('Top 100 Review Volume vs. Genre-Level Average Revenue')
ax.set_xlabel('Average Revenue (from All Games in Genre)')
ax.set_ylabel('Total Reviews (Top 100)')
ax.grid(True)
plt.show()

In [None]:
# Per-publisher summary of the Top 100: number of titles, mean price and
# summed positive/negative review counts.
publisher_top100 = df_top100.groupby('publisher', as_index=False).agg(
    top100_count=('appid', 'count'),
    price=('price', 'mean'),
    positive=('positive', 'sum'),
    negative=('negative', 'sum'),
)

# Keep only publishers that also appear in the publisher table.
merged_pub = publisher_top100.merge(
    df_publishers, left_on='publisher', right_on='name', how='inner'
)

# Share of positive reviews across all of the publisher's Top 100 games.
publisher_review_total = merged_pub['positive'] + merged_pub['negative']
merged_pub['positive_ratio'] = merged_pub['positive'] / publisher_review_total

fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=merged_pub,
    x='averageRevenue',
    y='positive_ratio',
    size='top100_count',
    hue='publisher',
    legend=False,
    ax=ax,
)
ax.set_title('Publisher Average Revenue vs. Positive Review Ratio (Top 100)')
ax.set_xlabel('Average Revenue (All Games from Publisher)')
ax.set_ylabel('Positive Review Ratio (Top 100 Games)')
ax.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Coerce the year column to numbers; anything unparseable becomes NaN.
df_history['label'] = pd.to_numeric(df_history['label'], errors='coerce')

# Keep the single 1970 row plus everything from 2000 onwards.
year = df_history['label']
df_filtered = df_history[year.eq(1970) | year.ge(2000)]

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_filtered, x='label', y='totalRevenue', ax=ax)

# Tick at 1970, then every second year from 2000 through 2024.
x_ticks = [1970, *range(2000, 2026, 2)]
plt.xticks(ticks=x_ticks, rotation=45)

ax.set_title('Total Revenue Over Time (1970 and 2000–2025)')
ax.set_xlabel('Year')
ax.set_ylabel('Total Revenue')
ax.grid(True)
fig.tight_layout()
plt.show()



In [None]:
# Three most frequent Top 100 genres and the matching genre-table rows.
# NOTE(review): neither variable feeds the plot in this cell.
genre_frequency = df_top100['genre'].value_counts()
top_genres = genre_frequency.nlargest(3).index.tolist()

df_genres_trimmed = df_genres[df_genres['label'].isin(top_genres)]

# Unfiltered revenue history: every row of df_history is drawn.
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=df_history, x='label', y='totalRevenue', ax=ax)
ax.set_title('Total Revenue Over Time (All Games)')
ax.set_xlabel('Year')
ax.set_ylabel('Total Revenue')
ax.grid(True)
plt.show()


In [None]:
# Number of Top 100 games per genre as a two-column frame.
top100_counts = (
    df_top100['genre']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='top100_count')
)

# Inner join against the genre table to pick up its revenue and game counts.
genre_merged = top100_counts.merge(df_genres, left_on='genre', right_on='label')

# Colour encodes the genre, marker area encodes the genre's number of games.
fig, ax = plt.subplots(figsize=(12, 6))
scatter = sns.scatterplot(
    data=genre_merged,
    x='top100_count',
    y='totalRevenue',
    hue='genre',
    size='numberOfGames',
    sizes=(40, 800),
    alpha=0.8,
    palette='tab10',
    ax=ax,
)

ax.set_xlabel('Top 100 Game Count by Genre')
ax.set_ylabel('Total Revenue by Genre (All Games)')
ax.set_title('Top 100 Genre Popularity vs Overall Genre Revenue')
ax.grid(True)

# Move the legend outside the axes so it does not cover the markers.
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Refresh the review columns; a zero review total is replaced by 1 so the
# ratio evaluates to 0 rather than dividing by zero.
df_top100['total_reviews'] = df_top100['positive'].add(df_top100['negative'])
df_top100['positive_ratio'] = df_top100['positive'].div(df_top100['total_reviews'].replace(0, 1))

# Mean positive ratio of each publisher's Top 100 games.
review_quality = df_top100.groupby('publisher', as_index=False)['positive_ratio'].mean()

pub_merged = df_publishers.merge(review_quality, left_on='name', right_on='publisher')

# Drop publishers whose total revenue lies above the upper IQR fence.
Q1, Q3 = pub_merged['totalRevenue'].quantile([0.25, 0.75])
IQR = Q3 - Q1
upper_fence = Q3 + 1.5 * IQR
pub_filtered = pub_merged.loc[pub_merged['totalRevenue'] <= upper_fence]

# The 15 remaining publishers with the most games.
top_publishers = pub_filtered.nlargest(15, 'numberOfGames')

fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    data=top_publishers,
    x='totalRevenue',
    y='positive_ratio',
    hue='publisher',
    size='numberOfGames',
    sizes=(100, 1000),
    palette='tab10',
    ax=ax,
)
ax.set_title('Top 15 Publishers: Revenue vs Review Quality')
ax.set_xlabel('Total Revenue (All Games)')
ax.set_ylabel('Average Positive Review Ratio (Top 100)')
ax.grid(True)
ax.legend(title="Publisher", bbox_to_anchor=(1.05, 1), loc='upper left')
fig.tight_layout()
plt.show()

In [None]:
# Scale revenue by the largest value in the whole history so the peak is 1.
peak_revenue = df_history['totalRevenue'].max()
df_history['normalizedRevenue'] = df_history['totalRevenue'].div(peak_revenue)

# Only years from 2000 onwards are drawn.
history_since_2000 = df_history[df_history['label'] >= 2000]

fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(data=history_since_2000, x='label', y='normalizedRevenue', ax=ax)
ax.set_title('Normalized Revenue Growth Over Time')
ax.set_xlabel('Year')
ax.set_ylabel('Normalized Revenue')
ax.grid(True)
plt.show()


In [None]:
# Annotated scatter of publisher revenue against the average positive review
# ratio of that publisher's Top 100 games.
df_top100['total_reviews'] = df_top100['positive'] + df_top100['negative']
# A zero review total is replaced by 1 so the ratio becomes 0 instead of dividing by zero.
df_top100['positive_ratio'] = df_top100['positive'] / df_top100['total_reviews'].replace(0, 1)

# Merge publisher info
review_quality = df_top100.groupby('publisher')['positive_ratio'].mean().reset_index()
pub_merged = pd.merge(df_publishers, review_quality, left_on='name', right_on='publisher')

# IQR-based outlier removal: drop publishers above the upper fence.
Q1 = pub_merged['totalRevenue'].quantile(0.25)
Q3 = pub_merged['totalRevenue'].quantile(0.75)
IQR = Q3 - Q1
pub_filtered = pub_merged[pub_merged['totalRevenue'] <= Q3 + 1.5 * IQR]

# Number of publishers shown. The title is built from this value so the two
# cannot drift apart (the selection previously took 20 rows while the title
# and comment both said 15).
top_n = 15
top = pub_filtered.nlargest(top_n, 'numberOfGames')

fig, ax = plt.subplots(figsize=(12, 6))
scatter = ax.scatter(
    top['totalRevenue'],
    top['positive_ratio'],
    s=top['numberOfGames'] * 10,  # Scale marker size
    alpha=0.7,
    c='skyblue',
    edgecolors='black'
)

ax.set_title(f'Top {top_n} Publishers: Revenue vs Top 100 Review Quality')
ax.set_xlabel('Total Revenue (All Games)')
ax.set_ylabel('Average Positive Review Ratio (Top 100)')
ax.grid(True)

# Label every marker with its publisher name, nudged slightly above the point.
for revenue, ratio, publisher_name in zip(
    top['totalRevenue'], top['positive_ratio'], top['publisher']
):
    ax.text(
        revenue,
        ratio + 0.01,
        publisher_name,
        fontsize=9,
        ha='center'
    )

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Review metrics from top 100 games
df_top100['total_reviews'] = df_top100['positive'] + df_top100['negative']
# A zero review total is replaced by 1 so the ratio becomes 0 instead of dividing by zero.
df_top100['positive_ratio'] = df_top100['positive'] / df_top100['total_reviews'].replace(0, 1)
review_quality = df_top100.groupby('publisher')['positive_ratio'].mean().reset_index()

# Merge with publisher dataset (contains class info)
pub_merged = pd.merge(df_publishers, review_quality, left_on='name', right_on='publisher')

# Aggregate by class: summed revenue and game count, mean review ratio.
class_grouped = pub_merged.groupby('class').agg({
    'totalRevenue': 'sum',
    'positive_ratio': 'mean',
    'numberOfGames': 'sum'
}).reset_index()

# One colour per class. The palette is cycled so the colour list always has
# exactly one entry per row; a plain slice of a 4-entry list would be too
# short whenever more than four classes are present, and matplotlib rejects a
# colour sequence whose length differs from the number of points.
palette = ['#2ca02c', '#1f77b4', '#ff7f0e', '#d62728']
point_colors = [palette[i % len(palette)] for i in range(len(class_grouped))]

# Plot
fig, ax = plt.subplots(figsize=(15, 10))
scatter = ax.scatter(
    class_grouped['totalRevenue'],
    class_grouped['positive_ratio'],
    s=class_grouped['numberOfGames'] * 5,
    c=point_colors,
    edgecolors='black',
    alpha=0.8
)

# Label each point with class name
for revenue, ratio, class_name in zip(
    class_grouped['totalRevenue'], class_grouped['positive_ratio'], class_grouped['class']
):
    ax.text(
        revenue,
        ratio,
        class_name,
        ha='center',
        fontsize=25
    )

ax.set_title('Publisher Class: Revenue vs Review Quality')
ax.set_xlabel('Total Revenue (All Games)')
ax.set_ylabel('Average Positive Review Ratio (Top 100)')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Heatmap comparing publisher classes on total revenue, average revenue and
# the mean positive review ratio of their Top 100 games.
df_top100['total_reviews'] = df_top100['positive'] + df_top100['negative']
# A zero review total is replaced by 1 so the ratio becomes 0 instead of dividing by zero.
df_top100['positive_ratio'] = df_top100['positive'] / df_top100['total_reviews'].replace(0, 1)
review_quality = df_top100.groupby('publisher')['positive_ratio'].mean().reset_index()

pub_merged = pd.merge(df_publishers, review_quality, left_on='name', right_on='publisher')

class_grouped = pub_merged.groupby('class').agg({
    'totalRevenue': 'sum',
    'averageRevenue': 'mean',
    'positive_ratio': 'mean'
}).reset_index()

heatmap_data = class_grouped.set_index('class')
heatmap_data = heatmap_data.round(2)

# The three metrics live on very different scales (revenue totals next to a
# 0-1 ratio). With one shared colormap the largest column dominates and the
# others show no contrast, so colours are taken from a per-column min-max
# scaling while the annotations keep the raw rounded values.
column_span = (heatmap_data.max() - heatmap_data.min()).replace(0, 1)  # constant column -> avoid 0/0
heatmap_scaled = (heatmap_data - heatmap_data.min()) / column_span

# Step 5: Plot heatmap
plt.figure(figsize=(8, 5))
sns.heatmap(
    heatmap_scaled,
    annot=heatmap_data,
    cmap='YlGnBu',
    fmt='.2f',
    cbar_kws={'label': 'Column-wise min-max scaled value'}
)
plt.title('Publisher Class Performance Heatmap')
plt.ylabel('Publisher Class')
plt.xlabel('Metric')
plt.tight_layout()
plt.show()

In [None]:
# Heatmap comparing genres on average revenue, average price and the mean
# positive review ratio of their Top 100 games.
df_top100['total_reviews'] = df_top100['positive'] + df_top100['negative']
# A zero review total is replaced by 1 so the ratio becomes 0 instead of dividing by zero.
df_top100['positive_ratio'] = df_top100['positive'] / df_top100['total_reviews'].replace(0, 1)

genre_reviews = df_top100.groupby('genre')['positive_ratio'].mean().reset_index()

# Inner join: only genres present in both tables appear in the heatmap.
genre_merged = pd.merge(df_genres, genre_reviews, left_on='label', right_on='genre')

heatmap_genre = genre_merged[['label', 'averageRevenue', 'averagePrice', 'positive_ratio']]
heatmap_genre = heatmap_genre.set_index('label')
heatmap_genre = heatmap_genre.round(2)

# Revenue, price and a 0-1 ratio share no common scale. With one colormap the
# largest column dominates and the rest look uniform, so colours are taken
# from a per-column min-max scaling while the annotations keep the raw values.
column_span = (heatmap_genre.max() - heatmap_genre.min()).replace(0, 1)  # constant column -> avoid 0/0
heatmap_genre_scaled = (heatmap_genre - heatmap_genre.min()) / column_span

plt.figure(figsize=(12, 8))
sns.heatmap(
    heatmap_genre_scaled,
    annot=heatmap_genre,
    cmap='YlGnBu',
    fmt='.2f',
    cbar_kws={'label': 'Column-wise min-max scaled value'}
)
plt.title('Genre Performance Heatmap')
plt.ylabel('Genre')
plt.xlabel('Metric')
plt.tight_layout()
plt.show()

## Predicting review success with a random forest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report

# Modelling frame: a copy, so the steps below leave df_top100 untouched.
df = df_top100.copy()

df['total_reviews'] = df['positive'].add(df['negative'])
df['positive_ratio'] = df['positive'].div(df['total_reviews'].replace(0, 1))

# Attach each publisher's class; publishers missing from the table get NaN.
df_pub = df_publishers[['name', 'class']].rename(
    columns={'name': 'publisher', 'class': 'publisher_class'}
)
df = df.merge(df_pub, on='publisher', how='left')

# Attach genre-level aggregates (these are not part of the feature list below).
genre_stats = df_genres[['label', 'averageRevenue', 'averagePlayTime', 'averagePrice']]
df = df.merge(genre_stats, left_on='genre', right_on='label', how='left')

# Binary target: 1 when at least 80% of reviews are positive.
# This overwrites the 'label' column that the genre merge just brought in.
df['label'] = df['positive_ratio'].ge(0.8).astype(int)

numeric = ['price', 'total_reviews']
categorical = ['genre', 'publisher_class']
features = numeric + categorical

X = df[features]
y = df['label']

# Scale the numeric columns and one-hot encode the categorical ones;
# categories unseen during fitting are ignored at prediction time.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numeric),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
])

pipeline = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(random_state=42)),
])

# 80/20 split that preserves the class balance of the target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion matrix of the held-out predictions.
# The class order is pinned with labels=[0, 1] so the matrix is always 2x2 and
# lines up with the two display labels. Without it, a test split in which
# only one class occurs produces a 1x1 matrix that no longer matches them.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Not Successful', 'Successful'])

fig, ax = plt.subplots(figsize=(6, 5))
disp.plot(ax=ax, cmap='Blues', values_format='d')
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')

# NOTE(review): the second caption sentence is a fixed interpretation, not
# derived from `cm`; re-check it whenever the data or the model changes.
caption = "This confusion matrix shows the number of correct and incorrect predictions.\nThe model is slightly better at identifying successful games than unsuccessful ones."
# y = -0.1 places the caption below the bottom edge of the figure.
fig.text(0.5, -0.1, caption, wrap=True, horizontalalignment='center', fontsize=10)

fig.tight_layout()
plt.show()


In [None]:
import numpy as np

# Pull the encoded feature names and the forest's importance scores out of
# the fitted pipeline.
fitted_steps = pipeline.named_steps
feature_names = fitted_steps['pre'].get_feature_names_out()
importances = fitted_steps['clf'].feature_importances_

# Positions of the ten largest importances, in ascending order, so the most
# important feature ends up at the top of the horizontal bar chart.
indices = np.argsort(importances)[-10:]
bar_positions = range(len(indices))

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(bar_positions, importances[indices], align='center', color='skyblue', edgecolor='black')
plt.yticks(bar_positions, list(feature_names[indices]))
ax.set_xlabel('Feature Importance Score')
ax.set_title('Top 10 Most Important Features')

caption = "This bar chart shows which features the model found most predictive.\nTotal reviews and price-related features play a key role in determining success."
# y = -0.1 places the caption below the bottom edge of the figure.
fig.text(0.5, -0.1, caption, wrap=True, horizontalalignment='center', fontsize=10)

fig.tight_layout()
plt.show()



In [None]:
from sklearn.metrics import precision_recall_curve, auc

# Predicted probability of class 1 for every held-out sample.
test_probabilities = pipeline.predict_proba(X_test)
y_probs = test_probabilities[:, 1]

# Precision/recall pairs over all thresholds and the area under that curve.
precision, recall, _ = precision_recall_curve(y_test, y_probs)
pr_auc = auc(recall, precision)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(recall, precision, marker='.', color='darkorange')
ax.set_title(f'Precision-Recall Curve (AUC = {pr_auc:.2f})')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.grid(True)

caption = "This curve shows the trade-off between precision and recall.\nThe AUC value reflects how well the model distinguishes successful games."
# y = -0.1 places the caption below the bottom edge of the figure.
fig.text(0.5, -0.1, caption, wrap=True, horizontalalignment='center', fontsize=10)

fig.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# ROC points and area under the curve, from the class-1 probabilities
# (`y_probs`) computed in the precision-recall cell.
roc_auc = roc_auc_score(y_test, y_probs)
fpr, tpr, _ = roc_curve(y_test, y_probs)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}', color='teal')
# Diagonal reference line from (0, 0) to (1, 1).
ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc='lower right')
ax.grid(True)

caption = "This ROC curve illustrates the model's ability to distinguish between the two classes.\nHigher AUC indicates better overall classification performance."
# y = -0.1 places the caption below the bottom edge of the figure.
fig.text(0.5, -0.1, caption, wrap=True, horizontalalignment='center', fontsize=10)

fig.tight_layout()
plt.show()