In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.loader import (
    load_interactions,
    load_games,
    load_extended_games,
    load_item_reviews,
    load_user_reviews,
    load_bundles
)

In [2]:
# Load Data
# Every table comes from a loader in src.data.loader; the calls run in the
# same left-to-right order as before, only the assignments are paired up.

# Interaction tables: one call with train=True, one with train=False.
train, test_in = load_interactions(train=True), load_interactions(train=False)

# Game catalogue and its extended variant.
games, ext_games = load_games(), load_extended_games()

# Review tables (per item / per user) followed by the bundle table.
item_reviews, user_reviews = load_item_reviews(), load_user_reviews()
bundles = load_bundles()

In [8]:
# Quick Overview of DataFrames
# Registry of every loaded table keyed by a human-readable label; the
# insertion order below is the order in which the tables are reported.
dfs = dict(
    train_interactions=train,
    test_interactions_in=test_in,
    games=games,
    extended_games=ext_games,
    item_reviews=item_reviews,
    user_reviews=user_reviews,
    bundles=bundles,
)

# For each table emit: banner, shape, column index, first three rows, and a
# blank separator line. A single print with sep="\n" writes exactly the same
# text as one print per item followed by an empty print().
for name, df in dfs.items():
    print(f"=== {name} ===", df.shape, df.columns, df.head(3), "", sep="\n")

=== train_interactions ===
(2293985, 4)
Index(['user_id', 'item_id', 'item_name', 'playtime'], dtype='object')
   user_id  item_id              item_name  playtime
0        0        0         Counter-Strike         6
1        0     2555          Day of Defeat         7
2        0     2556  Day of Defeat: Source      4733

=== test_interactions_in ===
(448211, 4)
Index(['user_id', 'item_id', 'item_name', 'playtime'], dtype='object')
   user_id  item_id                     item_name  playtime
0        4      760                 Ace of Spades        83
1        4     8417  Geometry Wars: Retro Evolved        18
2        4     5238      Half-Life 2: Episode One       491

=== games ===
(8523, 11)
Index(['item_id', 'item_name', 'publisher', 'genres', 'url', 'tags',
       'sentiment', 'metascore', 'specs', 'price', 'release_date'],
      dtype='object')
   item_id         item_name             publisher  \
0        0    Counter-Strike                 Valve   
1        1  Rag Doll Kung Fu   

In [9]:
# Check for Missing Values
# Walk the table registry and show, per table, how many NaN entries each
# column holds (rendered through IPython's display), followed by a blank line.
for name, df in dfs.items():
    header = f"{name} missing values:"
    print(header)
    null_counts = df.isna().sum()  # per-column count of missing entries
    display(null_counts)
    print()

train_interactions missing values:


user_id      0
item_id      0
item_name    0
playtime     0
dtype: int64


test_interactions_in missing values:


user_id      0
item_id      0
item_name    0
playtime     0
dtype: int64


games missing values:


item_id            0
item_name          0
publisher        514
genres           419
url                0
tags               4
sentiment         31
metascore       6406
specs            137
price            150
release_date     335
dtype: int64


extended_games missing values:


item_id                        0
item_name                      0
release_date                   0
required_age                   0
price                          0
                            ... 
tags_Mahjong                6753
tags_Birds                  6753
tags_Football (American)    6751
tags_Fox                    6753
tags_Extraction Shooter     6753
Length: 490, dtype: int64


item_reviews missing values:


username_hash        220
item_id                0
hours              16033
num_items          12096
page_order             0
date                   0
review             16317
early_access           0
page                   0
found_funny      5815565
compensation     6735005
dtype: int64


user_reviews missing values:


user_id            0
item_id            0
funny          36469
posted             0
last_edited    38251
helpful            0
recommend          0
review            18
dtype: int64


bundles missing values:


bundle_id                0
item_id               1926
bundle_final_price       0
bundle_url               0
bundle_price             0
bundle_name              0
bundle_discount          0
genre                  346
discounted_price         0
item_url                 0
item_name                0
dtype: int64


