## Video Games Sales

`pip install pandas`

In [109]:
from datetime import datetime
import filecmp
import os
import pandas as pd

resources_path = "resources/"

vg_sales_path = resources_path + "vg_sales/"
vg_sales_res = {
    "v0": "vg_sales_v0.csv",
    "v1": "vg_sales_v1.csv",
    "v2": "vg_sales_v2.csv",
    "extended_release_year": "vg_sales_extended_release_year.csv",
    "extended_release_year_2": "vg_sales_extended_release_year_2.csv",
    "filled_year": "vg_sales_filled_year.csv",
    "filled_year_2": "vg_sales_filled_year_2.csv",
    "batch": "vg_sales_batches/"
}

vg_developers_path = resources_path + "vg_developers/"
vg_developers_res = {
    "v0": "vg_developers.csv",
}

results_path = "results/"

In [80]:
def save_to_csv(df, folder_path, filename):
    # Generate the full path for the new file
    new_file_path = os.path.join(folder_path, filename)
    
    # Check if the folder exists
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    
    # Save the dataframe to a temporary file
    temp_file_path = os.path.join(folder_path, 'temp_' + filename)
    df.to_csv(temp_file_path, index=True)
    
    # Compare the temporary file with existing files in the folder
    for existing_file in os.listdir(folder_path):
        existing_file_path = os.path.join(folder_path, existing_file)
        if existing_file_path != temp_file_path and filecmp.cmp(temp_file_path, existing_file_path, shallow=False):
            print(f"The content of {filename} already exists as {existing_file}. Not saving the file.")
            os.remove(temp_file_path)
            return
    
    # If no matching file is found, rename the temporary file to the new file
    os.rename(temp_file_path, new_file_path)
    print(f"File saved as {new_file_path}")

In [81]:
# Load the datasets with index

# vg_sales
vg_sales = pd.read_csv(vg_sales_path + vg_sales_res["v0"], index_col=0)

# vg_developers
vg_developers = pd.read_csv(vg_developers_path + vg_developers_res["v0"], index_col=0)
vg_developers = vg_developers.iloc[:, [0, 2, 3]]
vg_developers.rename(columns={'Est.': 'Year_of_Establishment'}, inplace=True)

In [82]:
# columns
print("vg_sales.columns :", vg_sales.columns)
print("vg_developers.columns :", vg_developers.columns)

vg_sales.columns : Index(['Platform', 'Year_of_Release', 'Genre', 'Publisher', 'NA_Sales',
       'EU_Sales', 'JP_Sales', 'Other_Sales', 'Global_Sales', 'Critic_Score',
       'Critic_Count', 'User_Score', 'User_Count', 'Developer', 'Rating'],
      dtype='object')
vg_developers.columns : Index(['City', 'Country', 'Year_of_Establishment'], dtype='object')


In [83]:
# NaN count
nan_count = vg_sales.isna().sum()
print(nan_count)

Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         6704
User_Count         9129
Developer          6623
Rating             6769
dtype: int64


### Fix Year_Of_Release

In [84]:
# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_no_year'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Isolate rows where Year_of_Release is NaN
vg_sales_no_year = vg_sales[vg_sales['Year_of_Release'].isna()]

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_no_year_{timestamp}.csv'

# Save to a separate CSV file
save_to_csv(vg_sales_no_year, folder_path, filename)

The content of vg_sales_no_year_20241210_202017.csv already exists as vg_sales_no_year_20241210_162859.csv. Not saving the file.


In [85]:
# Load the extended release year data
vg_sales_extended = pd.read_csv(vg_sales_path + vg_sales_res["extended_release_year"], index_col=0)

# Merge the extended release year data with the original vg_sales dataframe
vg_sales = vg_sales.merge(vg_sales_extended[['Year_of_Release']], left_index=True, right_index=True, how='left', suffixes=('', '_extended'))

# Update the Year_of_Release column with the extended data where available
vg_sales['Year_of_Release'] = vg_sales['Year_of_Release_extended'].combine_first(vg_sales['Year_of_Release'])

# Drop the extended column as it's no longer needed
vg_sales.drop(columns=['Year_of_Release_extended'], inplace=True)

# Display the updated dataframe
print(vg_sales.head())

# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_filled_year'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_filled_year_{timestamp}.csv'

# Save the updated dataframe to a separate CSV file
save_to_csv(vg_sales, folder_path, filename)

                         Platform  Year_of_Release         Genre Publisher  \
Name                                                                         
Wii Sports                    Wii           2006.0        Sports  Nintendo   
Super Mario Bros.             NES           1985.0      Platform  Nintendo   
Mario Kart Wii                Wii           2008.0        Racing  Nintendo   
Wii Sports Resort             Wii           2009.0        Sports  Nintendo   
Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

                          NA_Sales  EU_Sales  JP_Sales  Other_Sales  \
Name                                                                  
Wii Sports                   41.36     28.96      3.77         8.45   
Super Mario Bros.            29.08      3.58      6.81         0.77   
Mario Kart Wii               15.68     12.76      3.79         3.29   
Wii Sports Resort            15.61     10.93      3.28         2.95   
Pokemon Red/Pokemon Blue   

In [86]:
# vg_sales_filled_year
vg_sales_filled_year = pd.read_csv(vg_sales_path + vg_sales_res["filled_year"], index_col=0)

In [87]:
# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_filled_year_no_year'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Isolate rows where Year_of_Release is NaN
vg_sales_filled_year_no_year = vg_sales_filled_year[vg_sales_filled_year['Year_of_Release'].isna()]

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_filled_year_no_year_{timestamp}.csv'

# Save to a separate CSV file
save_to_csv(vg_sales_filled_year_no_year, folder_path, filename)

The content of vg_sales_filled_year_no_year_20241210_202023.csv already exists as vg_sales_filled_year_no_year_20241210_162927.csv. Not saving the file.


In [88]:
# Load the extended release year data
vg_sales_extended_2 = pd.read_csv(vg_sales_path + vg_sales_res["extended_release_year_2"], index_col=0)

# Merge the extended release year data with the original vg_sales dataframe
vg_sales = vg_sales.merge(vg_sales_extended_2[['Year_of_Release']], left_index=True, right_index=True, how='left', suffixes=('', '_extended'))

# Update the Year_of_Release column with the extended data where available
vg_sales['Year_of_Release'] = vg_sales['Year_of_Release_extended'].combine_first(vg_sales['Year_of_Release'])

# Drop the extended column as it's no longer needed
vg_sales.drop(columns=['Year_of_Release_extended'], inplace=True)

# Display the updated dataframe
print(vg_sales.head())

# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_filled_year_2'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_filled_year_2_{timestamp}.csv'

# Save the updated dataframe to a separate CSV file
save_to_csv(vg_sales, folder_path, filename)

                         Platform  Year_of_Release         Genre Publisher  \
Name                                                                         
Wii Sports                    Wii           2006.0        Sports  Nintendo   
Super Mario Bros.             NES           1985.0      Platform  Nintendo   
Mario Kart Wii                Wii           2008.0        Racing  Nintendo   
Wii Sports Resort             Wii           2009.0        Sports  Nintendo   
Pokemon Red/Pokemon Blue       GB           1996.0  Role-Playing  Nintendo   

                          NA_Sales  EU_Sales  JP_Sales  Other_Sales  \
Name                                                                  
Wii Sports                   41.36     28.96      3.77         8.45   
Super Mario Bros.            29.08      3.58      6.81         0.77   
Mario Kart Wii               15.68     12.76      3.79         3.29   
Wii Sports Resort            15.61     10.93      3.28         2.95   
Pokemon Red/Pokemon Blue   

In [89]:
# vg_sales_filled_year_2
vg_sales_filled_year_2 = pd.read_csv(vg_sales_path + vg_sales_res["filled_year_2"], index_col=0)

In [90]:
# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_filled_year_2_no_year'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Isolate rows where Year_of_Release is NaN
vg_sales_filled_year_2_no_year = vg_sales_filled_year_2[vg_sales_filled_year_2['Year_of_Release'].isna()]

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_filled_year_2_no_year_{timestamp}.csv'

# Save to a separate CSV file
save_to_csv(vg_sales_filled_year_2_no_year, folder_path, filename)

The content of vg_sales_filled_year_2_no_year_20241210_202027.csv already exists as vg_sales_filled_year_2_no_year_20241210_165438.csv. Not saving the file.


### Cancelled games in the dataset
Brothers in Arms: Furious 4,X360,,Shooter,,0.01,0.0,0.0,0.0,0.01,,,,,Gearbox Software,M
was never released and put into sales

In [91]:
# Remove the specified entry
vg_sales_cleaned_release_date = vg_sales_filled_year_2.drop(index='Brothers in Arms: Furious 4') # type: ignore

# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_cleaned_release_date'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_cleaned_release_date_{timestamp}.csv'

# Save the cleaned dataframe to a separate CSV file
save_to_csv(vg_sales_cleaned_release_date, folder_path, filename)

The content of vg_sales_cleaned_release_date_20241210_202029.csv already exists as vg_sales_cleaned_release_date_20241210_170408.csv. Not saving the file.


In [102]:
# vg_sales_v1
vg_sales_v1 = pd.read_csv(vg_sales_path + vg_sales_res["v1"], index_col=0)

In [93]:
# Isolate rows where Year_of_Release is NaN
vg_sales_v1_no_release_date = vg_sales_v1[vg_sales_v1['Year_of_Release'].isna()]
print("Number of video games that have no release date :", len(vg_sales_v1_no_release_date))

Number of video games that have no release date : 0


### Fix Developer & Rating

In [94]:

import pandas as pd
import requests
import time
from tqdm.notebook import tqdm

# Mapping ratings to letters
rating_map = {
    1: "Three",
    2: "Seven",
    3: "Twelve",
    4: "Sixteen",
    5: "Eighteen",
    6: "RP",
    7: "EC",
    8: "E",
    9: "E10",
    10: "T",
    11: "M",
    12: "AO",
    13: "CERO_A",
    14: "CERO_B",
    15: "CERO_C",
    16: "CERO_D",
    17: "CERO_Z",
    18: "USK_0",
    19: "USK_6",
    20: "USK_12",
    21: "USK_16",
    22: "USK_18",
    23: "GRAC_ALL",
    24: "GRAC_Twelve",
    25: "GRAC_Fifteen",
    26: "GRAC_Eighteen",
    27: "GRAC_TESTING",
    28: "CLASS_IND_L",
    29: "CLASS_IND_Ten",
    30: "CLASS_IND_Twelve",
    31: "CLASS_IND_Fourteen",
    32: "CLASS_IND_Sixteen",
    33: "CLASS_IND_Eighteen",
    34: "ACB_G",
    35: "ACB_PG",
    36: "ACB_M",
    37: "ACB_MA15",
    38: "ACB_R18",
    39: "ACB_RC"
}

unique_platforms = vg_sales_v1['Platform'].unique()
print(unique_platforms)

['Wii' 'NES' 'GB' 'DS' 'X360' 'PS3' 'PS2' 'SNES' 'GBA' 'PS4' '3DS' 'N64'
 'PS' 'XB' 'PC' '2600' 'PSP' 'XOne' 'WiiU' 'GC' 'GEN' 'DC' 'PSV' 'SAT'
 'SCD' 'WS' 'NG' 'TG16' '3DO' 'GG' 'PCFX']


In [95]:
# Function to get the access token from IGDB
def get_access_token(client_id, client_secret):
    url = "https://id.twitch.tv/oauth2/token"
    params = {
        "client_id": client_id,
        "client_secret": client_secret,
        "grant_type": "client_credentials"
    }
    response = requests.post(url, params=params)
    return response.json()['access_token']

# Your actual IGDB client ID and client secret
client_id = 'ms8zqnreo7k8v5v4fp198eivs5039o'
client_secret = 'ia642tpku45i8qwcmp1pvo4ybi838d'

# Get the access token
access_token = get_access_token(client_id, client_secret)

In [96]:
# Function to get game details from IGDB
def get_game_details(game_name, release_year, platform, client_id, access_token):
    url = "https://api.igdb.com/v4/games"
    headers = {
        "Client-ID": client_id,
        "Authorization": f"Bearer {access_token}"
    }
    platform_ids = {
        "PC": 6,
        "Wii": 5,
        "NES": 18,
        "X360": 12,
        "PS3": 9,
        "PS2": 8,
        "SNES": 19,
        "PS4": 48,
        "N64": 4,
        "PS": 7,
        "XB": 11,
        "2600": 59,
        "XOne": 49,
        "WiiU": 41,
        "GC": 21,
        "GEN": 29,
        "DC": 23,
        "SAT": 32,
        "SCD": 22,
        "3DO": 50,
        "GB": 33,
        "DS": 20,
        "GBA": 24,
        "3DS": 37,
        "PSP": 38,
        "PSV": 46,
        "WS": 35,
        "NG": 119,
        "TG16": 86,
        "GG": 87,
        "PCFX": 88
    }
    platform_id = platform_ids.get(platform, "unknown")
    if platform_id != "unknown":
        data = f'search "{game_name}"; where release_dates.y = {int(release_year)} & release_dates.platform = {platform_id}; fields name, involved_companies.company.name, age_ratings.rating;'
    else:
        data = f'search "{game_name}"; where release_dates.y = {int(release_year)}; fields name, involved_companies.company.name, age_ratings.rating;'
    response = requests.post(url, headers=headers, data=data)
    return response.json()

In [97]:

# Update the DataFrame with a progress bar and save added details in a separate DataFrame
def update_game_info_with_progress(df, client_id, access_token, video_games_sales=None):
    request_count = 0
    added_details = []

    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Updating game info", leave=True):
        game_name = row.name
        updated_row = row.copy()

        if pd.isna(row.get('Developer', None)) or pd.isna(row.get('Rating', None)):
            # First try to update from video_games_sales, if available
            if video_games_sales is not None:
                same_year_games = video_games_sales[
                    (video_games_sales['Year_of_Release'] == row['Year_of_Release'])
                ]
                if not same_year_games.empty:
                    if pd.isna(row['Developer']):
                        developer = same_year_games[same_year_games.index == game_name]['Developer'].values
                        if developer.size > 0:
                            updated_row['Developer'] = developer[0]
                    
                    if pd.isna(row['Rating']):
                        rating = same_year_games[same_year_games.index == game_name]['Rating'].values
                        if rating.size > 0:
                            updated_row['Rating'] = rating[0]

            # If still missing data, fetch from IGDB
            if pd.isna(updated_row['Developer']) or pd.isna(updated_row['Rating']):
                try:
                    game_details = get_game_details(game_name, row['Year_of_Release'], row['Platform'], client_id, access_token)
                except Exception as e:
                    print(f"Error fetching details for {game_name}: {e}")
                    continue
                
                if game_details and isinstance(game_details, list) and len(game_details) > 0:
                    game_info = game_details[0]
                    if 'involved_companies' in game_info and game_info['involved_companies']:
                        updated_row['Developer'] = game_info['involved_companies'][0]['company']['name']
                        print(f"Updated {game_name}: Developer={updated_row['Developer']}")
                    if 'age_ratings' in game_info and game_info['age_ratings']:
                        numeric_rating = game_info['age_ratings'][0]['rating']
                        updated_row['Rating'] = rating_map.get(numeric_rating, "Unknown")
                        print(f"Updated {game_name}: Rating={updated_row['Rating']}")
            # Rate limiting
            request_count += 1
            if request_count >= 4:
                time.sleep(1)
                request_count = 0

        added_details.append(updated_row)

    added_details_df = pd.DataFrame(added_details)
    return added_details_df


In [70]:
# Update the DataFrame with progress bar
no_dev_rating_assigned = vg_sales_v1[(vg_sales_v1['Developer'].isnull()) | (vg_sales_v1['Rating'].isnull())]

batch_size = 1000
num_batches = (len(no_dev_rating_assigned) // batch_size) + 1
all_added_details = []

# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_batches'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

for i in range(num_batches):
    batch_df = no_dev_rating_assigned.iloc[i * batch_size:(i + 1) * batch_size]
    print(f"Processing batch {i + 1}/{num_batches} with {len(batch_df)} records")
    batch_added_details_df = update_game_info_with_progress(batch_df, client_id, access_token, vg_sales)
    all_added_details.append(batch_added_details_df)
    
    # Generate a unique filename for each batch based on the current date and time
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    batch_filename = f'vg_sales_batch_{i + 1}_{timestamp}.csv'
    
    # Save each batch to a separate CSV file
    save_to_csv(batch_added_details_df, folder_path, batch_filename)

# Merge all batches into a single DataFrame
final_added_details_df = pd.concat(all_added_details, ignore_index=True)

# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_updated'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_updated_{timestamp}.csv'

# Save the final DataFrame to a CSV file
save_to_csv(final_added_details_df, folder_path, filename)

# Display the final DataFrame
print(final_added_details_df.head())


Processing batch 1/7 with 1000 records


Updating game info:   0%|          | 0/1000 [00:00<?, ?it/s]

Updated Super Mario Bros.: Developer=Playtronic
Updated Super Mario Bros.: Rating=E
Updated Tetris: Developer=Nintendo
Updated Tetris: Rating=E
Updated Nintendogs: Developer=Nintendo
Updated Nintendogs: Rating=E
Updated Super Mario Land: Developer=Nintendo R&D1
Updated Super Mario Land: Rating=E
Updated Pokémon Yellow: Special Pikachu Edition: Developer=Nintendo
Updated Pokémon Yellow: Special Pikachu Edition: Rating=E
Updated Call of Duty: Black Ops 3: Developer=Activision
Updated Call of Duty: Black Ops 3: Rating=Eighteen
Updated Super Mario 64: Developer=Nintendo
Updated Super Mario 64: Rating=E
Updated Super Mario Land 2: 6 Golden Coins: Developer=Nintendo R&D1
Updated Super Mario Land 2: 6 Golden Coins: Rating=E
Updated Super Mario All-Stars: Developer=Nintendo EAD
Updated Super Mario All-Stars: Rating=E
Updated Super Mario 64: Developer=Nintendo EAD
Updated Super Mario 64: Rating=E
Updated Mario Kart 64: Developer=Nintendo
Updated Mario Kart 64: Rating=E
Updated Donkey Kong Count

Updating game info:   0%|          | 0/1000 [00:00<?, ?it/s]

Updated Mickey's Speedway USA: Developer=Nintendo
Updated Mickey's Speedway USA: Rating=E
Updated BeatMania Append 3rdMix: Developer=Konami Computer Entertainment Japan
Updated Disney TH!NK Fast: The Ultimate Trivia Showdown: Developer=Disney Interactive Studios
Updated Disney TH!NK Fast: The Ultimate Trivia Showdown: Rating=E
Updated  Frozen: Olaf's Quest: Developer=Avanquest Software
Updated  Frozen: Olaf's Quest: Rating=E
Updated New Play Control! Pikmin 2: Developer=Nintendo
Updated New Play Control! Pikmin 2: Rating=Three
Updated New Play Control! Donkey Kong Jungle Beat: Developer=Nintendo
Updated New Play Control! Donkey Kong Jungle Beat: Rating=E10
Updated Avatar: The Game: Developer=Gameloft
Updated Avatar: The Game: Rating=Twelve
Updated PGR3 - Project Gotham Racing 3: Developer=Microsoft Game Studios
Updated PGR3 - Project Gotham Racing 3: Rating=E10
Updated Beijing 2008: Developer=Sega
Updated Beijing 2008: Rating=E
Updated Pokemon Trozei!: Developer=Nintendo
Updated Pokemo

Updating game info:   0%|          | 0/1000 [00:00<?, ?it/s]

Updated NBA Jam 99: Developer=Iguana UK
Updated NBA Jam 99: Rating=E
Updated Shadow Man: Developer=Acclaim Studios Teesside
Updated Shadow Man: Rating=M
Updated Command & Conquer: Developer=Virgin Interactive Entertainment (Europe) Ltd.
Updated Command & Conquer: Rating=M
Updated Rhythm Thief & the Emperor's Treasure: Developer=Xeen
Updated Rhythm Thief & the Emperor's Treasure: Rating=E10
Updated One Piece Unlimited Cruise 1: The Treasure Beneath the Waves: Developer=Ganbarion
Updated One Piece Unlimited Cruise 1: The Treasure Beneath the Waves: Rating=Twelve
Updated Monopoly: Developer=Electronic Arts
Updated Monopoly: Rating=E
Updated Ratatouille: Food Frenzy: Developer=THQ
Updated Ratatouille: Food Frenzy: Rating=E
Updated Magical Vacation: Developer=Nintendo
Updated Magical Vacation: Rating=CERO_A
Updated Punch-Out!!: Developer=Nintendo
Updated Punch-Out!!: Rating=E
Updated Despicable Me: The Game: Developer=Vicious Cycle Software, Inc.
Updated Despicable Me: The Game: Rating=E10


Updating game info:   0%|          | 0/1000 [00:00<?, ?it/s]

Updated The Walking Dead: Season One: Developer=Telltale Games
Updated The Walking Dead: Season One: Rating=M
Updated Zapper: One Wicked Cricket!: Developer=Infogrames
Updated Zapper: One Wicked Cricket!: Rating=E
Updated Scooby-Doo! Night of 100 Frights: Developer=THQ
Updated Scooby-Doo! Night of 100 Frights: Rating=E
Updated T.R.A.G. - Tactical Rescue Assault Group: Mission of Mercy: Developer=SUNSOFT
Updated T.R.A.G. - Tactical Rescue Assault Group: Mission of Mercy: Rating=T
Updated Naruto RPG 2: Chidori vs Rasengan: Developer=Tomy
Updated Akiba's Trip: Developer=ACQUIRE Corp.
Updated Akiba's Trip: Rating=CERO_C
Updated Bravo Air Race: Developer=Xing Entertainment
Updated Bravo Air Race: Rating=E
Updated Gundam: The Battle Master: Developer=Bandai
Updated ¡Shin Chan Flipa en colores!: Developer=Banpresto Co., Ltd.
Updated ¡Shin Chan Flipa en colores!: Rating=Seven
Updated The Granstream Saga: Developer=Arc Entertainment
Updated The Granstream Saga: Rating=E
Updated Bio FREAKS: Deve

Updating game info:   0%|          | 0/1000 [00:00<?, ?it/s]

Updated Nanashi no Game: Developer=Square Enix
Updated Diego's Build & Rescue: Developer=Black Lantern Studios
Updated Diego's Build & Rescue: Rating=E
Updated Auto Destruct: Rating=T
Updated Deep Fear: Developer=Sega CS2 R&D
Updated Virus: Developer=Hudson Soft
Updated Dragon Shadow Spell: Developer=Flight-Plan
Updated Dragon Slayer: The Legend of Heroes: Developer=SPS
Updated The Idolm@ster: Developer=Namco Bandai Games
Updated Adventure Time: Finn & Jake Investigations: Developer=Vicious Cycle Software
Updated Adventure Time: Finn & Jake Investigations: Rating=Three
Updated Super Robot Taisen Z Special Disc: Developer=Banpresto
Updated Slayers Royal 2: Developer=Kadokawa Shoten
Updated Scooby-Doo! Who's Watching Who?: Developer=THQ
Updated Scooby-Doo! Who's Watching Who?: Rating=E
Updated Serious Sam 3: BFE: Developer=Croteam
Updated Serious Sam 3: BFE: Rating=Eighteen
Updated Pro Yakyuu Famista DS 2010: Developer=Namco
Updated Pro Yakyuu Famista DS 2010: Rating=CERO_A
Updated Capta

Updating game info:   0%|          | 0/1000 [00:00<?, ?it/s]

Updated Novastorm: Developer=Psygnosis
Updated Novastorm: Rating=E
Updated Dead Ball Zone: Rating=T
Updated Winning Post World 2010: Rating=CERO_A
Updated Ou to Maou to 7-nin no Himegimitachi: Shin Ousama Monogatari: Developer=Marvelous AQL
Updated Ou to Maou to 7-nin no Himegimitachi: Shin Ousama Monogatari: Rating=Twelve
Updated S.Y.K Renshouden Portable: Developer=Otomate
Updated S.Y.K Renshouden Portable: Rating=CERO_B
Updated Hanaoni: Yume no Tsudzuki: Developer=Otomate
Updated Hanaoni: Yume no Tsudzuki: Rating=CERO_C
Updated Gal*Gun: Developer=Inti Creates
Updated Gal*Gun: Rating=GRAC_Eighteen
Updated Black & White: Developer=Feral Interactive
Updated Black & White: Rating=T
Updated Ookami Kakushi: Developer=Konami
Updated Ookami Kakushi: Rating=CERO_C
Updated  Haikyu!! Cross Team Match!: Developer=Bandai Namco Games
Updated  Haikyu!! Cross Team Match!: Rating=CERO_A
Updated Gormiti: The Lords of Nature!: Rating=Twelve
Updated North American Hunting Extravaganza 2: Rating=T
Updat

Updating game info:   0%|          | 0/783 [00:00<?, ?it/s]

Updated Agatha Christie: The ABC Murders: Developer=Gravity Europe SAS
Updated Agatha Christie: The ABC Murders: Rating=T
Updated Elminage Gothic: Ulm Zakir to Yami no Gishiki: Developer=Starfish SD
Updated Crayon Shin-Chan: Uchuu de Achoo!? Yuujou no Oba-Karate!!: Developer=Inti Creates
Updated Crayon Shin-Chan: Uchuu de Achoo!? Yuujou no Oba-Karate!!: Rating=CERO_A
Updated Mushishi: Amefuru Sato: Developer=Marvelous Entertainment
Updated Mushishi: Amefuru Sato: Rating=CERO_A
Updated Arcana Famiglia: Vascello Phantasma no Majutsushi: Developer=Comfort
Updated Arcana Famiglia: Vascello Phantasma no Majutsushi: Rating=CERO_B
Updated Airport Simulator: Developer=United Independent Entertainment GmbH
Updated Winning Post 7: Maximum 2007: Developer=Koei
Updated Winning Post 7: Maximum 2007: Rating=CERO_A
Updated Vitamin R: Developer=Rejet
Updated Vitamin R: Rating=CERO_B
Updated Angelique Retour: Developer=Ruby Party
Updated Angelique Retour: Rating=CERO_B
Updated Gakuen K: Wonderful Schoo

In [None]:
import glob

# Get all CSV files in the batch folder
batch_files = glob.glob(os.path.join(vg_sales_path, vg_sales_res["batch"], "*.csv"))

# Read and concatenate all batch CSV files into a single DataFrame
batch_dfs = [pd.read_csv(file, index_col=0) for file in batch_files]
concatenated_df = pd.concat(batch_dfs, ignore_index=False)

In [98]:
# Find rows with non-null Developer or Rating
rows_with_developer_or_rating = concatenated_df[(concatenated_df['Developer'].notna()) | (concatenated_df['Rating'].notna())]

In [104]:
# Create a copy of vg_sales_v1 to vg_sales_v2
vg_sales_v2 = vg_sales_v1.copy()

# Drop duplicate indices from rows_with_developer_or_rating
rows_with_developer_or_rating = rows_with_developer_or_rating[~rows_with_developer_or_rating.index.duplicated(keep='first')]

# Fill missing values in vg_sales_v2 with values from rows_with_developer_or_rating
vg_sales_v2.update(rows_with_developer_or_rating)

# Create a folder in results_path if it doesn't exist
folder_name = 'vg_sales_v2'
folder_path = os.path.join(results_path, folder_name)
os.makedirs(folder_path, exist_ok=True)

# Generate a unique filename based on the current date and time
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'vg_sales_v2_{timestamp}.csv'

# Save the updated dataframe to a separate CSV file
save_to_csv(vg_sales_v2, folder_path, filename)

File saved as results/vg_sales_v2/vg_sales_v2_20241210_202447.csv


In [115]:
vg_sales_v2 = pd.read_csv(vg_sales_path + vg_sales_res["v2"], index_col=0)

print("--------- FROM ---------\n")

# NaN count vg_sales_v0
nan_count = vg_sales.isna().sum()
print(nan_count)

print("\n--------- TO ---------\n")

# NaN count vg_sales_v2
nan_count = vg_sales_v2.isna().sum()
print(nan_count)

--------- FROM ---------

Platform              0
Year_of_Release       1
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8610
Critic_Count       8610
User_Score         6720
User_Count         9166
Developer          6637
Rating             6783
dtype: int64

--------- TO ---------

Platform              0
Year_of_Release       0
Genre                 2
Publisher            51
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8598
Critic_Count       8598
User_Score         6711
User_Count         9153
Developer          3056
Rating             3592
dtype: int64
