In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [None]:
# Load both raw CSV exports from Drive in one pass:
#   df1 <- gamesrating.csv, df2 <- vgsales.csv
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/content/drive/MyDrive/Senior Year/Machine Learning/Data/gamesrating.csv',
        '/content/drive/MyDrive/Senior Year/Machine Learning/Data/vgsales.csv',
    )
)

In [None]:
# Preview the frame read from gamesrating.csv (the notebook truncates the display).
df1

Unnamed: 0,name,platform,release_date,summary,metascore,userscore
0,! SPACE ACCIDENT !,PC,"February 10, 2022",The year is 2119. A turning point has begun in...,tbd,tbd
1,! That Bastard Is Trying To Steal Our Gold !,PC,"May 11, 2014",It's a fun puzzle game where you need to steal...,tbd,3.4
2,!4RC4N01D!,PC,"January 12, 2018",Hardcore arkanoid in the spirit of old games f...,tbd,4.0
3,!4RC4N01D! 2: Retro Edition,PC,"February 6, 2018","Everyone dreams of returning 2007, but no one ...",tbd,3.8
4,!4RC4N01D! 3: Cold Space,PC,"March 8, 2018",!4RC4N01D! returns! This time we have an accid...,tbd,3.0
...,...,...,...,...,...,...
142412,{Undefined},PC,"August 20, 2021","A sandbox survival game, set on life-sized vox...",tbd,tbd
142413,~ Daydream ~,PC,"May 26, 2022",~Daydream~is an action 2D platform jumping gam...,tbd,tbd
142414,~Azur Ring~virgin and slave's phylacteries,PC,"July 17, 2020",New DIABLO-Like adventure game. Legendary gear...,tbd,tbd
142415,~Gigantify~,PC,"January 23, 2022",This is a small Puzzle Platformer where you dy...,tbd,tbd


In [None]:
# Preview the frame read from vgsales.csv (the notebook truncates the display).
df2

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82
3,4,Wii Sports Resort,Wii,2009.0,Sports,Nintendo,15.75,11.01,3.28,2.96,33.00
4,5,Pokemon Red/Pokemon Blue,GB,1996.0,Role-Playing,Nintendo,11.27,8.89,10.22,1.00,31.37
...,...,...,...,...,...,...,...,...,...,...,...
16593,16596,Woody Woodpecker in Crazy Castle 5,GBA,2002.0,Platform,Kemco,0.01,0.00,0.00,0.00,0.01
16594,16597,Men in Black II: Alien Escape,GC,2003.0,Shooter,Infogrames,0.01,0.00,0.00,0.00,0.01
16595,16598,SCORE International Baja 1000: The Official Game,PS2,2008.0,Racing,Activision,0.00,0.00,0.00,0.00,0.01
16596,16599,Know How 2,DS,2010.0,Puzzle,7G//AMES,0.00,0.01,0.00,0.00,0.01


In [None]:
# Add a capitalised 'Name' column to df1 holding the same values as 'name'.
# NOTE(review): a later cell renames 'Name' to 'name' on this same frame, which
# collides with the 'name' column that already exists here.
df1['Name'] = df1['name']

In [None]:
# Data Cleaning and Transformation

# NOTE(review): the variable names in this cell are swapped relative to the data
# they hold. df1 was read from gamesrating.csv and df2 from vgsales.csv; the
# names are kept as they are because later cells refer to them.

# Lower-case the column labels. Mapping keys that are not present in the frame
# are ignored by DataFrame.rename.
vgsales_df_renamed = df1.rename(columns={
    'Name': 'name',
    'Platform': 'platform',
    'Year': 'year',
    'Genre': 'genre',
    'Publisher': 'publisher'
})

# Renaming 'Name' to 'name' collides with the 'name' column the frame already
# has, leaving two columns with the same label. Keep only the first occurrence
# of each label so that vgsales_df_renamed['name'] is a Series (with a .str
# accessor) rather than a two-column DataFrame.
vgsales_df_renamed = vgsales_df_renamed.loc[
    :, ~vgsales_df_renamed.columns.duplicated()
].copy()

# Snapshot of the dtypes of both frames before any conversion.
games_rating_dtypes = df2.dtypes
vgsales_dtypes = vgsales_df_renamed.dtypes

# Date standardisation (parsing 'release_date' / turning the year into a date)
# is not done in this cell.



In [None]:
# Inspect the raw 'release_date' values (still plain strings, dtype object).
vgsales_df_renamed['release_date']

0         February 10, 2022
1              May 11, 2014
2          January 12, 2018
3          February 6, 2018
4             March 8, 2018
                ...        
142412      August 20, 2021
142413         May 26, 2022
142414        July 17, 2020
142415     January 23, 2022
142416    December 21, 2018
Name: release_date, Length: 142417, dtype: object

In [None]:
# Parse the 'Month D, YYYY' strings into datetime64[ns]; values that do not match
# the format become NaT. pd.to_datetime already returns datetime64[ns], so no
# round trip through .dt.date / astype is needed.
vgsales_df_renamed['release_date'] = pd.to_datetime(
    vgsales_df_renamed['release_date'], errors='coerce', format='%B %d, %Y'
)

# Drop a redundant 'year' column if the frame has one. The parsed 'release_date'
# is kept: dropping it would discard the conversion done just above.
vgsales_df_transformed = vgsales_df_renamed.drop(columns=['year'], errors='ignore')

# Turn df2's numeric 'Year' into January 1 of that year. The column is float
# (e.g. 2006.0); without a format, numeric input is read as an offset from the
# Unix epoch, which would place every row in 1970. Convert the known years to
# 'YYYY' strings first. Rows with a missing year are absent from release_years
# and become NaT when the result is aligned back onto df2's index.
release_years = df2['Year'].dropna().astype(int).astype(str)
df2['release_date'] = pd.to_datetime(release_years, format='%Y', errors='coerce')

# Data types after transformation
games_rating_dtypes_after = df2.dtypes
vgsales_dtypes_after = vgsales_df_transformed.dtypes

games_rating_dtypes_after, vgsales_dtypes_after

(Rank                     int64
 Name                    object
 Platform                object
 Year                   float64
 Genre                   object
 Publisher               object
 NA_Sales               float64
 EU_Sales               float64
 JP_Sales               float64
 Other_Sales            float64
 Global_Sales           float64
 release_date    datetime64[ns]
 dtype: object,
 name         object
 platform     object
 summary      object
 metascore    object
 userscore    object
 name         object
 dtype: object)

In [None]:
# Column labels available on each side of the join, kept for inspection.
columns_in_vgsales = vgsales_df_transformed.columns
columns_in_games_rating = df2.columns

# vgsales_df_transformed can carry the same label twice (two 'name' columns).
# Selecting a duplicated label returns a DataFrame, which has no .str accessor
# and raises AttributeError. Keep the first occurrence of each label so the
# string clean-up below operates on a Series.
vgsales_df_transformed = vgsales_df_transformed.loc[
    :, ~vgsales_df_transformed.columns.duplicated()
].copy()

# Normalise the join keys on both sides: trim whitespace and lower-case.
# df2 uses capitalised labels, so lower-case key columns are added next to them.
df2['name'] = df2['Name'].str.strip().str.lower()
df2['platform'] = df2['Platform'].str.strip().str.lower()

vgsales_df_transformed['name'] = vgsales_df_transformed['name'].str.strip().str.lower()
vgsales_df_transformed['platform'] = vgsales_df_transformed['platform'].str.strip().str.lower()

# Inner join: keep only (name, platform) pairs present in both frames.
merged_df = pd.merge(df2, vgsales_df_transformed, on=['name', 'platform'], how='inner')

# Keep the columns of interest, skipping any that the merge did not produce.
selected_columns = ['name', 'platform', 'summary', 'metascore', 'userscore', 'Global_Sales']
selected_columns = [col for col in selected_columns if col in merged_df.columns]

merged_df_selected = merged_df[selected_columns]

# Displaying the first few rows of the merged and selected dataframe
merged_df_selected.head()


AttributeError: ignored

In [None]:
# Merge df2 with vgsales_df_transformed on the 'name' and 'platform' columns so
# that each matched game carries its global sales, metascore, etc.

# vgsales_df_transformed can carry the same label twice (two 'name' columns).
# Selecting a duplicated label returns a DataFrame, which has no .str accessor
# and raises AttributeError. Keep the first occurrence of each label so the
# string clean-up below operates on a Series.
vgsales_df_transformed = vgsales_df_transformed.loc[
    :, ~vgsales_df_transformed.columns.duplicated()
].copy()

# Before merging, bring the join keys into the same format in both frames:
# trimmed and lower-cased.
df2['name'] = df2['Name'].str.strip().str.lower()
df2['platform'] = df2['Platform'].str.strip().str.lower()

vgsales_df_transformed['name'] = vgsales_df_transformed['name'].str.strip().str.lower()
vgsales_df_transformed['platform'] = vgsales_df_transformed['platform'].str.strip().str.lower()

# Column labels on each side, shown together with the result at the end of the cell.
available_columns_games_rating = df2.columns
available_columns_vgsales = vgsales_df_transformed.columns

# Inner join: keep only (name, platform) pairs present in both frames.
merged_df = pd.merge(df2, vgsales_df_transformed, on=['name', 'platform'], how='inner')

# Keep the columns of interest, skipping any that the merge did not produce.
selected_columns = ['name', 'platform', 'summary', 'metascore', 'userscore', 'Global_Sales']
selected_columns = [col for col in selected_columns if col in merged_df.columns]

merged_df_selected = merged_df[selected_columns]

# Displaying the first few rows of the merged and selected dataframe
merged_df_selected.head(), available_columns_games_rating, available_columns_vgsales


AttributeError: ignored

In [None]:
# Write df1 to Drive as a tab-separated file. The index is written as the
# first column (pandas' default).
# NOTE(review): the file name suggests the merged sales/ratings table, but the
# frame written here is df1 - confirm this is the intended frame.
df1.to_csv(
    path_or_buf='/content/drive/MyDrive/Senior Year/Machine Learning/Data/VideoGameSalesRating.tsv',
    sep='\t',
)

In [None]:
# First rows of df1, which now also carries the capitalised 'Name' column.
df1.head()

Unnamed: 0,name,platform,release_date,summary,metascore,userscore,Name
0,! SPACE ACCIDENT !,PC,"February 10, 2022",The year is 2119. A turning point has begun in...,tbd,tbd,! SPACE ACCIDENT !
1,! That Bastard Is Trying To Steal Our Gold !,PC,"May 11, 2014",It's a fun puzzle game where you need to steal...,tbd,3.4,! That Bastard Is Trying To Steal Our Gold !
2,!4RC4N01D!,PC,"January 12, 2018",Hardcore arkanoid in the spirit of old games f...,tbd,4.0,!4RC4N01D!
3,!4RC4N01D! 2: Retro Edition,PC,"February 6, 2018","Everyone dreams of returning 2007, but no one ...",tbd,3.8,!4RC4N01D! 2: Retro Edition
4,!4RC4N01D! 3: Cold Space,PC,"March 8, 2018",!4RC4N01D! returns! This time we have an accid...,tbd,3.0,!4RC4N01D! 3: Cold Space


In [None]:
# Compiling all the necessary code into a single cell for clarity and ease of use.
# The cell re-reads both CSV files, so it does not depend on frames modified by
# earlier cells.

import pandas as pd

# Replace the file paths with the correct paths to your datasets

# Read the files
games_rating_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Machine Learning/Data/gamesrating.csv')
vgsales_df = pd.read_csv('/content/drive/MyDrive/Senior Year/Machine Learning/Data/vgsales.csv')

# Renaming columns in vgsales_df to match the naming convention of games_rating_df
vgsales_df_renamed = vgsales_df.rename(columns={
    'Name': 'name',
    'Platform': 'platform',
    'Year': 'year',
    'Genre': 'genre',
    'Publisher': 'publisher'
})

# Standardizing date formats.
# Ratings: 'Month D, YYYY' strings -> datetime64[ns]; non-matching values become NaT.
games_rating_df['release_date'] = pd.to_datetime(games_rating_df['release_date'], errors='coerce', format='%B %d, %Y')

# Sales: the year is stored as a float (e.g. 2006.0). Such a value stringifies
# as '2006.0', which can fail to match format='%Y' and then be silently coerced
# to NaT. Convert the known years to 'YYYY' strings first. Rows with a missing
# year are absent from release_years and become NaT when the result is aligned
# back onto the frame's index.
release_years = vgsales_df_renamed['year'].dropna().astype(int).astype(str)
vgsales_df_renamed['release_date'] = pd.to_datetime(release_years, format='%Y', errors='coerce')

# Dropping the original 'year' column in vgsales_df as it's now redundant
vgsales_df_transformed = vgsales_df_renamed.drop(columns=['year'])

# Normalise the join keys in both frames: trim whitespace and lower-case.
games_rating_df['name'] = games_rating_df['name'].str.strip().str.lower()
games_rating_df['platform'] = games_rating_df['platform'].str.strip().str.lower()

vgsales_df_transformed['name'] = vgsales_df_transformed['name'].str.strip().str.lower()
vgsales_df_transformed['platform'] = vgsales_df_transformed['platform'].str.strip().str.lower()

# Inner join: keep only (name, platform) pairs present in both frames.
merged_df = pd.merge(games_rating_df, vgsales_df_transformed, on=['name', 'platform'], how='inner')

# Selecting relevant columns, skipping any that the merge did not produce
selected_columns = ['name', 'platform', 'summary', 'metascore', 'userscore', 'Global_Sales']
selected_columns = [col for col in selected_columns if col in merged_df.columns]

# .copy() makes the selection an independent frame, so later column assignments
# on it do not trigger SettingWithCopyWarning.
merged_df_selected = merged_df[selected_columns].copy()

merged_df_selected.head()


Unnamed: 0,name,platform,summary,metascore,userscore,Global_Sales
0,007: quantum of solace,pc,Introducing a more lethal and cunningly effici...,70,6.3,0.03
1,007: quantum of solace,wii,Introducing a more lethal and cunningly effici...,54,7.3,0.67
2,007: quantum of solace,ds,Introducing a more lethal and cunningly effici...,65,tbd,0.14
3,1 vs. 100,ds,1 vs. 100 puts you in the hot seat as you play...,38,3.5,0.09
4,10 minute solution,wii,10 Minute Solution will bring focused exercise...,tbd,tbd,0.08


In [None]:
# Show the merged selection (the notebook truncates long frames in the display).
merged_df_selected

Unnamed: 0,name,platform,summary,metascore,userscore,Global_Sales
0,007: quantum of solace,pc,Introducing a more lethal and cunningly effici...,70,6.3,0.03
1,007: quantum of solace,wii,Introducing a more lethal and cunningly effici...,54,7.3,0.67
2,007: quantum of solace,ds,Introducing a more lethal and cunningly effici...,65,tbd,0.14
3,1 vs. 100,ds,1 vs. 100 puts you in the hot seat as you play...,38,3.5,0.09
4,10 minute solution,wii,10 Minute Solution will bring focused exercise...,tbd,tbd,0.08
...,...,...,...,...,...,...
3750,thinksmart,ds,Power UP Your Brain! Think outside the box and...,tbd,tbd,0.10
3751,thinksmart,ds,This original brain training program helps you...,tbd,tbd,0.10
3752,thinksmart: chess for kids,ds,Greatly enhances your child's thinking process...,tbd,tbd,0.01
3753,udraw studio,wii,"Packaged with the uDraw GameTablet, uDraw Stud...",71,tbd,2.46
