In [15]:
import pandas as pd
from tabulate import tabulate
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import DistanceMetric, pairwise_distances

In [16]:
data = pd.read_csv("/Users/michaeljeon/Desktop/INST414/Module 3 Assignment/Video_Games.csv")

clean_data = data.drop(['Year_of_Release', 'EU_Sales', 'JP_Sales', 'Other_Sales', 'Critic_Count', 'User_Count'], axis=1)

sorted_data = clean_data.sort_values(by='Global_Sales', ascending=False)

print(tabulate(sorted_data.head(10), headers='keys'))

    Name                       Platform    Genre         Publisher      NA_Sales    Global_Sales    Critic_Score    User_Score  Developer    Rating
--  -------------------------  ----------  ------------  -----------  ----------  --------------  --------------  ------------  -----------  --------
 0  Wii Sports                 Wii         Sports        Nintendo          41.36           82.53              76           8    Nintendo     E
 1  Super Mario Bros.          NES         Platform      Nintendo          29.08           40.24             nan         nan    nan          nan
 2  Mario Kart Wii             Wii         Racing        Nintendo          15.68           35.52              82           8.3  Nintendo     E
 3  Wii Sports Resort          Wii         Sports        Nintendo          15.61           32.77              80           8    Nintendo     E
 4  Pokemon Red/Pokemon Blue   GB          Role-Playing  Nintendo          11.27           31.37             nan         nan    

In [47]:
print(data.isnull().sum())
print(data.dtypes)
print("Duplicates: ", data.duplicated().sum())



Name                  2
Platform              0
Year_of_Release     269
Genre                 2
Publisher            54
NA_Sales              0
EU_Sales              0
JP_Sales              0
Other_Sales           0
Global_Sales          0
Critic_Score       8582
Critic_Count       8582
User_Score         9129
User_Count         9129
Developer          6623
Rating             6769
dtype: int64
Name                object
Platform            object
Year_of_Release    float64
Genre               object
Publisher           object
NA_Sales           float64
EU_Sales           float64
JP_Sales           float64
Other_Sales        float64
Global_Sales       float64
Critic_Score       float64
Critic_Count       float64
User_Score         float64
User_Count         float64
Developer           object
Rating              object
dtype: object
Duplicates:  0


In [37]:
data_clean = data.dropna(subset=['Genre', 'Global_Sales']).copy()

wii_row = data_clean[data_clean['Name'] == 'Wii Sports']
if wii_row.empty:
    print("Wii Sports not found in dataset.")
else:
    wii_genre = set(wii_row['Genre'].values[0].lower().split(', '))

    sorted_by_sales = data_clean.sort_values(by='Global_Sales', ascending=False).copy()

    def jaccard_similarity(genre_str):
        other_genre = set(genre_str.lower().split(', '))
        intersection = wii_genre.intersection(other_genre)
        union = wii_genre.union(other_genre)
        return len(intersection) / len(union) if union else 0

    sorted_by_sales['Jaccard_Similarity'] = sorted_by_sales['Genre'].apply(jaccard_similarity)

    exact_matches = sorted_by_sales[sorted_by_sales['Jaccard_Similarity'] == 1]

    top_10_exact = exact_matches.head(11)

    print(top_10_exact[['Name', 'Genre', 'Global_Sales', 'Jaccard_Similarity']].to_string(index=False))


                              Name  Genre  Global_Sales  Jaccard_Similarity
                        Wii Sports Sports         82.53                 1.0
                 Wii Sports Resort Sports         32.77                 1.0
                           Wii Fit Sports         22.70                 1.0
                      Wii Fit Plus Sports         21.79                 1.0
                           FIFA 16 Sports          8.57                 1.0
Mario & Sonic at the Olympic Games Sports          7.99                 1.0
                           FIFA 17 Sports          7.59                 1.0
                     Zumba Fitness Sports          6.71                 1.0
                           FIFA 12 Sports          6.65                 1.0
                           FIFA 14 Sports          6.47                 1.0
                     Kinect Sports Sports          6.19                 1.0


In [40]:
data_clean = data.dropna(subset=['Rating', 'Global_Sales']).copy()

wii_row = data_clean[data_clean['Name'] == 'Wii Sports']
if wii_row.empty:
    print("Wii Sports not found in dataset.")
else:
    wii_rating = wii_row['Rating'].values[0].lower()

    sorted_by_sales = data_clean.sort_values(by='Global_Sales', ascending=False).copy()

    def jaccard_similarity(rating_str):
        return 1 if rating_str.lower() == wii_rating else 0

    sorted_by_sales['Jaccard_Similarity'] = sorted_by_sales['Rating'].apply(jaccard_similarity)

    exact_matches = sorted_by_sales[sorted_by_sales['Jaccard_Similarity'] == 1]

    top_10_exact = exact_matches.head(11)

    print(top_10_exact[['Name', 'Rating', 'Global_Sales', 'Jaccard_Similarity']].to_string(index=False))


                                        Name Rating  Global_Sales  Jaccard_Similarity
                                  Wii Sports      E         82.53                   1
                              Mario Kart Wii      E         35.52                   1
                           Wii Sports Resort      E         32.77                   1
                       New Super Mario Bros.      E         29.80                   1
                                    Wii Play      E         28.92                   1
                   New Super Mario Bros. Wii      E         28.32                   1
                               Mario Kart DS      E         23.21                   1
                                     Wii Fit      E         22.70                   1
                          Kinect Adventures!      E         21.81                   1
                                Wii Fit Plus      E         21.79                   1
Brain Age: Train Your Brain in Minutes a Day      E   

In [None]:
data['User_Score'] = pd.to_numeric(data['User_Score'], errors='coerce')

data_clean = data.dropna(subset=['User_Score', 'Global_Sales']).copy()

wii_row = data_clean[data_clean['Name'] == 'Wii Sports']
if wii_row.empty:
    print("Wii Sports not found in dataset.")
else:
    wii_score = wii_row['User_Score'].values[0]

    data_clean = data_clean[data_clean['Name'] != 'Wii Sports'].copy()
    data_clean['Euclidean_Distance'] = data_clean['User_Score'].apply(lambda x: np.sqrt((x - wii_score) ** 2))

    exact_matches = data_clean[data_clean['Euclidean_Distance'] == 0].copy()

    top_10_exact_matches = exact_matches.sort_values(by='Global_Sales', ascending=False).head(11)

    print(top_10_exact_matches[['Name', 'Global_Sales', 'User_Score', 'Euclidean_Distance']].to_string(index=False))


                                  Name  Global_Sales  User_Score  Euclidean_Distance
                     Wii Sports Resort         32.77         8.0                 0.0
                            Just Dance          7.20         8.0                 0.0
                    Forza Motorsport 3          5.49         8.0                 0.0
The Legend of Zelda: Phantom Hourglass          5.08         8.0                 0.0
                        FIFA Soccer 11          5.07         8.0                 0.0
   Grand Theft Auto: Vice City Stories          5.03         8.0                 0.0
                         Madden NFL 06          4.91         8.0                 0.0
    The Legend of Zelda: Skyward Sword          3.95         8.0                 0.0
                             Fallout 3          3.94         8.0                 0.0
                   The Sims: Unleashed          3.76         8.0                 0.0
   Cooking Mama 2: Dinner With Friends          3.58         8.0 