In [48]:
import csv
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('omw-1.4')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Preprocessing functions
def remove_stopwords(tokens):
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens

def lemmatize_tokens(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens

# User input
user_input = input("Enter your query: ")

# Preprocess user input
input_tokens = word_tokenize(user_input)
input_tokens = remove_stopwords(input_tokens)
input_tokens = lemmatize_tokens(input_tokens)

# Load CSV data
csv_file = 'video_game_cleaned.csv'  # Replace with your CSV file path
rows = []
with open(csv_file, 'r') as file:
    csv_reader = csv.reader(file)
    for row in csv_reader:
        rows.append(row)

# Preprocess and tokenize CSV data
documents = [' '.join(row) for row in rows]
corpus_tokens = [word_tokenize(doc) for doc in documents]
corpus_tokens = [remove_stopwords(tokens) for tokens in corpus_tokens]
corpus_tokens = [lemmatize_tokens(tokens) for tokens in corpus_tokens]

# Convert tokens back to strings for vectorization
corpus_texts = [' '.join(tokens) for tokens in corpus_tokens]

# Vectorize input and corpus texts
vectorizer = TfidfVectorizer()
vectorized_input = vectorizer.fit_transform([user_input])
vectorized_corpus = vectorizer.transform(corpus_texts)

# Compute cosine similarity between input and corpus
cosine_similarities = cosine_similarity(vectorized_input, vectorized_corpus).flatten()

# Find indices of rows with the highest cosine similarity
top_similar_indices = cosine_similarities.argsort()[::-1]

# Define the number of top matches you want to retrieve
top_matches_count = 1

# Get the top matching rows
top_matching_rows = [rows[i] for i in top_similar_indices[:top_matches_count]]




[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rutgersbootcamp/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rutgersbootcamp/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rutgersbootcamp/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/rutgersbootcamp/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Enter your query: madden


In [49]:
# Print the top matching rows
print(f"Top {top_matches_count} matching rows:")
for row in top_matching_rows:
    input_video_game = row[1].lower()
    
input_video_game

Top 1 matching rows:


'madden nfl 09'

In [50]:
# Import dependencies
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

df = pd.read_csv('https://raw.githubusercontent.com/mshawn12/video-game-sales-analysis/main/resources/video_game_cleaned.csv')
df.head()

Unnamed: 0,uniqueid,name,yearreleased,genre,publisher,developer,rating,nasales,eusales,jpsales,othersales,globalsales,criticscore,criticcount,userscore,usercount
0,1,.hack//Infection Part 1,2002,Role-Playing,Atari,CyberConnect2,T,0.49,0.38,0.26,0.13,1.27,75,35,8.5,60
1,2,.hack//Mutation Part 2,2002,Role-Playing,Atari,CyberConnect2,T,0.23,0.18,0.2,0.06,0.68,76,24,8.9,81
2,3,.hack//Outbreak Part 3,2002,Role-Playing,Atari,CyberConnect2,T,0.14,0.11,0.17,0.04,0.46,70,23,8.7,19
3,4,[Prototype],2009,Action,Activision,Radical Entertainment,M,0.84,0.35,0.0,0.12,1.31,78,83,7.8,356
4,5,[Prototype],2009,Action,Activision,Radical Entertainment,M,0.65,0.4,0.0,0.19,1.24,79,53,7.7,308


In [51]:
df = df.drop_duplicates(subset='name')

In [52]:
sample_size = 4420
df = df.sample(n=sample_size, replace=False, random_state=390)

df = df.reset_index()
df = df.drop('index',axis=1)

In [53]:
df.head(50)

Unnamed: 0,uniqueid,name,yearreleased,genre,publisher,developer,rating,nasales,eusales,jpsales,othersales,globalsales,criticscore,criticcount,userscore,usercount
0,63,Activision Anthology,2002,Misc,Activision,Contraband Entertainment,E,0.25,0.19,0.0,0.06,0.5,75,13,7.7,6
1,2246,Grand Theft Auto: Vice City,2002,Action,Take-Two Interactive,Rockstar North,M,8.41,5.49,0.47,1.78,16.15,95,62,8.7,730
2,2005,Fishing Master,2007,Misc,Konami Digital Entertainment,Hudson Soft,E,0.29,0.0,0.0,0.02,0.31,58,9,8.1,9
3,1933,Fighter Maker 2,2002,Fighting,Enterbrain,Enterbrain,T,0.03,0.03,0.0,0.01,0.07,58,11,7.8,5
4,4772,Romance of the Three Kingdoms X,2005,Strategy,Tecmo Koei,Koei,T,0.04,0.03,0.0,0.01,0.09,71,13,8.5,10
5,1245,Destruction Derby Arenas,2004,Racing,Sony Computer Entertainment,Studio 33,T,0.06,0.05,0.0,0.02,0.13,57,27,7.8,14
6,2390,Hammerin' Hero,2008,Action,Irem Software Engineering,Irem,E10+,0.06,0.0,0.0,0.01,0.07,73,19,7.3,8
7,3346,Metal Gear Solid: Digital Graphic Novel,2006,Misc,Konami Digital Entertainment,Kojima Productions,M,0.02,0.0,0.02,0.0,0.04,78,22,7.2,21
8,1443,Dragon Ball Z: Attack of the Saiyans,2009,Role-Playing,Namco Bandai Games,Monolith Soft,E10+,0.12,0.02,0.21,0.01,0.36,73,21,7.9,24
9,90,Age of Empires III,2005,Strategy,Microsoft Game Studios,Ensemble Studios,T,0.0,0.33,0.0,0.05,0.38,81,52,7.7,519


In [54]:
def clean_text_pub(publisher):
    result = str(publisher).lower()
    return(result.replace(' ',''))

In [55]:
df['publisher'] = df['publisher'].apply(clean_text_pub)

In [56]:
def clean_text_dev(developer):
    result = str(developer).lower()
    return(result.replace(' ',''))

In [57]:
df['developer'] = df['developer'].apply(clean_text_dev)

In [58]:
df['name'] = df['name'].str.lower()
df['publisher'] = df['publisher'].str.lower()
df['genre'] = df['genre'].str.lower()
df['developer'] = df['developer'].str.lower()

In [59]:
df.columns


Index(['uniqueid', 'name', 'yearreleased', 'genre', 'publisher', 'developer',
       'rating', 'nasales', 'eusales', 'jpsales', 'othersales', 'globalsales',
       'criticscore', 'criticcount', 'userscore', 'usercount'],
      dtype='object')

In [60]:
df2 = df.drop(['nasales', 'eusales', 'jpsales', 'othersales', 'globalsales',
       'criticscore', 'criticcount', 'userscore', 'usercount'],axis=1)

df2['data'] = df2[df2.columns[1:]].apply(
    lambda x: ' '.join(x.dropna().astype(str)),
    axis=1
)

In [61]:
vectorizer = CountVectorizer()
vectorized = vectorizer.fit_transform(df2['data'])

In [62]:
similarities = cosine_similarity(vectorized)

In [41]:
# df = pd.DataFrame(similarities)
# df.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4410,4411,4412,4413,4414,4415,4416,4417,4418,4419
0,1.0,0.111803,0.144338,0.125,0.0,0.0,0.0,0.111803,0.0,0.0,...,0.0,0.125,0.133631,0.0,0.0,0.0,0.0,0.133631,0.0,0.0
1,0.111803,1.0,0.0,0.111803,0.0,0.0,0.119523,0.0,0.0,0.0,...,0.0,0.111803,0.0,0.0,0.0,0.0,0.0,0.119523,0.0,0.0
2,0.144338,0.0,1.0,0.0,0.0,0.0,0.0,0.258199,0.0,0.0,...,0.0,0.0,0.154303,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.125,0.111803,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.125,0.0,0.0,0.0,0.0,0.0,0.133631,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.19245,0.353553,...,0.0,0.0,0.0,0.0,0.111111,0.0,0.117851,0.0,0.0,0.117851
5,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.133631,0.285714,0.0,0.125988,0.0,0.0,0.142857,0.239046,0.0
6,0.0,0.119523,0.0,0.0,0.0,0.0,1.0,0.0,0.109109,0.0,...,0.0,0.0,0.142857,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.111803,0.0,0.258199,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.119523,0.0,0.0,0.1,0.0,0.0,0.0,0.111803
8,0.0,0.0,0.0,0.0,0.19245,0.0,0.109109,0.0,1.0,0.102062,...,0.0,0.0,0.109109,0.204124,0.0,0.182574,0.102062,0.0,0.0,0.204124
9,0.0,0.0,0.0,0.0,0.353553,0.0,0.0,0.0,0.102062,1.0,...,0.0,0.0,0.0,0.0,0.117851,0.0,0.125,0.0,0.0,0.125


In [63]:
df = pd.DataFrame(similarities, columns=df['name'], index=df['name']).reset_index()

In [66]:
recommendations = pd.DataFrame(df.nlargest(21,input_video_game)['name'])
recommendations = recommendations[recommendations['name']!=input_video_game]
print('Here are some recommended video games for you from Group 1')
print(recommendations)

Here are some recommended video games for you from Group 1
                         name
2923   madden nfl 09 all-play
3767        nfl head coach 09
145                  nfl tour
341             madden nfl 11
515             madden nfl 12
805             madden nfl 13
1544            madden nfl 08
2565          madden nfl 2005
2888         ncaa football 09
3163            madden nfl 10
3425          madden nfl 2004
4086      madden nfl football
3400  tiger woods pga tour 09
1359               nfl street
3387             nfl street 2
285        ncaa basketball 09
405             madden nfl 25
490           madden nfl 2001
496            fifa soccer 09
809             madden nfl 07


In [68]:
print(similarities)

[[1.         0.1118034  0.14433757 ... 0.13363062 0.         0.        ]
 [0.1118034  1.         0.         ... 0.11952286 0.         0.        ]
 [0.14433757 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.13363062 0.11952286 0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [70]:
similarities_df = pd.DataFrame(similarities)

In [77]:
similarity_score = similarities_df.sort_values([0],ascending=False)
similarity_score.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4410,4411,4412,4413,4414,4415,4416,4417,4418,4419
0,1.0,0.111803,0.144338,0.125,0.0,0.0,0.0,0.111803,0.0,0.0,...,0.0,0.125,0.133631,0.0,0.0,0.0,0.0,0.133631,0.0,0.0
1620,0.533002,0.0,0.123091,0.0,0.201008,0.0,0.113961,0.095346,0.174078,0.213201,...,0.0,0.0,0.227921,0.0,0.100504,0.0,0.1066,0.0,0.0,0.1066
544,0.533002,0.0,0.123091,0.0,0.100504,0.0,0.0,0.095346,0.087039,0.0,...,0.0,0.0,0.113961,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1783,0.533002,0.0,0.123091,0.0,0.100504,0.0,0.113961,0.095346,0.174078,0.0,...,0.13484,0.0,0.227921,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1853,0.49029,0.0,0.113228,0.0,0.0,0.0,0.209657,0.087706,0.160128,0.0,...,0.0,0.0,0.209657,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3442,0.474342,0.141421,0.0,0.158114,0.0,0.0,0.0,0.0,0.0,0.0,...,0.2,0.158114,0.0,0.0,0.0,0.0,0.158114,0.169031,0.0,0.0
4376,0.474342,0.282843,0.0,0.158114,0.0,0.0,0.169031,0.0,0.0,0.0,...,0.0,0.158114,0.0,0.0,0.0,0.0,0.0,0.169031,0.0,0.0
4216,0.471405,0.0,0.0,0.0,0.111111,0.0,0.0,0.0,0.19245,0.0,...,0.149071,0.0,0.0,0.0,0.0,0.0,0.117851,0.0,0.0,0.0
2475,0.447214,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.141421,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1,0.0
4188,0.433013,0.0,0.166667,0.0,0.0,0.0,0.154303,0.129099,0.117851,0.0,...,0.0,0.0,0.154303,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
top_20 = similarity_score.iloc[1:20].mean()
top_20

0       0.448790
1       0.055295
2       0.084196
3       0.052937
4       0.033638
          ...   
4415    0.012581
4416    0.020135
4417    0.049073
4418    0.012058
4419    0.019677
Length: 4420, dtype: float64

In [84]:
similarity_score.reset_index()
similarity_score

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4410,4411,4412,4413,4414,4415,4416,4417,4418,4419
0,1.000000,0.111803,0.144338,0.125,0.000000,0.0,0.000000,0.111803,0.000000,0.000000,...,0.00000,0.125,0.133631,0.00,0.000000,0.000000,0.0000,0.133631,0.000000,0.0000
1620,0.533002,0.000000,0.123091,0.000,0.201008,0.0,0.113961,0.095346,0.174078,0.213201,...,0.00000,0.000,0.227921,0.00,0.100504,0.000000,0.1066,0.000000,0.000000,0.1066
544,0.533002,0.000000,0.123091,0.000,0.100504,0.0,0.000000,0.095346,0.087039,0.000000,...,0.00000,0.000,0.113961,0.00,0.000000,0.000000,0.0000,0.000000,0.000000,0.0000
1783,0.533002,0.000000,0.123091,0.000,0.100504,0.0,0.113961,0.095346,0.174078,0.000000,...,0.13484,0.000,0.227921,0.00,0.000000,0.000000,0.0000,0.000000,0.000000,0.0000
1853,0.490290,0.000000,0.113228,0.000,0.000000,0.0,0.209657,0.087706,0.160128,0.000000,...,0.00000,0.000,0.209657,0.00,0.000000,0.000000,0.0000,0.000000,0.000000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609,0.000000,0.119523,0.000000,0.000,0.000000,0.0,0.142857,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.00,0.000000,0.000000,0.0000,0.000000,0.000000,0.0000
1610,0.000000,0.119523,0.000000,0.000,0.000000,0.0,0.142857,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.00,0.251976,0.000000,0.0000,0.000000,0.000000,0.0000
1611,0.000000,0.000000,0.000000,0.000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000,0.000000,0.00,0.000000,0.000000,0.0000,0.000000,0.111803,0.0000
1612,0.000000,0.000000,0.154303,0.000,0.125988,0.0,0.142857,0.000000,0.109109,0.133631,...,0.00000,0.000,0.142857,0.00,0.000000,0.000000,0.0000,0.000000,0.000000,0.0000


In [87]:
c1 = similarity_score[[0]].reset_index()
c1

Unnamed: 0,index,0
0,0,1.000000
1,1620,0.533002
2,544,0.533002
3,1783,0.533002
4,1853,0.490290
...,...,...
4415,1609,0.000000
4416,1610,0.000000
4417,1611,0.000000
4418,1612,0.000000


In [89]:
top_20 = c1[0].iloc[1:20]
top_20

1     0.533002
2     0.533002
3     0.533002
4     0.490290
5     0.474342
6     0.474342
7     0.471405
8     0.447214
9     0.433013
10    0.433013
11    0.433013
12    0.433013
13    0.433013
14    0.400892
15    0.400892
16    0.400892
17    0.400892
18    0.400892
19    0.400892
Name: 0, dtype: float64

In [90]:
top_20.mean()

0.44879009534352415