In [1]:
# ! pip install nltk scikit-learn langdetect transformers wordcloud bs4
!pip list

Package                           Version
--------------------------------- --------------------
absl-py                           2.1.0
accelerate                        0.30.1
aiofiles                          22.1.0
aiohttp                           3.9.5
aiosignal                         1.3.1
aiosqlite                         0.20.0
altair                            5.3.0
aniso8601                         9.0.1
annotated-types                   0.7.0
anyio                             4.4.0
argcomplete                       3.3.0
argon2-cffi                       23.1.0
argon2-cffi-bindings              21.2.0
arrow                             1.3.0
astroid                           3.2.2
asttokens                         2.4.1
async-timeout                     4.0.3
attrs                             23.1.0
autopep8                          1.5.5
Babel                             2.15.0
beautifulsoup4                    4.12.3
bleach                          

In [15]:
import os
import glob

import requests
import numpy as np
import pandas as pd
import nltk
import json
import matplotlib.pyplot as plt
# import gensim
import matplotlib.pyplot as plt
import time

from tqdm import tqdm
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler
from transformers import BertTokenizer, BertModel
# from gensim import corpora
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from langchain_openai import OpenAIEmbeddings

In [44]:
def fetch_platforms_and_genres(url):
    """Scrape the platform and genre lists from a game details page.

    Args:
        url: Address of the details page to download.

    Returns:
        A ``(platforms, genres)`` tuple of strings. ``platforms`` contains the
        platform names as space-separated slugs (lower-cased, spaces replaced
        by ``-``, and ``/``, ``(``, ``)`` removed). ``genres`` contains the
        genre names joined by commas. Both are empty strings when the server
        does not answer with HTTP 200, so callers can always unpack two values.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36'}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f'Failed to fetch page: {response.status_code}')
        # Same shape as the success path: returning a bare '' here made
        # `platforms, genres = fetch_platforms_and_genres(...)` raise.
        return '', ''

    soup = BeautifulSoup(response.text, 'html.parser')
    platform_items = soup.select('div.c-gameDetails_Platforms li.c-gameDetails_listItem')
    genre_items = soup.select('li.c-genreList_item')

    # Turn every platform label into a slug, e.g. "PlayStation 4" -> "playstation-4".
    platform_slugs = [
        item.get_text(strip=True).lower().replace(' ', '-').replace('/', '').replace('(', '').replace(')', '')
        for item in platform_items
    ]
    genre_names = [item.get_text(strip=True) for item in genre_items]
    return " ".join(platform_slugs), ",".join(genre_names)

In [2]:
def fetch_reviews(url):
    """Download one page of reviews and keep the English quotes.

    Args:
        url: Address of a JSON endpoint whose payload stores the reviews under
            ``data -> items``, each item carrying a ``quote`` field.

    Returns:
        The lower-cased quotes that ``is_english`` accepts, in payload order.
        An empty list is returned when the response status is not 200.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.6533.73 Safari/537.36'}
    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        print(f'Failed to fetch page: {response.status_code}')
        return []

    payload = response.json()
    english_quotes = []
    for entry in payload['data']['items']:
        if is_english(entry['quote']):
            english_quotes.append(entry['quote'].lower())

    # Report how many of the returned items survived the language filter.
    print(f'Fetched {len(english_quotes)}/{len(payload["data"]["items"])} reviews')
    return english_quotes

In [3]:
def preprocess_text(text, join_output=True):
    """Tokenize a text, drop stop words and non-alphabetic tokens, and lemmatize.

    Args:
        text: The raw text to process.
        join_output: When True (default) return the tokens joined by single
            spaces; when False return them as a list.

    Returns:
        A space-joined string or a list of lemmatized tokens, depending on
        ``join_output``. Token case is left untouched.
    """
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    # The stop-word list is lower-case, so compare on the lower-cased token:
    # otherwise capitalised stop words ("The", "And", ...) slip through.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word.isalpha() and word.lower() not in stop_words
    ]

    if join_output:
        return ' '.join(tokens)
    return tokens

In [4]:
def generate_wordcloud(text):
    """Render a word cloud for ``text`` and show it in a 10x10 inch figure.

    Args:
        text: The text the 800x400 word cloud is generated from.
    """
    cloud = WordCloud(width=800, height=400).generate(text)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(cloud, interpolation='bilinear')
    # The axes carry no information for an image, so hide them.
    ax.axis('off')
    plt.show()

In [5]:
def extract_relevant_words(processed_reviews, num_clusters=2, top_n=50):
    """Collect the highest-weighted TF-IDF terms of each K-means cluster.

    Args:
        processed_reviews: Iterable of documents to vectorize with TF-IDF.
        num_clusters: Number of K-means clusters to fit.
        top_n: How many of the highest-weighted terms to take from each
            cluster centre (defaults to 50).

    Returns:
        A list of unique terms. Terms appear in order of first occurrence
        (cluster by cluster, highest weight first), so the result is the same
        on every run instead of depending on set iteration order.
    """
    # Create TF-IDF matrix
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(processed_reviews)

    # Perform K-means clustering
    kmeans = KMeans(n_clusters=num_clusters, random_state=42)
    kmeans.fit(tfidf_matrix)

    # Get feature names (words)
    feature_names = vectorizer.get_feature_names_out()

    # Extract top words from each cluster centre, heaviest term first.
    top_words = []
    for center in kmeans.cluster_centers_:
        # Reverse first, then slice, so that top_n == 0 yields nothing.
        top_indices = center.argsort()[::-1][:top_n]
        top_words.extend(feature_names[i] for i in top_indices)

    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(top_words))

In [6]:
def is_english(text):
    """Tell whether ``text`` is a string that language detection labels 'en'.

    Args:
        text: Candidate value; anything whose type is not exactly ``str`` is
            rejected without running detection.

    Returns:
        True when detection reports ``'en'``; False for non-strings, for any
        other detected language, and when detection raises
        ``LangDetectException``.
    """
    if type(text) is not str:
        return False
    try:
        language = detect(text)
    except LangDetectException:
        return False
    return language == 'en'

In [7]:
def get_sentence_embedding(text, model, tokenizer):
    """Embed ``text`` by averaging the model's last hidden state.

    Args:
        text: Input handed to ``tokenizer`` (truncated to 512 tokens, padded).
        model: Callable that accepts the tokenizer output as keyword arguments
            and returns an object exposing ``last_hidden_state``.
        tokenizer: Callable producing the model inputs as a mapping.

    Returns:
        A NumPy array: the hidden state averaged over dimension 1 and then
        over axis 0 (assumes the hidden state is laid out as
        (batch, tokens, hidden) — TODO confirm).
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, max_length=512, padding=True)
    hidden_state = model(**encoded).last_hidden_state
    # First average inside each sequence, then across the batch.
    per_sequence = hidden_state.mean(dim=1).detach().numpy()
    return per_sequence.mean(axis=0)

In [8]:
def get_game_embedding(review, model, tokenizer):
    """Average the sentence embeddings of every item in ``review``.

    Args:
        review: Iterable of sentences; each one is embedded with
            ``get_sentence_embedding``.
        model: Forwarded unchanged to ``get_sentence_embedding``.
        tokenizer: Forwarded unchanged to ``get_sentence_embedding``.

    Returns:
        The element-wise mean (axis 0) of the stacked sentence embeddings.
    """
    sentence_vectors = []
    for sentence in review:
        sentence_vectors.append(get_sentence_embedding(sentence, model, tokenizer))
    return np.mean(np.array(sentence_vectors), axis=0)

In [9]:
def find_similar_games(game_embedding, all_embeddings):
    """Rank the rows of ``all_embeddings`` by cosine similarity to one vector.

    Args:
        game_embedding: The query embedding (a single vector).
        all_embeddings: Collection of embeddings to compare against.

    Returns:
        Indices into ``all_embeddings`` ordered from most to least similar.
    """
    # cosine_similarity expects 2-D input, so wrap the query in a list and
    # take the single resulting row.
    scores = cosine_similarity([game_embedding], all_embeddings)[0]
    ascending = np.argsort(scores)
    return ascending[::-1]

In [32]:
def read_bio(path):
    """Load the biographical CSV and prepare it for analysis.

    The ``Gender``, ``Play Frequency`` and ``Gamer`` columns are replaced by
    integer codes from ``pd.factorize``, and the ``Education``, ``Country``,
    ``Left-Handed``, ``Platform`` and ``Favourite Game`` columns are dropped.

    Args:
        path: Location of the CSV file.

    Returns:
        The resulting DataFrame.
    """
    bio = pd.read_csv(path)

    # Encode the categorical columns as integer codes, one column at a time.
    for column in ('Gender', 'Play Frequency', 'Gamer'):
        codes, _uniques = pd.factorize(bio[column])
        bio[column] = codes

    unused_columns = ['Education', 'Country', 'Left-Handed', 'Platform', 'Favourite Game']
    return bio.drop(columns=unused_columns)

In [13]:
# Download the NLTK resources that preprocess_text relies on: tokenizer data
# for word_tokenize, the stop-word corpus, and WordNet for the lemmatizer.
# The last call is the cell's final expression, so its return value is shown.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/supermoon/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/supermoon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/supermoon/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Look up each game's genre on Metacritic and attach it to the biographical data.
# (The review-scraping part of this cell is kept below, commented out.)
bio = read_bio('data/biographical_data_tmp.csv')
games = bio['Metacritic Code'].unique()
# games = ['american-truck-simulator-starter-pack-california', 'guilty-gear-xx-accent-core-plus', 'guitar-hero', 'jak-3', 'kingdom-rush', 'pokemon-go', 'shenmue-ii', 'super-mario-advance-4-super-mario-bros-3', 'valkyrie-profile-2-silmeria']

limit = 200
sentiment = 'positive'
dir_path = 'data/positive_reviews'
os.makedirs(dir_path, exist_ok=True)
# Never commit credentials to the notebook: read the key from the environment.
# It is only needed by the commented-out review download below.
api_key = os.environ.get('METACRITIC_API_KEY', '')

pbar = tqdm(games)
for game in pbar:
    # '-' is skipped — presumably a placeholder for "no Metacritic page"; confirm.
    if game == '-':
        continue
    metacritic_url = f'https://www.metacritic.com/game/{game}/details'
    # A failed fetch can come back as an empty value; fall back to two empty
    # strings so the unpacking below cannot raise.
    page_details = fetch_platforms_and_genres(metacritic_url)
    platforms, genres = page_details if page_details else ('', '')
    platforms = platforms.split(' ')
    genres = genres.split(',')
    offset = 0

    all_reviews = []
    # Add genre column to bio (first listed genre only)
    bio.loc[bio['Metacritic Code'] == game, 'Genre'] = genres[0]
    # if review_dict.get(game) is None:
    #     review_dict[game] = []
    # print(f'platforms for {game}: {platforms}, {genres}')
    # for platform in platforms:
    #     while True:
    #         review_api_url = f'https://internal-prod.apigee.fandom.net/v1/xapi/reviews/metacritic/user/games/{game}/platform/{platform}/web?apiKey={api_key}&offset={offset}&limit={limit}&filterBySentiment={sentiment}&sort=date&componentName=user-reviews&componentDisplayName=user%20Reviews&componentType=ReviewList'
    #         print(f'Fetching reviews for {game}-{platform} offset {offset}: {review_api_url}')
    #         reviews_text = fetch_reviews(review_api_url)
    #         if len(reviews_text) == 0:
    #             break
    #         all_reviews.extend(reviews_text)
    #         offset += limit
    #     offset = 0
    #     time.sleep(0.5)
    # # save reviews to file
    # with open(f'{dir_path}/{game}.json', 'w') as f:
    #     json.dump(all_reviews, f, indent=4)
bio

In [47]:
# Save the biographical table as CSV, leaving out the DataFrame index.
bio.to_csv('data/biographical_data_with_genre.csv', index=False)

In [16]:
from dotenv import load_dotenv
# Load variables from a local .env file into the environment
# (presumably the OpenAI credentials used by OpenAIEmbeddings — confirm).
load_dotenv()

dir_path = 'data/positive_reviews'
embeddings = OpenAIEmbeddings(model='text-embedding-3-large')
review_embeddings = {}
review_sentences = {}
# Sort so that games are processed in the same order on every run.
review_list = sorted(glob.glob(f'{dir_path}/*.json'))
for filename in tqdm(review_list):
    # Use a context manager so the file handle is closed after reading.
    with open(filename) as review_file:
        reviews = json.load(review_file)
    # Derive the game title from the file name only; splitting the whole path
    # on '.' breaks as soon as a directory or title contains a dot.
    title = os.path.splitext(os.path.basename(filename))[0]
    review_embeddings[title] = np.array(embeddings.embed_documents(reviews))
    review_sentences[title] = reviews

# Show a compact summary instead of dumping every embedding matrix.
{title: vectors.shape for title, vectors in review_embeddings.items()}

100%|██████████| 77/77 [00:00<00:00, 387.00it/s]


{}

In [6]:
# write embeddings to h5
import h5py
# Opening with 'w' truncates the file, so only write when there is something
# to store; otherwise an empty dict would wipe the cached embeddings.
if review_embeddings:
    with h5py.File('data/positive_review_embeddings.h5', 'w') as f:
        # One dataset per game, named after the game's title.
        for key, value in review_embeddings.items():
            f.create_dataset(key, data=value)
else:
    print('review_embeddings is empty; leaving data/positive_review_embeddings.h5 untouched')

In [17]:
# read embeddings from h5
import h5py
review_embeddings = {}
all_embeddings = []
with h5py.File('data/positive_review_embeddings.h5', 'r') as f:
    for key, dataset in f.items():
        # Read each dataset in one go as a NumPy array; list(dataset) would
        # fetch it from the file row by row.
        review_embeddings[key] = dataset[()]
# NOTE(review): review_sentences is not restored here; later cells that use it
# still depend on the embedding cell having been run in this kernel.

In [19]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D

# Flatten the per-game data into parallel, per-review lists:
# all_embeddings[k], all_reviews[k] and titles[k] describe the same review.
all_embeddings = []
all_reviews = []
titles = []
games = []
# Named game_vectors so the loop does not overwrite the `embeddings` client.
for game, game_vectors in review_embeddings.items():
    all_embeddings.extend(game_vectors)
    all_reviews.extend(review_sentences[game])
    titles.extend([game] * len(game_vectors))
    games.append(game)

all_embeddings = np.array(all_embeddings)
kmeans = KMeans(n_clusters=5, random_state=42)
kmeans.fit(all_embeddings)
labels = kmeans.labels_

# Seed t-SNE as well, so the 3-D layout is the same on every run.
tsne = TSNE(n_components=3, perplexity=20, max_iter=500, learning_rate=100, random_state=42)
reduced_embeddings = tsne.fit_transform(all_embeddings)

In [25]:
import mplcursors
%matplotlib notebook

fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot(111, projection='3d')

sc = ax.scatter(
    reduced_embeddings[:, 0], 
    reduced_embeddings[:, 1], 
    reduced_embeddings[:, 2], 
    c=labels,
    cmap='viridis'
)

cursor = mplcursors.cursor(sc, hover=True)
@cursor.connect("add")
def on_add(sel):
    index = sel.index
    sel.annotation.set_text(f'{games[index]}: {all_reviews[index]}')
    
# sns.scatterplot(x=reduced_embeddings[:, 0], y=reduced_embeddings[:, 1], hue=titles)
plt.title('PCA Visualization of Game Reviews')
plt.show()

<IPython.core.display.Javascript object>

In [28]:
# Group the review texts by cluster label, starting from an empty bucket for
# each of the five clusters so that clusters without reviews are listed too.
clustered_reviews = {}
for i in range(5):
    clustered_reviews[i] = []

for label, sentence in zip(labels, all_reviews):
    clustered_reviews[label].append(sentence)

# Print the first ten reviews of every cluster as a quick sanity check.
for cluster_id in clustered_reviews:
    sentences = clustered_reviews[cluster_id]
    print(f'Cluster {cluster_id}:')
    print(sentences[:10])

Cluster 0:
['i am absolutely addicted to this game! beautiful graphics and attention to detail. hope that they come out with more content in the form of new states real soon. i want to drive some of my favorites. a++++', "two words: absolutely perfect. this stunning marvel **** captivates you with its vibrant high quality graphics, the gameplay is completely and utterly user friendly. the breathtaking backgrounds within this game is truly one of life's wonders and i can die happy and fulfilled knowing that it will be impossible to find a game that is as beautiful and fun as this one.", 'amazing game great graphics i just love it so much it’s perfect but i think they could improve on some stuff', 'bought this for my girlfriend. without a doubt, the best decision of my life.', 'this game gave me peace and hope in a really dark time. i had so much fun and i will always cherish it.', 'i think lot of people dont like it, becouse its not an action game, but its one of the best game in the co