In [9]:
!pip install sentence-transformers

In [8]:
import pandas as pd
import re
import nltk
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer
from sentence_transformers import SentenceTransformer
import ast
import faiss
# import json

In [None]:
# NLTK resources used further down in this notebook:
#   punkt_tab -> word_tokenize
#   stopwords -> stopwords.words('english')
#   wordnet   -> WordNetLemmatizer (lemmatize() raises LookupError without it)
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Load into Memory & Merge Features and Descriptions

In [None]:
# Read the two sources: a CSV of games and a JSON Lines file (one record per line) of metadata
games_df = pd.read_csv('games.csv')
metadata_df = pd.read_json('games_metadata.json', lines=True)

# Inner join on app_id: only games present in both sources are kept
df = games_df.merge(metadata_df, how='inner', on='app_id')

# Rebind both source names to None so only the merged frame stays referenced
games_df = metadata_df = None

In [None]:
# df.to_csv('Altered CSVs/merged_game_data.csv', index=False)

# MultiLabelBinarizer

In [None]:
# df = pd.read_csv('Altered CSVs/merged_game_data.csv')

In [None]:
# Convert the 'tags' column from a string representation of a list to an actual list.
# Values that are not strings (e.g. already-parsed lists) are passed through unchanged.
df['tags'] = df['tags'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# One indicator (0/1) column per distinct tag
mlb = MultiLabelBinarizer()
tag_matrix = mlb.fit_transform(df['tags'])

# Build the indicator frame on df's own index: concat(axis=1) aligns on index labels,
# so a default RangeIndex here would misalign rows (and add NaN rows) whenever df's
# index is anything other than 0..n-1.
tags_df = pd.DataFrame(tag_matrix, columns=mlb.classes_, index=df.index)

df = pd.concat([df, tags_df], axis=1)

In [None]:
# df.to_csv('Altered CSVs/MutliLabelBinarized.csv', index=False)

# Preprocessing

## Cleaning

In [None]:
# df = pd.read_csv('Altered CSVs/MutliLabelBinarized.csv')

In [6]:
# Fill Empty Descriptions with "<title> <tags>".
# 'tags' holds Python lists once it has been parsed, and str + list raises TypeError,
# so each tag list is flattened to a space-separated string first. Values that are
# not lists/tuples (e.g. the string form read back from a CSV) are used unchanged.
tags_text = df['tags'].apply(
    lambda t: ' '.join(map(str, t)) if isinstance(t, (list, tuple)) else t
)
df['description'] = df['description'].fillna(df['title'] + ' ' + tags_text)

In [7]:
def clean_text(text):
    """Normalize a piece of text: lowercase it, strip URLs, then drop punctuation.

    Parameters
    ----------
    text : str, None or NaN
        Raw text. ``None`` and float NaN are treated as empty text; any other
        non-string value is converted with ``str()`` before cleaning.

    Returns
    -------
    str
        The lowercased text with URLs removed and every character that is not a
        word character (``\\w``) or whitespace deleted. Whitespace is left as-is,
        so a removed URL may leave consecutive spaces behind.
    """
    # Missing values become an empty description instead of crashing re.sub
    if not isinstance(text, str):
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    # Sets all characters to lowercase (also makes the URL match case-insensitive)
    text = text.lower()

    # Removes URLs beginning with http://, https://, or www.
    # This must run BEFORE punctuation is stripped: the '://' and '.' markers are what
    # distinguish a URL from an ordinary word that merely contains "http" or "www".
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # Remove Special Characters
    text = re.sub(r'[^\w\s]', '', text)

    return text

In [8]:
# Run every description through clean_text
df['description'] = df['description'].apply(clean_text)

# Show the rows whose description is empty once surrounding whitespace is stripped
empty_descriptions = df.loc[df['description'].str.strip().eq('')]
empty_descriptions

In [9]:
# Show the rows whose description is empty once surrounding whitespace is stripped
empty_descriptions = df.loc[df['description'].str.strip().eq('')]
empty_descriptions

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,...,Competitive,Faith,8-bit Music,Arcade,Relaxing,Lemmings,Emotional,1990s,Beat em up,Shoot Em Up
14865,1456880,ElecHead,2021-10-14,True,False,False,Overwhelmingly Positive,98,588,9.99,...,0,0,0,0,0,0,0,0,0,0
25173,342570,HIS (Heroes In the Sky),2015-03-03,True,False,False,Mostly Negative,38,539,0.0,...,0,0,0,1,0,0,0,0,0,0
44906,1837980,Our Elusive Suffering,2022-03-26,True,False,False,Mixed,68,70,0.0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Show the rows whose description is exactly the text 'null' (ignoring surrounding whitespace)
null_literal_descriptions = df.loc[df['description'].str.strip().eq('null')]
null_literal_descriptions

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,...,Competitive,Faith,8-bit Music,Arcade,Relaxing,Lemmings,Emotional,1990s,Beat em up,Shoot Em Up
29524,930840,东方百问~TouHouAsked,2018-10-07,True,False,False,Positive,93,29,0.99,...,0,1,0,0,0,0,0,0,0,0


Refill Empty, Whitespace Descriptions ~~and Null Literal Descriptions~~

In [11]:
# Boolean mask of the rows whose description is empty or whitespace-only
rows = df['description'].str.strip() == ''

# Rebuild those descriptions as "<title> <tags>".
# 'tags' holds Python lists once it has been parsed, and str + list raises TypeError,
# so each tag list is flattened to a space-separated string first. Values that are
# not lists/tuples (e.g. the string form read back from a CSV) are used unchanged.
tags_text = df.loc[rows, 'tags'].apply(
    lambda t: ' '.join(map(str, t)) if isinstance(t, (list, tuple)) else t
)
df.loc[rows, 'description'] = df.loc[rows, 'title'] + ' ' + tags_text

# Apply Cleaning to the refilled rows
df.loc[rows, 'description'] = df.loc[rows, 'description'].apply(clean_text)

# Recompute the rows with descriptions with only whitespace characters
empty_descriptions = df[df['description'].str.strip() == '']


In [12]:
# Run clean_text over the descriptions of the rows selected by the `rows` mask
df.loc[rows, 'description'] = df['description'].loc[rows].apply(clean_text)

In [13]:
# Re-check: rows whose description is still empty once surrounding whitespace is stripped
empty_descriptions = df.loc[df['description'].str.strip().eq('')]
empty_descriptions

Unnamed: 0,app_id,title,date_release,win,mac,linux,rating,positive_ratio,user_reviews,price_final,...,Competitive,Faith,8-bit Music,Arcade,Relaxing,Lemmings,Emotional,1990s,Beat em up,Shoot Em Up


In [14]:
# Display the description column for the rows selected by the `rows` mask
df['description'].loc[rows]

14865    elechead puzzle platformer puzzle platformer l...
25173    his heroes in the sky free to play multiplayer...
44906    our elusive suffering horror surreal explorati...
Name: description, dtype: object

In [None]:
# Checking Random Description Columns
# option_context lifts the column-width limit only inside the block and restores the
# previous setting on exit, even if the print raises (set_option/reset_option would
# leave the option changed on error and would reset to the default, not the prior value).
with pd.option_context('display.max_colwidth', None):
    print(df['description'].sample(n=3))

In [None]:
# df.to_csv('Altered CSVs/after_cleaning.csv', index=False)

## Tokenization & Lemmatization

In [None]:
# df = pd.read_csv('Altered CSVs/after_cleaning.csv')

In [18]:
# Step 1: split each description into word tokens
df['tokens'] = df['description'].apply(word_tokenize)

# Step 2: drop English stop words (a set gives constant-time membership checks)
stop_words = set(stopwords.words('english'))
df['tokens'] = df['tokens'].apply(
    lambda toks: list(filter(lambda tok: tok not in stop_words, toks))
)

# Step 3: replace every remaining token with its WordNet lemma
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(
    lambda toks: list(map(lemmatizer.lemmatize, toks))
)

In [21]:
# Drop English stop words from each token list (a set gives constant-time membership checks)
stop_words = set(stopwords.words('english'))

df['tokens'] = df['tokens'].apply(
    lambda toks: list(filter(lambda tok: tok not in stop_words, toks))
)

In [23]:
# Replace every token with its WordNet lemma
lemmatizer = WordNetLemmatizer()
df['tokens'] = df['tokens'].apply(
    lambda toks: list(map(lemmatizer.lemmatize, toks))
)

In [None]:
# df.to_csv('Altered CSVs/after_tokenization.csv', index=False)

# Sentence Transformer / Embeddings



In [None]:
# df = pd.read_csv('Altered CSVs/MutliLabelBinarized.csv')

In [None]:
model = SentenceTransformer('all-MiniLM-L6-v2')

# Missing titles/descriptions become empty strings so string concatenation never yields NaN
df['title'] = df['title'].fillna('')
df['description'] = df['description'].fillna('')

# 'tags' may hold Python lists (parsed) or plain strings (read back from CSV).
# str + list raises TypeError, and a missing tag value would turn the whole text into
# NaN, so every value is normalised to a string first: lists are space-joined,
# missing values become '', anything else goes through str().
tags_text = df['tags'].apply(
    lambda t: ' '.join(map(str, t)) if isinstance(t, (list, tuple))
    else ('' if pd.isna(t) else str(t))
)

# One text per game: "<title> <description> <tags>"
texts = (df['title'] + ' ' + df['description'] + ' ' + tags_text).tolist()

text_embeddings = model.encode(texts, show_progress_bar=True, convert_to_numpy=True)

# Store one embedding vector per row
df['embedding'] = list(text_embeddings)

In [None]:
# df.to_csv('Altered CSVs/with_embeddings.csv', index=False)

# Normalization

## Min-Max

In [None]:
# df = pd.read_csv('Altered CSVs/with_embeddings.csv')

**Applied for Ratings**

In [3]:
# Ordinal score for every rating label, from 9 (best) down to 1 (worst)
rating_dict = dict(zip(
    (
        'Overwhelmingly Positive',
        'Very Positive',
        'Positive',
        'Mostly Positive',
        'Mixed',
        'Mostly Negative',
        'Negative',
        'Very Negative',
        'Overwhelmingly Negative',
    ),
    range(9, 0, -1),
))

# Min-max scale the 1..9 score to [0, 1]: (score - 1) / 8
df['rating_normalized'] = df['rating'].map(rating_dict).sub(1).div(8)

In [5]:
# Min-max scale the mapped rating score (1..9 in rating_dict) to [0, 1]: (score - 1) / 8
df['rating_normalized'] = df['rating'].map(rating_dict).sub(1).div(8)

## Log Normalizing

**Applied for Positive Ratio, Price, and Review Count**

In [11]:
# Store log1p (i.e. log(1 + x)) of each raw numeric column under its *_log name
for source_col, log_col in (
    ('positive_ratio', 'positive_ratio_log'),
    ('price_final', 'price_log'),
    ('user_reviews', 'user_reviews_log'),
):
    df[log_col] = np.log1p(df[source_col])

In [None]:
# df.to_csv('Altered CSVs/normalized.csv', index=False)

# Concatenating Features

In [20]:
# df = pd.read_csv('Altered CSVs/normalized.csv')

In [29]:
# Log-scaled numeric columns, in the column order they take in X
numerics = df.loc[:, ['price_log', 'positive_ratio_log', 'user_reviews_log']]

# Stack the feature blocks side by side; only the numeric block is included here
X = np.hstack((numerics,))

In [None]:
# Stack the feature blocks side by side; only the numeric block is included here
X = np.hstack((numerics,))

# Cosine Similarity

In [None]:
# df = pd.read_csv('Altered CSVs/after_tokenization.csv')
# Glue Tokens
# Only token lists/tuples are joined. A value that is already a string (this cell was
# re-run, or the column was read back from a CSV) is left alone: ' '.join on a string
# would insert a space between every single character.
df['tokens'] = df['tokens'].apply(
    lambda tokens: ' '.join(tokens) if isinstance(tokens, (list, tuple)) else tokens
)
def get_similar_games(app_id, top_n):
    """Return the ``top_n`` games most similar to ``app_id`` by TF-IDF cosine similarity.

    Relies on three module-level names: ``df`` (with 'app_id', 'title' and 'tokens'
    columns), ``tfidf_vectorizer`` and ``tfidf_matrix``.
    NOTE(review): ``tfidf_vectorizer`` / ``tfidf_matrix`` are not defined in the part of
    the notebook visible here; presumably they are fitted on ``df['tokens']`` with one
    matrix row per ``df`` row, in the same order. Confirm before use.

    Parameters
    ----------
    app_id
        Value to look up in ``df['app_id']``.
    top_n : int
        Maximum number of similar games to return (values below 0 are treated as 0).

    Returns
    -------
    pandas.DataFrame
        Columns 'app_id', 'title', 'tokens' and 'similarity', ordered from most to
        least similar. Rows belonging to the queried ``app_id`` are never included.

    Raises
    ------
    IndexError
        If ``app_id`` does not occur in ``df['app_id']``.
    """
    # Positional row numbers of the queried game (positions, not index labels, because
    # the similarity vector is addressed by position)
    query_positions = np.flatnonzero((df['app_id'] == app_id).to_numpy())
    if query_positions.size == 0:
        raise IndexError(f"app_id {app_id!r} not found in df['app_id']")

    # Get Description tokens for the given app_id and vectorize them
    query = df['tokens'].iloc[query_positions[0]]
    query_vector = tfidf_vectorizer.transform([query])

    # Compute cosine similarity against every row of the TF-IDF matrix
    cosine_similarities = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Rank all rows from most to least similar, then drop the queried game explicitly.
    # Slicing off "the last element" of an ascending sort is not reliable: when another
    # game ties with the query (e.g. identical tokens), the query is not guaranteed to
    # be the one that gets dropped.
    ranked = np.argsort(-cosine_similarities, kind='stable')
    top_indices = ranked[~np.isin(ranked, query_positions)][:max(top_n, 0)]

    # Retrieve game titles and app_ids for the top similar games; .copy() so that adding
    # a column does not write into a slice of df
    similar_games = df.iloc[top_indices][['app_id', 'title', 'tokens']].copy()
    similar_games['similarity'] = cosine_similarities[top_indices]

    # Return the DataFrame with similar games (already in descending similarity order)
    return similar_games