In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
import pandas as pd
import numpy as np

### Prepare data
choose which columns to use

In [None]:
# prepare data (can add or remove some columns)
cleaned_data_dir = "../../data"
games = pd.read_csv(f'{cleaned_data_dir}/games_cleaned.csv')
mechanics = pd.read_csv(f'{cleaned_data_dir}/mechanics_cleaned.csv')
subcategories = pd.read_csv(f'{cleaned_data_dir}/subcategories_cleaned.csv')
themes = pd.read_csv(f'{cleaned_data_dir}/themes_cleaned.csv')
# print(games.dtypes)

df_conc = pd.merge(games, mechanics, on='BGGId')
df_conc = pd.merge(df_conc, subcategories, on='BGGId')
df_conc = pd.merge(df_conc, themes, on='BGGId')

df_conc.sort_values(by='BGGId')
df = df_conc.drop(columns=['BGGId', 'Description', 'Name', 'GoodPlayers', 'ImagePath', 'NumUserRatings', 'NumComments', 'NumAlternates', 'NumExpansions', 'NumImplementations', 'IsReimplementation', 'NumImplementations', 'Family'])

df = df[df.columns.drop(list(df.filter(regex='Rank')))]



### Prepare tf-idf matrix

matrix does not need to be computed every time, can be saved, because we have static dataset (no new games will appear)

In [None]:
def compute_tfidf_matrix(df):
    """
    in 'df' should be dataframe with chosen columns, returns matrix which will be used for computing cosine similarity
    """
    # define a dictionary that maps each data type to a preprocessing function
    preprocess_fn = {
        'float64': lambda x: MinMaxScaler().fit_transform(x.fillna(0).values.reshape(-1,1)),
        'int64': lambda x:   MinMaxScaler().fit_transform(x.fillna(0).values.reshape(-1,1)),  # dataset has binary flags saved as ints, so i would not be able to distinguish them from values like number of players
        'object': lambda x:  TfidfVectorizer().fit_transform(x.fillna(''))
    }

    tfidf_matrix_list = []
    # preprocess data using the corresponding preprocessing function
    for name, col in df.select_dtypes(include=['float', 'int', 'object']).items():
        data = preprocess_fn[str(col.dtypes)](col)
        if data.ndim != 2:
            raise ValueError("Data is not a 2D array")
        if type(data) != np.ndarray:
            print(type(data))
        tfidf_matrix_list.append(pd.Series(data.flatten()))

    # combine the dataframes using pd.concat()
    # temp = pd.concat(tfidf_matrix_list, axis=1)
    temp = np.stack(tfidf_matrix_list, axis=-1)
    print(temp.shape)
    tfidf_matrix = pd.DataFrame(temp)

    return tfidf_matrix

In [None]:

tfidf_matrix = compute_tfidf_matrix(df)

In [None]:
# i was not able to find out how it shuffles with indexes when used like this
"""
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

game_ids = [68, 99, 51, 24]
ratings = [10,10,10,10]

user_rated_games_indexes = df_conc[df_conc['BGGId'].isin(game_ids)].index

user_rated = tfidf_matrix.loc[user_rated_games_indexes]
rest_of_matrix = tfidf_matrix.drop(user_rated_games_indexes)

print(user_rated.shape)
print(rest_of_matrix.shape)

# compute the cosine similarity between rated items and all the other items

similarity_scores = cosine_similarity(user_rated, tfidf_matrix)

similarity_scores *= 0.1 * np.array(ratings).reshape(len(ratings), -1)  # weight scores based on user rating

print(similarity_scores.shape)

for j, id in enumerate(game_ids):
    similarity_scores_r = similarity_scores[j]  # extract the 1D array of similarity scores

    # sort the similarity scores in descending order and get the indices of the top k most similar items
    k = 5  # for example, get the top 5 most similar items

    most_similar_indices = np.argsort(-similarity_scores_r)[:k]

    # print the indices and similarity scores of the top k most similar items
    print(f"Top {k} most similar items to item {id}:")
    for i in most_similar_indices:
        print(f"Item {i+1}: similarity score = {similarity_scores_r[i]}")
"""

### Cosine Similarity
gets similarity score ..the way it is done the similarity matrix can be saved and not computed every time (after the used parameters are decided, it can be done)

In [83]:
similarity_scores = cosine_similarity(tfidf_matrix)

game_ids = [1, 99, 44, 24]
ratings = [10,10,10,10]

for idx, id in enumerate(game_ids):
  user_rated_games_indexes = df_conc[df_conc['BGGId'].isin([id])].index

  user_rated = tfidf_matrix.loc[user_rated_games_indexes]

  print(similarity_scores.shape)

  # -1 because BGGId start at 1, not at zero
  similarity_scores_r = similarity_scores[id-1] * 0.1 * ratings[idx] # extract the 1D array of similarity scores and weight it based on user's rating

  # sort the similarity scores in descending order and get the indices of the top k most similar items
  k = 5  # for example, get the top 5 most similar items

  most_similar_indices = np.argsort(-similarity_scores_r)[:k]

  # print the indices and similarity scores of the top k most similar items
  print(f"Top {k} most similar items to item {id}:")
  for i in most_similar_indices:
      print(f"Item {i+1}: similarity score = {similarity_scores_r[i]}")

# instead of printing them just store them as tuples bggid:score (in case of duplicity take highest)

(21919, 21919)
Top 5 most similar items to item 1:
Item 1: similarity score = 1.0
Item 407: similarity score = 0.7625212549886751
Item 696: similarity score = 0.737467801281255
Item 14909: similarity score = 0.7357843977615296
Item 856: similarity score = 0.732005111299106
(21919, 21919)
Top 5 most similar items to item 99:
Item 99: similarity score = 0.9999999999999998
Item 13870: similarity score = 0.928961450265995
Item 4734: similarity score = 0.8375013650414386
Item 2219: similarity score = 0.8371823934687246
Item 1400: similarity score = 0.8354986966341327
(21919, 21919)
Top 5 most similar items to item 44:
Item 44: similarity score = 1.0
Item 5980: similarity score = 0.7028944746635464
Item 7292: similarity score = 0.6967560093246704
Item 16756: similarity score = 0.687298798594787
Item 11747: similarity score = 0.6847875423149876
(21919, 21919)
Top 5 most similar items to item 24:
Item 24: similarity score = 0.9999999999999998
Item 7465: similarity score = 0.9386661273770147
It