In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
import numpy as np

In [2]:
df_games = pd.read_parquet('src/cleaned/games.parquet')
df_genres = pd.read_parquet('src/cleaned/genres.parquet')
df_items = pd.read_parquet('src/cleaned/items.parquet')
df_reviews = pd.read_parquet('src/cleaned/reviews.parquet')
df_users = pd.read_parquet('src/cleaned/users.parquet')
df_user_distance = pd.read_parquet('src/cleaned/user_distance_matrix.parquet')

In [4]:
len(df_items.loc[df_items['user_id'] == 'Bluegills', 'item_id'].values.tolist())

1280

In [122]:
df_good_games_by_user = df_reviews.loc[df_reviews['sentiment_analysis'] > 0].groupby('user_id').agg({'item_id': 'unique'}).reset_index()
df_bad_games_by_user = df_reviews.loc[df_reviews['sentiment_analysis'] == 0].groupby('user_id').agg({'item_id': 'unique'}).reset_index()

In [123]:
df_good_games_by_user.shape
df_bad_games_by_user.shape

(3063, 2)

In [124]:
good_dummies = pd.get_dummies(df_good_games_by_user['item_id'].explode(), prefix='good').groupby(level=0).sum()
bad_dummies = pd.get_dummies(df_bad_games_by_user['item_id'].explode(), prefix='bad').groupby(level=0).sum()

In [125]:
df_games_by_user = pd.merge(df_good_games_by_user, good_dummies, left_index=True, right_index=True)

In [126]:
df_games_by_user.shape

(22376, 2900)

In [127]:
df_games_by_user = pd.merge(df_games_by_user, bad_dummies, left_index=True, right_index=True, how='left').drop(columns=['item_id'])

In [131]:
df_games_by_user.fillna(0, inplace=True)

In [133]:
distances = pairwise_distances(df_games_by_user.drop(columns=['user_id']), metric='cosine')

In [134]:
df_distances = pd.DataFrame(distances, index=df_games_by_user['user_id'], columns=df_games_by_user['user_id'])

In [135]:
df_distances.columns

Index(['--000--', '--ace--', '--ionex--', '-2SV-vuLB-Kg', '-Azsael-',
       '-Beave-', '-I_AM_EPIC-', '-Kenny', '-Mad-', '-PRoSlayeR-',
       ...
       'zuilde', 'zukuta', 'zunbae', 'zuzuga2003', 'zv_odd', 'zvanik',
       'zwanzigdrei', 'zy0705', 'zynxgameth', 'zyr0n1c'],
      dtype='string', name='user_id', length=22376)

In [172]:
df_distances['Bluegills'].sort_values().head(10)

user_id
Bluegills            2.220446e-16
Marixtao             6.464466e-01
h00py                6.464466e-01
Leonzell             6.464466e-01
coolman1342          6.464466e-01
76561198062095122    6.464466e-01
76561198072365224    6.464466e-01
76561198069683661    6.464466e-01
narkly               6.464466e-01
Mineturdle321        6.464466e-01
Name: Bluegills, dtype: float64

In [160]:
df_test  = pd.merge(df_reviews, df_games, left_on='item_id', right_on='id', how='left')

In [179]:
df_test.loc[df_reviews['user_id'] == 'Bluegills', ['item_id', 'title', 'sentiment_analysis']]

Unnamed: 0,item_id,title,sentiment_analysis
893,293780,crawl,2
894,20530,red faction,1
895,208090,loadout,2
896,57300,amnesia the dark descent,1
897,39150,final fantasy viii,2
898,70,halflife,2
899,13210,unreal tournament 3 black,1
900,22120,penumbra black plague gold edition,1
