In [1]:
import pandas as pd
import numpy as np
from dataset.amazon.loader import AmazonDatasetLoader
from dataset.yelp.loader import YelpDatasetLoader
from models.nlp.KeyBERT import KeyBERTExtractor
from models.nlp.yake import YakeExtractor

amazon_dataloader = AmazonDatasetLoader()
yelp_dataloader = YelpDatasetLoader()

In [2]:
amazon_dataloader.filenames
idf = KeyBERTExtractor().extract_keywords_of_items(amazon_dataloader.get_pandas_df())

['/Users/mert.tunc/Code/ceng/thesis/dataset/amazon/Digital_Music_5.json']
     overall  vote  verified   reviewTime      reviewerID        asin  \
0          5   3.0      True   06 3, 2013  A2TYZ821XXK2YZ  3426958910   
1          5   NaN      True  10 11, 2014  A3OFSREZADFUDY  3426958910   
2          5   NaN      True  02 11, 2014  A2VAMODP8M77NG  3426958910   
3          4   3.0     False   12 7, 2013   AAKSLZ9IDTEH0  3426958910   
4          5   NaN      True  06 12, 2016  A3OH43OZJLKI09  5557706259   
..       ...   ...       ...          ...             ...         ...   
995        5   NaN      True  03 22, 2015  A33H2FCAJE6W8K  B000SX6NYQ   
996        5   NaN      True   01 1, 2015  A24NL78E7KQLZY  B000SX6NYQ   
997        5   NaN      True  12 23, 2014  A1ND57LZP5C7M4  B000SX6NYQ   
998        5   NaN      True  11 23, 2013   A3BXDU5PZ6WHA  B000SX6NYQ   
999        1   NaN      True  10 23, 2013  A2F4Z24LK8WJLQ  B000SX6NYQ   

                         style       reviewerName

In [3]:
udf = KeyBERTExtractor().extract_keywords_of_users(amazon_dataloader.get_pandas_df())

In [4]:
from gensim.models import Word2Vec
sentences = pd.concat([idf[['review']], udf[['review']]]),
sentences = sentences[0]['review'].tolist()
sentences



["this is awesome to listen to, a must-have for all slayer fans..sadly needed to be a triple disc set..they have so many hits!! bien it was great to hear the old stuff again and i like the new stuff too. i recommend it to any slayer fan. well best of's are a bit poison normally but this is not bad it's pretty good because i'd have put 90% hell await,reign in blood,south of,seasons ,divine and a couple musica's tracks and everything on god hates -at that point best of mean every cd mainly so this is not so bad  it dose put some great tracks that live shows don't play much out there like,213, skeletons of society,sex murder art and gemini and some rare track too,final six is just a bonus track on christ illusion but it's here with the mystery cover songs from unditstputed attitude cd(why these would be on a greatest hits collection i don't know) but the also put a couple of live tracks on here too.all in all it could be much worse but it's great for the car.",
 'what can i say? this is c

In [5]:
import re
len(sentences)
sentences = [ re.sub(' +', ' ', x).strip(' ').split(' ') for x in sentences ]

In [6]:
model = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)


In [7]:
model.train(sentences, total_examples=len(sentences), epochs=50)

(4296504, 5743300)

In [8]:
model.wv.most_similar(positive=["hits"])

[('collection', 0.5529413223266602),
 ("michael's", 0.5317878723144531),
 ('hits,', 0.5297430157661438),
 ('motown...like', 0.5284255146980286),
 ('sizable', 0.5152820944786072),
 ('career', 0.5088241100311279),
 ('entertainer', 0.5038806796073914),
 ('hits(his', 0.5004626512527466),
 ('recoridngs,', 0.49497881531715393),
 ('tracks', 0.4883747398853302)]

In [9]:
item_property_map = [ (x[0], x[2]) for x in list(idf.itertuples(index=False, name=None))]
item_property_map = { x[0]: [y[0] for y in x[1]] for x in item_property_map }
item_property_map

{'3426958910': ['slayer',
  'hits',
  'triple',
  'murder',
  'live',
  'fans',
  'blood',
  'poison',
  'sex',
  'skeletons'],
 '5557706259': ['gospel',
  'christians',
  'christian',
  'bible',
  'prayer',
  'ballad',
  'meditation',
  'favorite',
  'worship',
  'singer'],
 '5559166928': ['elvis',
  'gospel',
  'nashville',
  'favorites',
  'sing',
  'love',
  'sings',
  'songs',
  'album',
  'jesus'],
 '7799420340': ['bluesman',
  'motown',
  'brothers',
  'beatles',
  'songwriter',
  'timberlake',
  'decades',
  'uncle',
  'toughest',
  'brother'],
 '9714721180': ['zombieland',
  'metallica',
  'metalheads',
  'lovecraft',
  'nirvana',
  'metalhead',
  'metallicas',
  'megaforce',
  'saxon',
  'tuesday'],
 'B000002TTH': ['metallica',
  'sluggishness',
  'harrowing',
  'sinister',
  'songcraft',
  'overproduced',
  'acoustic',
  'mediocrity',
  'balladry',
  'boredom'],
 'B000006045': ['reggae',
  'geniuses',
  'favorite',
  'sexy',
  'soulful',
  'seductive',
  'angelic',
  'reznor

In [10]:
user_property_map = [ (x[0], x[2]) for x in list(udf.itertuples(index=False, name=None))]
user_property_map = { x[0]: [y[0] for y in x[1]] for x in user_property_map }
user_property_map

{'A1049BXR3MAWT4': ['christmas',
  'memories',
  'old',
  'childhood',
  'artists',
  'missing',
  'recording',
  'lucky',
  'carrols',
  'original'],
 'A108SSM7YMMTOM': ['inspiring',
  'smile',
  'entertainment',
  'master',
  'ashamed',
  'delightful',
  'overcame',
  'necessary',
  'meager',
  'remember'],
 'A10DNCYK7YISHU': ['eminem',
  'rap',
  'hiphop',
  'rapper',
  'rawkus',
  'lyricist',
  'punchlines',
  'ripping',
  'downfall',
  'unfortunately'],
 'A10HGE7LJ8EQA8': ['song',
  'unmatched',
  'timeless',
  'cocker',
  'different',
  'voice',
  'joe',
  'lovely'],
 'A10M52OJSDB8WK': ['song',
  'remembered',
  'artists',
  'originally',
  '60',
  'great',
  'exactly',
  'version',
  'looking'],
 'A10X28FN19BYTP': ['trivia',
  'science',
  'classic',
  'yelling',
  'song',
  'recording',
  'night',
  'excellent',
  'unison',
  'entire'],
 'A11F8B8GQURI84': ['betcha',
  'girls',
  'songs',
  'song',
  'memories',
  'owner',
  '1960',
  '1970',
  'mellow',
  'music'],
 'A11MI5QN5F

In [11]:
from queue import PriorityQueue
lim = 5

for user, interests in user_property_map.items():
    reco = PriorityQueue()
    for item, features in item_property_map.items():
        d = 0
        for interest in interests:
            try:
                d += np.sum(model.wv.distances(interest, features))/len(features)
            except: ## TODO xd
                d += 100000
        reco.put((d, item))
    print(f'For user {user} with generated interests: {user_property_map[f"{user}"]}')
    for i in range(5):
        r = reco.get()
        print(f'Recommended item {i}: {r}: {item_property_map[f"{r[1]}"]}')
    print('---------------------------------------------------')
    lim -= 1
    if lim < 0:
        break

For user A1049BXR3MAWT4 with generated interests: ['christmas', 'memories', 'old', 'childhood', 'artists', 'missing', 'recording', 'lucky', 'carrols', 'original']
Recommended item 0: (8.332682991027832, 'B000BD6NHU'): ['dad', '40s', 'favorites', 'cds', '50s', 'songs', 'old', '60s', 'cd', 'radio']
Recommended item 1: (8.590852880477904, 'B000S75FHI'): ['youtube', 'mtv', 'uplifting', 'love', 'sing', 'song', 'corporations', 'performers', 'memories', 'teenage']
Recommended item 2: (8.602792692184448, 'B000QO5DAC'): ['blues', 'midnight', 'favorites', 'old', 'friends', 'songs', 'band', 'oldie', 'sixties', 'detroit']
Recommended item 3: (8.636463928222657, 'B000QMDXDS'): ['betcha', 'love', 'songs', 'song', 'girls', 'mp3', 'memories', 'mellow', 'liked', 'teen']
Recommended item 4: (8.644681930541992, 'B000BD8WUQ'): ['dad', 'sandman', '40s', 'cds', '50s', 'aces', 'tonight', '60s', 'cd', 'songs']
---------------------------------------------------
For user A108SSM7YMMTOM with generated interests