In [1]:
import pandas as pd
import numpy as np
from dataset.amazon.loader import AmazonDatasetLoader
from dataset.yelp.loader import YelpDatasetLoader
from models.nlp.KeyBERT import KeyBERTExtractor
from models.nlp.yake import YakeExtractor


# Instantiate one loader per dataset; both are created with default arguments.
# Only `amazon_dataloader` is used by the cells visible in this notebook.
amazon_dataloader = AmazonDatasetLoader()
yelp_dataloader = YelpDatasetLoader()

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mert.tunc/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# NOTE(review): Jupyter only displays the value of the *last* expression in a
# cell, so this bare attribute access shows nothing here.
amazon_dataloader.filenames
# Run KeyBERT keyword extraction over the Amazon reviews, grouped per item.
# The options dict is passed through to the extractor; presumably `top_n`
# caps the number of keywords kept per item -- TODO confirm in KeyBERTExtractor.
idf = KeyBERTExtractor().extract_keywords_of_items(amazon_dataloader.get_pandas_df(), {'top_n':10})

['/Users/mert.tunc/Code/ceng/thesis/dataset/amazon/Digital_Music_5.json']
     overall  vote  verified   reviewTime      reviewerID        asin  \
0          5   3.0      True   06 3, 2013  A2TYZ821XXK2YZ  3426958910   
1          5   NaN      True  10 11, 2014  A3OFSREZADFUDY  3426958910   
2          5   NaN      True  02 11, 2014  A2VAMODP8M77NG  3426958910   
3          4   3.0     False   12 7, 2013   AAKSLZ9IDTEH0  3426958910   
4          5   NaN      True  06 12, 2016  A3OH43OZJLKI09  5557706259   
..       ...   ...       ...          ...             ...         ...   
995        5   NaN      True  03 22, 2015  A33H2FCAJE6W8K  B000SX6NYQ   
996        5   NaN      True   01 1, 2015  A24NL78E7KQLZY  B000SX6NYQ   
997        5   NaN      True  12 23, 2014  A1ND57LZP5C7M4  B000SX6NYQ   
998        5   NaN      True  11 23, 2013   A3BXDU5PZ6WHA  B000SX6NYQ   
999        1   NaN      True  10 23, 2013  A2F4Z24LK8WJLQ  B000SX6NYQ   

                         style       reviewerName

In [3]:
# Same extraction as for items, but grouped per user. A fresh extractor is
# constructed and the dataframe is loaded again from the loader.
# Presumably `top_n` caps the number of keywords kept per user -- TODO confirm.
udf = KeyBERTExtractor().extract_keywords_of_users(amazon_dataloader.get_pandas_df(), {'top_n':10})

In [4]:
from gensim.models import Word2Vec
# Pool the `review` text of the item frame and the user frame into a single
# training corpus for Word2Vec.
#
# The previous version ended the concat line with a stray trailing comma,
# which wrapped the frame in a 1-tuple and forced a `sentences[0]` workaround
# on the next line. The frame now gets its own name, and `sentences` is only
# ever bound to the list of review strings.
review_frame = pd.concat([idf[['review']], udf[['review']]])
sentences = review_frame['review'].tolist()

# Preview the first few documents (last expression -> displayed by Jupyter).
sentences[:10]



['this be awesome to listen to a must have for all slayer fan sadly need to be a triple disc set they have so many hit bien it be great to hear the old stuff again and i like the new stuff too i recommend it to any slayer fan well best of s be a bite poison normally but this be not bad it s pretty good because i d have put 90 hell await reign in blood south of season divine and a couple musica s track and everything on god hat at that point best of mean every cd mainly so this be not so bad it dose put some great track that live show don t play much out there like 213 skeletons of society sex murder art and gemini and some rare track too final six be just a bonus track on christ illusion but it s here with the mystery cover songs from unditstputed attitude cd why these would be on a great hit collection i don t know but the also put a couple of live track on here too all in all it could be much bad but it s great for the car',
 'what can i say this be cast crown this be a good bless fi

In [9]:
# Tokenise every review into a list of words, as Word2Vec expects.
#
# * `str.split()` with no argument splits on any run of whitespace and never
#   yields empty strings. `split(' ')` produced '' tokens for repeated,
#   leading or trailing spaces, and with `min_count=1` those would become
#   vocabulary entries.
# * Entries that are already tokenised are passed through unchanged, so the
#   cell can be re-run without failing on `list.split`.
sentences = [
    review.split() if isinstance(review, str) else review
    for review in sentences
]

In [16]:
# Train word embeddings on the tokenised review corpus.
# NOTE(review): with more than one worker thread gensim does not guarantee
# bit-identical vectors between runs, so the similarity scores shown in the
# outputs below may differ slightly after a re-run -- confirm if exact
# reproducibility matters.
model = Word2Vec(
    sentences=sentences,
    vector_size=100,  # dimensionality of each word vector
    window=5,         # context window size
    min_count=1,      # keep every token, even ones that occur once
    workers=4,        # training threads
    epochs=50,        # passes over the corpus
)

In [17]:
# Sanity-check the embedding: list the ten nearest neighbours of "hit".
model.wv.most_similar(positive=["hit"], topn=10)

[('include', 0.45345818996429443),
 ('collection', 0.4487866461277008),
 ('career', 0.4301295578479767),
 ('artists', 0.408530056476593),
 ('disc', 0.3818663954734802),
 ('yet', 0.3800627887248993),
 ('songs', 0.3757317066192627),
 ('already', 0.3714917302131653),
 ('j5', 0.37027180194854736),
 ('compilation', 0.361930787563324)]

In [18]:
# Map each item id (column 0 of `idf`) to the first element of every entry in
# column 2. Presumably column 2 holds (keyword, score) pairs, so this keeps
# the keywords and drops the scores -- TODO confirm against the extractor.
item_property_map = {
    row[0]: [entry[0] for entry in row[2]]
    for row in idf.itertuples(index=False, name=None)
}

In [19]:
# Map each user id (column 0 of `udf`) to the first element of every entry in
# column 2. Presumably column 2 holds (keyword, score) pairs, so this keeps
# the keywords and drops the scores -- TODO confirm against the extractor.
user_property_map = {
    row[0]: [entry[0] for entry in row[2]]
    for row in udf.itertuples(index=False, name=None)
}

In [20]:
from queue import PriorityQueue
import heapq
from itertools import islice

lim = 5               # number of users to print recommendations for
top_k = 5             # recommendations printed per user
oov_penalty = 100000  # score added when a keyword is missing from the vocabulary

# For each of the first `lim` users, score every item and print the `top_k`
# items with the lowest score. An item's score is the sum, over the user's
# interest keywords, of the mean Word2Vec distance between that keyword and
# the item's keywords (lower = closer).
#
# Changes from the previous version:
# * `islice` stops after exactly `lim` users; the old `lim -= 1 / if lim < 0`
#   countdown processed `lim + 1` users.
# * `heapq.nsmallest` replaces PriorityQueue + a fixed `range(5)` of `.get()`
#   calls, which would block forever when fewer than 5 items were scored.
# * Only KeyError (gensim's out-of-vocabulary error) is caught instead of a
#   bare `except:`, so real bugs and Ctrl-C are no longer swallowed.
# * The maps are indexed with the original keys rather than f-string copies,
#   which only worked when every key was already a `str`.
for user, interests in islice(user_property_map.items(), lim):
    scored_items = []
    for item, features in item_property_map.items():
        if not features:
            # An empty keyword list would make `distances` compare against the
            # whole vocabulary and the mean below divide by zero; such an item
            # cannot be scored meaningfully, so skip it.
            continue
        d = 0
        for interest in interests:
            try:
                d += np.sum(model.wv.distances(interest, features)) / len(features)
            except KeyError:
                # Either the interest or one of the item keywords is not in
                # the Word2Vec vocabulary: push the item down the ranking.
                d += oov_penalty
                print("wtf")
        scored_items.append((d, item))
    print(f'For user {user} with generated interests: {interests}')
    for i, r in enumerate(heapq.nsmallest(top_k, scored_items)):
        print(f'Recommended item {i}: {r}: {item_property_map[r[1]]}')
    print('---------------------------------------------------')

For user A1049BXR3MAWT4 with generated interests: ['christmas', 'memories', 'old', 'childhood', 'lucky']
Recommended item 0: (3.8459523677825924, 'B000QOYJWU'): ['kindle', 'download', 'favorites', 'songs', 'old']
Recommended item 1: (3.931315326690674, 'B000QNT5TS'): ['husband', 'love', 'friends', 'marry', 'song']
Recommended item 2: (3.9327531337738035, 'B000QMHRP8'): ['love', 'song', 'thank', 'memories', 'great']
Recommended item 3: (3.9654109954833983, 'B000BD6NHU'): ['dad', '40s', 'favorites', '50s', 'old']
Recommended item 4: (3.9799186706542966, 'B000S3KQOE'): ['mp3', 'favorite', 'boyfriend', 'amazon', 'download']
---------------------------------------------------
For user A108SSM7YMMTOM with generated interests: ['smile', 'entertainment', 'inspire', 'master', 'ashamed']
Recommended item 0: (4.22891755104065, 'B000PHA2XS'): ['skynyrd', 'outlaw', 'blackfoot', 'ghost', 'midnight']
Recommended item 1: (4.239319992065429, 'B000S4GHYG'): ['bach', 'science', 'fugue', 'arthur', 'fictio

In [2]:

# Load the reviews into a dataframe via the Amazon loader.
df = amazon_dataloader.get_pandas_df()
# Inspect the row labels. In the recorded output the index has 231344 entries
# but labels run up to 231391, i.e. it is not a contiguous 0..n-1 range, so
# rows must be addressed by label rather than by position.
df.index

['/Users/mert.tunc/Code/ceng/thesis/dataset/amazon/Musical_Instruments_5.json']
        overall  verified   reviewTime      reviewerID        asin  \
0             5      True  10 30, 2016  A3FO5AKVTFRCRJ  0739079891   
1             5      True  06 30, 2016  A3UCGC1DHFMBCE  0739079891   
2             5      True   05 9, 2016  A2S9SLRYLPGYZB  0739079891   
3             4      True  04 10, 2016  A15RTJWPG8OKOE  0739079891   
4             1      True   02 6, 2016  A12ET1WO3OAVU7  0739079891   
...         ...       ...          ...             ...         ...   
231387        5      True  03 29, 2018  A2GLR2Q3M1PC7P  B01HIDOPP2   
231388        5      True  11 18, 2017  A3S1LSTRW57BP9  B01HIDOPP2   
231389        4      True   06 5, 2017  A2BNWRRADDBSVO  B01HIDOPP2   
231390        1      True  05 27, 2017  A1KRL5ZRON6DPT  B01HIDOPP2   
231391        5      True  03 24, 2017  A3QN3W0PJ1DXIT  B01HIDOPP2   

            reviewerName                                         reviewText  \


Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            231382, 231383, 231384, 231385, 231386, 231387, 231388, 231389,
            231390, 231391],
           dtype='int64', length=231344)

In [19]:

# Hold out one review per user as the test set: `.nth(5)` picks the row at
# position 5 (the sixth row) of each user's group, so users with fewer than
# six reviews contribute nothing to `test`.
test = df.groupby('userID', as_index=False).nth(5)
test_indexes = test.index

# Everything not held out is training data. A boolean mask is used instead of
# `df.loc[set(...)]`: set indexers are deprecated in pandas 1.5 and rejected
# by pandas 2.x, and a set also discards the original row order.
train = df.loc[~df.index.isin(test_indexes)]

# The two parts must partition the full frame.
assert len(train) + len(test) == len(df), 'train/test split does not partition df'
print( len(df), len(train), len(test) )




231344 212559 18785


In [4]:
# `DataFrame.size` is the total number of cells (rows x columns), not the row
# count; use `len(df)` or `df.shape` for the number of rows.
df.size

2776128

In [None]:
# Drop every row whose index label appears in `blacklist`, keeping the
# remaining rows in their original order. A boolean mask replaces
# `df.loc[set(...)]`, which pandas 2.x rejects and which loses row order;
# labels in `blacklist` that are absent from `df` are simply ignored.
# NOTE(review): `blacklist` is not defined in any visible cell -- it must be
# defined before this cell is run.
df.loc[~df.index.isin(list(blacklist))]
