In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from implicit.als import AlternatingLeastSquares
import pickle

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
saved_model_fname = "model/finalized_model.sav"
data_fname = "data/ratings.csv"
item_fname = "data/movies_final.csv"
weight = 10

In [5]:
ratings_df = pd.read_csv(data_fname)

In [6]:
a = ratings_df["userId"].astype("category")

In [7]:
ratings_df["movieId"].astype("category").cat.codes

0            0
1            2
2            5
3           43
4           46
          ... 
100831    9416
100832    9443
100833    9444
100834    9445
100835    9485
Length: 100836, dtype: int16

In [29]:
a.cat.codes

0           0
1           0
2           0
3           0
4           0
         ... 
100831    609
100832    609
100833    609
100834    609
100835    609
Length: 100836, dtype: int16

In [16]:
ratings_df.groupby('userId').ngroup()

0           0
1           0
2           0
3           0
4           0
         ... 
100831    609
100832    609
100833    609
100834    609
100835    609
Length: 100836, dtype: int64

In [95]:
ratings_df.groupby('userId').ngroup()

0           0
1           0
2           0
3           0
4           0
         ... 
100831    609
100832    609
100833    609
100834    609
100835    609
Length: 100836, dtype: int64

In [130]:
1 in ratings_df.groupby('movieId').ngroup().unique().tolist()

True

In [137]:
ratings_df[ratings_df.groupby('movieId').ngroup() == 999]

Unnamed: 0,userId,movieId,rating,timestamp
273,3,1302,0.5,1306464189
4920,31,1302,4.0,850467485
6022,42,1302,4.0,996258272
6605,45,1302,5.0,1091305660
7509,51,1302,5.0,1230929661
9572,64,1302,4.0,1161529469
10092,66,1302,4.5,1113188364
10578,68,1302,3.0,1230498000
11692,70,1302,5.0,1355198101
12549,80,1302,4.0,1377306348


In [96]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [10]:
a.cat

<pandas.core.arrays.categorical.CategoricalAccessor object at 0x1056ebb20>

In [19]:

ratings_df = pd.read_csv(data_fname)
ratings_df["userId"] = ratings_df["userId"].astype("category")  # dtype : category
ratings_df["movieId"] = ratings_df["movieId"].astype("category")

# create a sparse matrix of all the users/repos
rating_matrix = coo_matrix((ratings_df["rating"].astype(np.float32),
        (
            ratings_df["movieId"].cat.codes.copy(),   # 이거 수행할려고 카테고리화
            ratings_df["userId"].cat.codes.copy(),
        ),
    )
)


In [20]:
rating_matrix

<9724x610 sparse matrix of type '<class 'numpy.float32'>'
	with 100836 stored elements in COOrdinate format>

In [21]:
als_model = AlternatingLeastSquares(
    factors=50, regularization=0.01, dtype=np.float64, iterations=50
)

als_model.fit(weight * rating_matrix)

100%|██████████| 50/50 [00:36<00:00,  1.38it/s]


In [22]:
als_model

<implicit.cpu.als.AlternatingLeastSquares at 0x127f69420>

In [199]:
als_model.similar_items(itemid=0)

(array([  0, 114, 265,  95,  38, 451, 492, 233,  26, 531], dtype=int32),
 array([1.        , 0.5225028 , 0.49628669, 0.49200082, 0.47754395,
        0.46692424, 0.45454716, 0.44459878, 0.44172463, 0.43412594]))

In [65]:
type(recs)

tuple

In [67]:
for a,b in [(1,2), (3,4) ]:
    pass

In [179]:
def calculate_item_based(model_item_id, items, n=11):

    
    recs = als_model.similar_items(itemid=int(model_item_id), N = n)
                                      
    return [str(items[model_ids]) for model_ids in recs[0]]

In [142]:
ratings_df["movieId"].cat.codes

0            0
1            2
2            5
3           43
4           46
          ... 
100831    9416
100832    9443
100833    9444
100834    9445
100835    9485
Length: 100836, dtype: int16

In [145]:
ratings_df["movieId"].cat.categories

Index([     1,      2,      3,      4,      5,      6,      7,      8,      9,
           10,
       ...
       193565, 193567, 193571, 193573, 193579, 193581, 193583, 193585, 193587,
       193609],
      dtype='int64', length=9724)

In [180]:
ratings_df = pd.read_csv(data_fname)
ratings_df["userId"] = ratings_df["userId"].astype("category")
ratings_df["movieId"] = ratings_df["movieId"].astype("category")
movies_df = pd.read_csv(item_fname)

items = dict(enumerate(ratings_df["movieId"].cat.categories)) # {model item id : data item id} 

In [151]:
ratings_df["movieId"].cat.categories.get_loc(888)

673

In [201]:
parsed_id = ratings_df["movieId"].cat.categories.get_loc(int(1))
result = calculate_item_based(parsed_id, items)

In [202]:
result

['1', '137', '305', '107', '42', '516', '567', '271', '27', '626', '241']

In [203]:
result = [int(x) for x in result]

In [204]:
result_items = movies_df[movies_df["movieId"].isin(result)].to_dict("records")

In [205]:
result_items

[{'movieId': 1,
  'title': 'Toy Story (1995)',
  'genres': 'Adventure|Animation|Children|Comedy|Fantasy',
  'imdbId': 114709,
  'tmdbId': 862.0,
  'url': 'http://www.imdb.com/title/tt0114709/',
  'rating_count': 215,
  'rating_avg': 3.92093023255814,
  'poster_path': 'https://image.tmdb.org/t/p/original/uXDfjJbdP4ijW5hWSBrPrlKpxab.jpg'},
 {'movieId': 27,
  'title': 'Now and Then (1995)',
  'genres': 'Children|Drama',
  'imdbId': 114011,
  'tmdbId': 9263.0,
  'url': 'http://www.imdb.com/title/tt0114011/',
  'rating_count': 9,
  'rating_avg': 3.333333333333333,
  'poster_path': 'https://image.tmdb.org/t/p/original/q31Ks8oAmM4Nq2CcGjyYYpdBAmh.jpg'},
 {'movieId': 42,
  'title': 'Dead Presidents (1995)',
  'genres': 'Action|Crime|Drama',
  'imdbId': 112819,
  'tmdbId': 11443.0,
  'url': 'http://www.imdb.com/title/tt0112819/',
  'rating_count': 7,
  'rating_avg': 3.0,
  'poster_path': 'https://image.tmdb.org/t/p/original/wRLepMZoUCwjLFb1WSbuUT6uVWR.jpg'},
 {'movieId': 107,
  'title': 'Muppet

In [104]:
ratings_df["movieId"]

0              1
1              3
2              6
3             47
4             50
           ...  
100831    166534
100832    168248
100833    168250
100834    168252
100835    170875
Name: movieId, Length: 100836, dtype: category
Categories (9724, int64): [1, 2, 3, 4, ..., 193583, 193585, 193587, 193609]

In [152]:
ratings_df["movieId"].isin([1,2])

0          True
1         False
2         False
3         False
4         False
          ...  
100831    False
100832    False
100833    False
100834    False
100835    False
Name: movieId, Length: 100836, dtype: bool

In [None]:
"Speed (1994)"

In [112]:
len(dict(enumerate(ratings_df["movieId"].cat.categories)))

9724

In [114]:
dict(enumerate(ratings_df["movieId"].cat.categories))[9723]

193609

In [84]:
parsed_id = ratings_df["movieId"].cat.categories.get_loc(int(1))
result = calculate_item_based(parsed_id, items)


In [87]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [85]:
parsed_id

0

In [9]:
ratings_df = pd.read_csv(data_fname)
ratings_df["userId"] = ratings_df["userId"].astype("category")
ratings_df["movieId"] = ratings_df["movieId"].astype("category")
movies_df = pd.read_csv(item_fname)

items_dict = dict(enumerate(ratings_df["movieId"].cat.categories)) # {model item id : data item id} 

In [12]:
'h' in {'a':'h'}

False

In [44]:
input_rating_dict = {1 : 3, 3 : 5}

In [45]:
# input(dict)이 들어올거고
item_ids = {r: i for i, r in items_dict.items()} # (after) {data item id : model item id}

mapped_idx = [item_ids[movie_id] for movie_id in input_rating_dict.keys() if movie_id in item_ids] # for model
data = [weight * float(x) for x in input_rating_dict.values()]

In [46]:
rows = [0 for _ in mapped_idx]

In [22]:
rows

[0, 0]

In [23]:
model = pickle.load(open(saved_model_fname, "rb"))

In [25]:
shape = (1, model.item_factors.shape[0])

In [26]:
shape

(1, 9724)

In [41]:
rows

[0, 0]

In [51]:
coo_matrix((data, (rows, mapped_idx)), shape=shape) # (1,#)

<1x9724 sparse matrix of type '<class 'numpy.float64'>'
	with 2 stored elements in COOrdinate format>

In [49]:
data

[30.0, 50.0]

In [50]:
import numpy as np
a.toarray().shape

(1, 3)

In [13]:
params = ['3:3','5:4']

In [16]:
{int(x.split(":")[0]) : float(x.split(":")[1]) for x in params}

{3: 3.0, 5: 4.0}

In [14]:
dict(
        (int(x.split(":")[0]), float(x.split(":")[1])) for x in params)

{3: 3.0, 5: 4.0}