In [579]:
import numpy as np
import pandas as pd
import os

import surprise
from scipy.sparse.linalg import svds
from options import RecSysOptions
import datetime
from sklearn.metrics import mean_squared_error
import surprise
from surprise import Reader, Dataset, SVD

from Levenshtein import distance as levenshtein
from sklearn.metrics.pairwise import cosine_similarity
import regex as re

import warnings
warnings.filterwarnings("ignore")

# import logging
# logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(message)s',
#                     datefmt='%d-%b-%y %H:%M:%S', filemode='a')
# logger = logging.getLogger(__name__)

In [534]:
class SvdModel():
    def __init__(self, n_vectors: int = 5):
        self.n_vectors = n_vectors
        self.data = None

    def load_data(self, options: RecSysOptions):
        model_path_split = os.path.splitext(options.model_data_path)
        options.model_name = model_path_split[0]
        options.model_extention = model_path_split[1][1:]

        if options.model_extention == 'csv':
            self.data = pd.read_csv(options.model_data_path,encoding=options.encoding)
        else:
            raise Exception("Wrong model extension")
        return

    def fit(self, matrix: pd.DataFrame, n_vectors: int, mean_user_rating: np.ndarray): #, std_user_rating: np.ndarray):
        u, sigma, vt = svds(matrix, k=n_vectors)
        sigma_diag_matrix = np.diag(sigma)
        # predicted_ratings = np.dot(np.dot(u, sigma_diag_matrix), vt) * std_user_rating + mean_user_rating
        predicted_ratings = np.dot(np.dot(u, sigma_diag_matrix), vt) + mean_user_rating
        self.data = pd.DataFrame(abs(predicted_ratings), columns = matrix.columns)
        return self.data

    def save(self, name: str, options: RecSysOptions):
        if name != options.model_name:
            options.renew_model_name_and_path(name)
        self.data.to_csv(options.model_data_path, index=False)
        return


In [694]:
class RecSysMF():
    def __init__(self, options: RecSysOptions):
        self.options = options
        self.model = None
        self.users_matrix = None
        self.items_matrix = None
        self.ratings_train = None
        self.ratings_test = None
        self.user_item_matrix = None
        self.mean_user_rating = None
        #self.std_user_rating = None
        self.n_users = None
        self.n_items = None
        self.trained = False
        self.surprise_matrix = None
        self.items_similarity_matrix = None
        pass

    # def warmup(self, model_name: str='model', model_extension: str='csv'):
    def warmup(self, model_type: str='SVD'):
        print(self.options.model_data_path, self.options.model_name, self.options.model_extention)
        if self.__is_model_exists(self.options.model_data_path):
            print('model exist')
            if model_type == 'SVD':
                self.model = SvdModel()
                self.model.load_data(self.options)
            else:
                raise Exception('Invalid model type!')
        else:
            print('model dont exist')

        self.users_matrix, self.n_users = self.__load_users_data(self.options.users_data_path)
        self.items_matrix, self.n_items = self.__load_items_data(self.options.items_data_path)
        
        self.users_matrix = self.__proceed_users(self.users_matrix)
        self.items_matrix = self.__proceed_items(self.items_matrix)
        # logger.info(f"Model: {self.options.model} successfully loaded: {datetime.now()}")
        pass

    def train(self, train_data_path: str=None):
        self.warmup()

        if train_data_path == None:
            train_data_path = self.options.train_data_path

        self.ratings_train = self.__load_ratings(train_data_path)
        print('here we go')
        self.user_item_matrix = self.__create_user_item_matrix(self.users_matrix, self.items_matrix, self.ratings_train)

        # predict 
        # matrix demeaned
        #self.user_item_matrix, self.mean_user_rating, self.std_user_rating = self.__normalize_matrix(self.user_item_matrix)
        self.user_item_matrix, self.mean_user_rating = self.__normalize_matrix(self.user_item_matrix)
        self.model.fit(self.user_item_matrix, self.options.n_vectors, self.mean_user_rating) #, self.std_user_rating)

        self.model.save(self.options.model_name, self.options)
        self.trained = True

        return
    
    def __is_model_exists(self, model_data_path: str):
        """
        все модели сохраняются в моделс сторе
        """
        return os.path.exists(model_data_path)

    def __create_user_item_matrix(self, users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
        user_item_rating_dataframe = self.__create_user_item_rating_dataframe(users, items, ratings)
        matrix = user_item_rating_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
        return matrix

    def __create_user_item_rating_dataframe(self, users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
        dataframe = pd.merge(ratings, items, on='movie_id', how='left').merge(users, on='user_id', how='left')
        return dataframe

    def __load_model_data(self, model_path: str):
        model_path_split = os.path.splitext(model_path)
        self.options.model_name = model_path_split[0]
        self.options.model_extention = model_path_split[1][1:]

        if self.options.model_extention == 'csv':
            self.model = pd.read_csv(self.options.model_store + f'{model_path}',encoding=self.options.encoding)
        else:
            raise Exception("Wrong model extension")
        return

    def __load_users_data(self, users_data: str):
        users = pd.read_csv(users_data, names=['user_id', 'gender', 'age', 'occupation', 'zip-code'],
                 sep=self.options.data_loading_sep, engine=self.options.data_loading_engine, encoding=self.options.encoding)
        n_users = users['user_id'].nunique()

        return users, n_users

    # вынести
    def __load_items_data(self, items_data: str):
        items = pd.read_csv(items_data, names=['movie_id', 'title', 'genres'],
                 sep=self.options.data_loading_sep, engine=self.options.data_loading_engine, encoding=self.options.encoding)
        n_items = items['movie_id'].nunique()

        return items, n_items

    # def __load_ratings(self, ratings_data_path: str, is_train: bool=True):    
    def __load_ratings(self, ratings_data_path: str):
        ratings = pd.read_csv(ratings_data_path, names=['user_id', 'movie_id', 'rating', 'date'],
                            sep=self.options.data_loading_sep, engine=self.options.data_loading_engine, encoding=self.options.encoding)
        # if is_train:
        #     self.ratings_train = ratings
        # else:
        #     self.ratings_test = ratings
        return ratings

    def __proceed_items(self, items_matrix: pd.DataFrame):
        items_matrix['release_year'] = items_matrix['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)
        return items_matrix
    
    def __proceed_users(self, users_matrix: pd.DataFrame):
        return users_matrix

    def __normalize_matrix(self, matrix: pd.DataFrame):
        mean_user_rating = np.mean(matrix.values, axis=1).reshape(-1, 1)
        #std_user_rating = np.std(matrix.values, axis=1).reshape(-1, 1)
        matrix_normalized_values = (matrix.values - mean_user_rating) # / std_user_rating
        matrix = pd.DataFrame(data=matrix_normalized_values, index=matrix.index, columns=matrix.columns)
        
        return matrix, mean_user_rating #, std_user_rating

    def __get_movies_ids(self, predictions: pd.DataFrame):
        ids = predictions.columns.values
        return [int(x) for x in ids]

    def evaluate(self, test_data_path: str=None):
        if self.trained == False:
            raise Exception('Model not trained!')

        self.warmup()
        if test_data_path == None:
            test_data_path = self.options.test_data_path

        self.ratings_test = self.__load_ratings(test_data_path)
        test_dataset = self.__create_user_item_rating_dataframe(self.users_matrix, self.items_matrix, self.ratings_test)
        rmse = self.calculate_rmse(test_dataset, self.model.data)

        print(f'RMSE: {rmse}')
        return rmse

    def calculate_rmse(self, dataset: pd.DataFrame, preds: pd.DataFrame):

        real_marks = []
        predictions = []
        for index, row in dataset.iterrows():
            user_id = row['user_id'] - 1
            movie_id = str(row['movie_id'])
            rating = row['rating']
            if movie_id in preds.columns:
                real_marks.append(rating)
                predictions.append(preds[f'{movie_id}'][user_id])

        return mean_squared_error(real_marks, predictions, squared=False)


    #### surprise ####
    def surprise_train(self, train_data_path: str=None):
        if train_data_path == None:
            train_data_path = self.options.train_data_path

        self.ratings_train = self.__load_ratings(train_data_path)

        dataset = self.__surprise_get_dataset(self.ratings_train)
        self.__surprise_fit_model(dataset)
        self.trained = True
        return

    def __surprise_get_dataset(self, ratings: pd.DataFrame):
        return Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader=Reader(rating_scale=(1,5)))

    def __surprise_fit_model(self, dataset: surprise.Dataset):
        self.model = SVD(n_factors=50)
        self.model.fit(dataset.build_full_trainset())

    def __surprise_make_predictions(self, dataset: surprise.Dataset):
        real_marks = []
        predictions = []
        for row in dataset.build_full_trainset().build_testset():
            real_marks.append(row[2])
            predictions.append(self.model.predict(row[0], row[1]).est)

        return np.array(real_marks), np.array(predictions)

    def __surprise_calculate_rmse(self, real: np.matrix, pred: np.matrix):
        return mean_squared_error(real, pred, squared=False)

    def surprise_evaluate(self, test_data_path: str=None):
        if self.trained == False:
            raise Exception('Model not trained!')

        if test_data_path == None:
            test_data_path = self.options.test_data_path

        self.ratings_test = self.__load_ratings(test_data_path)

        dataset = self.__surprise_get_dataset(self.ratings_test)

        real_marks, predictions = self.__surprise_make_predictions(dataset)

        rmse = self.__surprise_calculate_rmse(real_marks, predictions)

        print(f'RMSE evaluate: {rmse}')



    # query handlers

    def __find_item_by_name(self, received_name: str):
        item_index = self.items_matrix['title'].apply(lambda title: levenshtein(re.sub(r' \([0-9]{4}\)', '', title.lower()), received_name.lower())).idxmin()
        item_id = self.items_matrix.loc[item_index]['movie_id']
        return item_id

    def __calculate_items_similarity_matrix(self, items_matrix: pd.DataFrame):
        # t тк вектор - фильм
        similarity_matrix = cosine_similarity(items_matrix, items_matrix)
        # нуля чтоб сам себя не рекоммендовал
        np.fill_diagonal(similarity_matrix, 0)
        similarity_df = pd.DataFrame(similarity_matrix, self.model.data.columns, self.model.data.columns)
        return similarity_df

    def __find_similar(self, movie_id: int, n: int=5):
        items_idxs = np.array(self.items_similarity_matrix[movie_id].sort_values(ascending=False)[:n].index.values, dtype=int).tolist()

        items = self.__sort_items_by_ids(self.items_matrix, items_idxs)

        items_idxs = [int(x) for x in items_idxs]
        return items_idxs, items

    def get_similar_items(self, received_name: str='Bambi (1942)', amount: int=5):
        item_id = self.__find_item_by_name(received_name)
        if self.items_similarity_matrix is None:
            self.items_similarity_matrix = self.__calculate_items_similarity_matrix(self.model.data.T)
        items_idxs, items = self.__find_similar(item_id, amount)
        return [items_idxs, items]

    def predict(self, items_ratings: list, M: int=10):
        if len(array.shape) == 1:
            raise Exception('Wrong input!')

        new_user_row = self.__init_new_row(items_ratings)

        # print(new_user_row)

        most_similar_user_id = pd.DataFrame(data=cosine_similarity(new_user_row, self.model.data),
                                         columns=self.model.data.T.columns.values).idxmax(axis=1).max()

        # print(most_similar_user_id)

        items_ids = [int(x) for x in self.model.data.T[most_similar_user_id].sort_values(ascending=False)[:M].index.values.tolist()]
        items_ratings = self.model.data.T[most_similar_user_id].sort_values(ascending=False)[:M].values.tolist()
        items_names = self.__find_items_by_ids(items_ids).title.values.tolist()

        return [items_names, items_ratings]

    def __find_items_by_ids(self, ids: list):
        return self.items_matrix.loc[self.items_matrix['movie_id'].isin(ids), :]

    def __init_new_row(self, items_ratings: list):

        items_ids = items_ratings[0]
        ratings = items_ratings[1]

        # for (items_id, rating) in zip(items_ids, ratings):
        #     # if items_id in new_user_row.columns:
        #     new_user_row[f'{items_id}'] = rating

        new_user_row = pd.DataFrame(data=[ratings],  columns=self.model.data.columns)
        return new_user_row

    def __create_indexer_dict(self, items_ids: list):
        indexer = {}
        for i, val in enumerate(items_ids):
            indexer[val] = i
        return indexer

    def __sort_items_by_ids(self, items: pd.DataFrame, items_ids: list):
        indexer = self.__create_indexer_dict(items_ids)
        items = items.loc[items['movie_id'].isin(items_ids), :]
        items.loc[:, ['order']] = items['movie_id'].map(indexer)
        names = items.sort_values('order')['title'].values.tolist()
        return names

In [695]:
options = RecSysOptions()

./store\model.csv


In [696]:
recsys = RecSysMF(options)

In [697]:
%%time

recsys.train('../data/train/ratings_train.dat')

./store\model.csv model csv
model exist
here we go
Wall time: 55.4 s


In [467]:
%%time

recsys.evaluate('./../data/test/ratings_test.dat')

./store\model.csv ./store\model csv
model exist
RMSE: 2.7436507693623815
Wall time: 8.27 s


2.7436507693623815

In [584]:
recsys.get_similar_items('terminator 2', 5)

[[1038, 3517, 2811, 666, 1160],
 ['Unhook the Stars (1996)',
  'Bells, The (1926)',
  'With Friends Like These... (1998)',
  'All Things Fair (1996)',
  'Six of a Kind (1934)']]

In [702]:
user_id = 50
data = [[int(x) for x in recsys.model.data.T[user_id].index.values.tolist()], recsys.model.data.T[user_id].tolist()]

In [701]:
# recsys.model.data.T[user_id].index.values.tolist()

In [700]:
# recsys.model.data

In [698]:
# row_data = np.zeros_like(recsys.model.data.columns)
#
# items_ids = data[0]
# ratings = data[1]
#
# for (i, rating) in enumerate(ratings):
#     print(i, rating)
#     # if items_id in new_user_row.columns:
#     row_data[i] = rating
#
# new_user_row = pd.DataFrame(data=[ratings],  columns=recsys.model.data.columns, dtype=float)
# new_user_row
#


In [705]:
recsys.predict(data)

[['Toy Story (1995)',
  'Babe (1995)',
  'Shawshank Redemption, The (1994)',
  'Groundhog Day (1993)',
  'Shakespeare in Love (1998)',
  'Election (1999)',
  'American Beauty (1999)',
  'Toy Story 2 (1999)',
  'X-Men (2000)',
  'Almost Famous (2000)'],
 [4.937720848094659,
  4.5137474506584425,
  4.024122789018908,
  3.703004951626429,
  3.5512888355791494,
  3.492567441156093,
  3.419010923191872,
  3.055025630521286,
  2.7751210065072396,
  2.328677939150544]]

In [703]:
# любимые фильмы первого
user_1_top_10_idx = recsys.model.data.T[user_id].sort_values(ascending=False)[:10].index.values
user_1_top_10_idx.tolist()

[3114, 1, 318, 34, 2599, 1265, 2396, 2858, 3793, 3897]

In [704]:
recsys.items_matrix.loc[recsys.items_matrix['movie_id'].isin(user_1_top_10_idx.tolist()), :]

Unnamed: 0,movie_id,title,genres,release_year
0,1,Toy Story (1995),Animation|Children's|Comedy,1995
33,34,Babe (1995),Children's|Comedy|Drama,1995
315,318,"Shawshank Redemption, The (1994)",Drama,1994
1245,1265,Groundhog Day (1993),Comedy|Romance,1993
2327,2396,Shakespeare in Love (1998),Comedy|Romance,1998
2530,2599,Election (1999),Comedy,1999
2789,2858,American Beauty (1999),Comedy|Drama,1999
3045,3114,Toy Story 2 (1999),Animation|Children's|Comedy,1999
3724,3793,X-Men (2000),Action|Sci-Fi,2000
3827,3897,Almost Famous (2000),Comedy|Drama,2000


In [616]:
sim_df['1'].sort_values(by='1', ascending=False)[:5].index.values

TypeError: sort_values() got an unexpected keyword argument 'by'

In [560]:
# recsys.items_similarity_matrix[61]

In [561]:
# cosine_similarity(recsys.model.data, recsys.model.data)

In [562]:
# magnitude  = np.sqrt(np.einsum('ij, ij -> i', recsys.model.data.values, recsys.model.data.values))

In [419]:
movie_id = 596

In [423]:
recsys.model.data[str(movie_id)]

0       1.471272
1       0.330625
2       0.081332
3       0.115323
4       0.013878
          ...   
6035    3.675408
6036    0.196762
6037    0.017805
6038    0.289937
6039    0.316164
Name: 596, Length: 6040, dtype: float64

In [427]:
recsys.model.data.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,6030,6031,6032,6033,6034,6035,6036,6037,6038,6039
1,4.871885,0.668710,0.714893,0.193247,0.361484,3.243722,0.090134,2.503542,4.657500,4.982542,...,0.198171,3.559481,0.608762,0.113156,3.702419,0.053738,0.655982,0.075344,0.269848,1.901347
2,0.438117,0.092948,0.575860,0.214896,0.358291,0.186826,0.057171,0.175612,0.307891,3.446350,...,0.141817,0.158343,0.215073,0.035222,0.521429,0.488578,0.543154,0.067651,0.345289,0.151289
3,0.236267,0.239192,0.038079,0.065603,0.086292,0.361229,0.195316,0.697256,0.097562,0.891705,...,0.030427,0.071279,0.064333,0.026030,1.009622,0.003457,0.041341,0.121138,0.050865,0.247654
4,0.068818,0.005992,0.009929,0.072932,0.192728,0.078846,0.067753,0.476411,0.069276,0.085827,...,0.011302,0.011248,0.066994,0.014847,0.345044,0.661493,0.027133,0.011392,0.000217,0.017847
5,0.111529,0.111383,0.011354,0.055503,0.380067,0.129457,0.066480,0.271837,0.058576,1.218124,...,0.162230,0.228063,0.047033,0.048250,0.845374,0.129167,0.055304,0.002136,0.169258,0.170130
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.237982,0.627199,0.043739,0.094150,0.273037,0.339920,0.207974,0.018805,0.766159,0.727699,...,0.298318,0.134509,0.041080,0.109624,0.549066,1.313711,0.226731,0.098053,0.891569,0.659637
3949,0.036789,0.293496,0.109650,0.099849,0.480053,0.050315,0.158560,0.259364,0.016426,0.189360,...,0.010939,0.241871,0.002943,0.107708,0.004363,1.678013,0.495063,0.005975,0.327751,0.329644
3950,0.023227,0.020154,0.036885,0.038574,0.006323,0.051571,0.104950,0.047434,0.091822,0.204647,...,0.111578,0.041979,0.005280,0.066220,0.056851,0.468784,0.039329,0.015083,0.011674,0.120566
3951,0.096212,0.034439,0.067831,0.007892,0.141444,0.041613,0.003199,0.046715,0.009995,0.169664,...,0.082670,0.076223,0.007305,0.032883,0.043014,0.321853,0.149732,0.015647,0.076091,0.118427


In [426]:
# similarity = np.dot(recsys.model.data[movie_id], recsys.model.data.values) / (magnitude[int(movie_id) - 1] * magnitude)
cosine_similarity(recsys.model.data.T, recsys.model.data.T)

array([[1.        , 0.61568425, 0.57162265, ..., 0.52623685, 0.51702564,
        0.50257585],
       [0.61568425, 1.        , 0.6475498 , ..., 0.522697  , 0.4778184 ,
        0.46935634],
       [0.57162265, 0.6475498 , 1.        , ..., 0.5094573 , 0.48454428,
        0.49444004],
       ...,
       [0.52623685, 0.522697  , 0.5094573 , ..., 1.        , 0.77118592,
        0.78409996],
       [0.51702564, 0.4778184 , 0.48454428, ..., 0.77118592, 1.        ,
        0.73623774],
       [0.50257585, 0.46935634, 0.49444004, ..., 0.78409996, 0.73623774,
        1.        ]])

In [213]:
# sim_df_data = cosine_similarity(recsys.model.data.T, recsys.model.data.T)
# np.fill_diagonal(sim_df_data, 0)

In [214]:
sim_df_data

array([[0.        , 0.61568425, 0.57162265, ..., 0.52623685, 0.51702564,
        0.50257585],
       [0.61568425, 0.        , 0.6475498 , ..., 0.522697  , 0.4778184 ,
        0.46935634],
       [0.57162265, 0.6475498 , 0.        , ..., 0.5094573 , 0.48454428,
        0.49444004],
       ...,
       [0.52623685, 0.522697  , 0.5094573 , ..., 0.        , 0.77118592,
        0.78409996],
       [0.51702564, 0.4778184 , 0.48454428, ..., 0.77118592, 0.        ,
        0.73623774],
       [0.50257585, 0.46935634, 0.49444004, ..., 0.78409996, 0.73623774,
        0.        ]])

In [215]:
sim_df = pd.DataFrame(sim_df_data, recsys.model.data.columns, recsys.model.data.columns)

In [479]:
sim_df[str('3461')]

1       0.545478
2       0.610611
3       0.527990
4       0.516279
5       0.520910
          ...   
3948    0.521449
3949    0.528689
3950    0.608248
3951    0.549604
3952    0.520427
Name: 3461, Length: 3700, dtype: float64

In [218]:
np.max(sim_df['596'])

0.9534783333144752

In [476]:
np.array(sim_df['3461'].sort_values(ascending=False)[:5].index.values, dtype=int)

array([1931, 1099, 3168, 2820, 1085])

In [514]:
top_n_movies_indexes, sim_df['596'].sort_values(ascending=False)[:5]

(array([2018, 1029, 2096,  594, 1022]),
 2018    0.953478
 1029    0.931716
 2096    0.919233
 594     0.914133
 1022    0.913597
 Name: 596, dtype: float64)

{2018: 0, 1029: 1, 2096: 2, 594: 3, 1022: 4}

In [580]:
top_movies = recsys.items_matrix.loc[recsys.items_matrix['movie_id'].isin(top_n_movies_indexes.tolist()), :]
top_movies['order'] = top_movies['movie_id'].map(indexer)
top_movies.sort_values('order')['title']values.tolist()

SyntaxError: invalid syntax (Temp/ipykernel_10112/3879629942.py, line 3)

0.9257690800883402

In [None]:
########################################################

In [209]:
# movie_name = 'pinocheo'

In [173]:
# ind = 60

In [200]:
# np.min(recsys.items_matrix['title'].apply(lambda title: levenshtein(re.sub(r' \([0-9]{4}\)', '', title.lower()), movie_name.lower())))

2

In [415]:
# it_min = recsys.items_matrix['title'].apply(lambda title: levenshtein(re.sub(r' \([0-9]{4}\)', '', title.lower()), movie_name.lower())).idxmin()
# it_min

592

In [422]:
recsys.items_matrix.loc[it_min]['movie_id']

596

In [224]:
#######################################

In [412]:
# recsys.items_matrix.loc[592]

movie_id                         596
title               Pinocchio (1940)
genres          Animation|Children's
release_year                    1940
Name: 592, dtype: object

In [None]:
# top 596

In [439]:
array = [[2087, 1042, 2165, 598, 1035], [5, 4, 3, 2, 4]]
array = np.array(array)

In [440]:
if len(array.shape) == 1:
    print('error')
else:
    print('norm')

norm


In [441]:
single_row = pd.DataFrame(data=[np.zeros_like(recsys.model.data.columns)],  columns=recsys.model.data.columns)

In [442]:
movie_from_array_ids = array[0]

In [443]:
ratings_from_array_ids = array[1]

In [444]:
for (movie_id, rating) in zip(movie_from_array_ids, ratings_from_array_ids):
    if str(movie_id) in single_row.columns:
        single_row[f'{movie_id}'] = rating

In [445]:
# for movie_id in movie_from_array_ids:
#     print(single_row[f'{movie_id}'])

In [625]:
new_user_to_athers_sim_df = pd.DataFrame(data=cosine_similarity(single_row, recsys.model.data),
                                         columns=recsys.model.data.T.columns.values)

In [631]:
new_user_to_athers_sim_df.idxmax(axis=1).max()

2342

In [454]:
# максимально похож на этого юзера
# user - int, movie - str
# new_user_to_athers_sim_df[2342]

In [460]:
new_user_to_athers_sim_df = pd.DataFrame(data=cosine_similarity(single_row, recsys.model.data),
                                         columns=recsys.model.data.T.columns.values).max()

[int(x) for x in recsys.model.data.T[2342].sort_values(ascending=False)[:5].index.values.tolist()],\
    recsys.model.data.T[2342].sort_values(ascending=False)[:5].values.tolist()

([899, 1947, 1035, 1028, 914],
 [4.118885978169936,
  3.966565191153752,
  3.74275893457156,
  3.55204608957321,
  3.523668979004352])

In [459]:
topn_mov = np.array(recsys.model.data.T[2342].sort_values(ascending=False)[:5].index.values, dtype=int)
recsys.items_matrix.loc[recsys.items_matrix['movie_id'].isin(topn_mov), :].title.values.tolist()

["Singin' in the Rain (1952)",
 'My Fair Lady (1964)',
 'Mary Poppins (1964)',
 'Sound of Music, The (1965)',
 'West Side Story (1961)']

In [385]:
recsys.model.data.T[2342][-1:]

3952    0.025732
Name: 2342, dtype: float64

In [15]:
# test_dataset = recsys.create_user_item_rating_dataframe(recsys.users_matrix, recsys.items_matrix, ratings_test)

In [14]:
# ratings_test = pd.read_csv(options.test_data_path, names=['user_id', 'movie_id', 'rating', 'date'],
#                             sep=options.data_loading_sep, engine=options.data_loading_engine, encoding=options.encoding)

In [21]:
# movies_ids = recsys.model.data.columns.values
# movies_ids = [int(x) for x in movies_ids]

In [73]:
# recsys.model.data

In [74]:
# recsys.model.data['1']

In [78]:

#real_marks = []
#predictions = []
# index, row in test_dataset.iterrows():
 #   user_id = row['user_id'] - 1
 #   movie_id = str(row['movie_id'])
 #   rating = row['rating']
 #   if movie_id in recsys.model.data.columns:
#        real_marks.append(rating)
#        predictions.append(recsys.model.data[f'{movie_id}'][user_id])

#    print(np.array(predictions).shape, np.array(real_marks).shape)

#mean_squared_error(real_marks, predictions, squared=False)

In [79]:
recsys.surprise_train('../data/train/ratings_train.dat')

In [80]:
recsys.surprise_evaluate('./../data/test/ratings_test.dat')

RMSE evaluate: 0.8592949502700236


<__main__.RecSysMF at 0x2cca56a68e0>

RMSE: 3.2545500777654968


3.2545500777654968

In [175]:
# def __load_users_data(users_data: str):
#         users = pd.read_csv(users_data, names=['user_id', 'gender', 'age', 'occupation', 'zip-code'],
#                  sep=options.data_loading_sep, engine=options.data_loading_engine, encoding=options.encoding)
#         n_users = users['user_id'].nunique()
#
#         return users, n_users
#
# # вынести
# def __load_items_data(items_data: str):
#     items = pd.read_csv(items_data, names=['movie_id', 'title', 'genres'],
#              sep=options.data_loading_sep, engine=options.data_loading_engine, encoding=options.encoding)
#     n_items = items['movie_id'].nunique()
#
#     return items, n_items
#
# def __load_ratings(ratings_data_path: str):
#         ratings = pd.read_csv(ratings_data_path, names=['user_id', 'movie_id', 'rating', 'date'],
#                             sep=options.data_loading_sep, engine=options.data_loading_engine, encoding=options.encoding)
#         # if is_train:
#         #     self.ratings_train = ratings
#         # else:
#         #     self.ratings_test = ratings
#         return ratings

In [169]:
# users, _ = __load_users_data(options.users_data_path)

In [171]:
# items, _ = __load_items_data(options.items_data_path)

In [176]:
# ratings = __load_ratings(options.train_data_path)

In [1]:
# __create_user_item_rating_dataframe(users, items, ratings)

In [173]:
# def __create_user_item_rating_dataframe(users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
#         dataframe = pd.merge(ratings, items, on='movie_id', how='left').merge(users, on='user_id', how='left')
#         return dataframe

In [2]:
# def __create_user_item_matrix(users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
#         user_item_rating_dataframe = __create_user_item_rating_dataframe(users, items, ratings)
#         matrix = user_item_rating_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
#         return matrix

In [42]:
# # user_ids = np.array(test_set.user_id.values)
#
# user_ids = matrix.index.values
#
# movie_ids = matrix.T.index.values

In [3]:
# preds = []
# for user in user_ids:
#     row = []
#     for movie in movie_ids:
#         row.append(svd.predict(user - 1, movie - 1).est)
#     print(f'user {user} counted')
#     preds.append(row)

In [35]:
# user_item_rating_dataframe = test_set
# matrix = user_item_rating_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

In [58]:
# svd.predict(5, 4).est

1.9998207379392061

In [4]:
# preds_numpy = np.array(preds)

In [5]:
# preds_numpy.shape

In [6]:
# movie_ids

In [7]:
# mean_squared_error(matrix.values, preds_numpy, squared=False)

In [8]:
# real_marks = []
# predictions = []
# for index, row in test_df.iterrows():
#     user_id = row['user_id'] - 1
#     movie_id = row['movie_id'] - 1
#     rating = row['rating']
#     if movie_id in movie_ids:
#         real_marks.append(rating)
#         predictions.append(svd.predict(user_id, movie_id).est)

In [9]:
# mean_squared_error(np.array(real_marks), np.array(predictions), squared=False)

In [10]:
# svd.test(test_set)

In [12]:
# def import_data():
#     movies = pd.read_csv( './../data/train/movies.dat', encoding='windows-1251',
#                          names=['movie_id', 'title', 'genres'], sep='::', skiprows=0, engine='python')
#     users = pd.read_csv( './../data/train/users.dat', names=['user_id', 'gender',
#                                                        'age', 'occupation', 'zip-code'], sep='::', skiprows=0, engine='python')
#     ratings_train = pd.read_csv( './../data/train/ratings_train.dat', names=['user_id', 'movie_id', 'rating', 'date'],
#                                 sep='::', skiprows=0, engine='python')
#     ratings_test = pd.read_csv( './../data/test/ratings_test.dat', names=['user_id', 'movie_id', 'rating', 'date'],
#                                sep='::', skiprows=0, engine='python')
#
#     return split_into_datasets(*proceed_data(movies, users, ratings_train, ratings_test))
#
# def proceed_data(movies, users, ratings_train, ratings_test):
#
#     ages_map = {1: 'Under 18',
#             18: '18 - 24',
#             25: '25 - 34',
#             35: '35 - 44',
#             45: '45 - 49',
#             50: '50 - 55',
#             56: '56+'}
#
#     occupations_map = {0: 'Not specified',
#                        1: 'Academic / Educator',
#                        2: 'Artist',
#                        3: 'Clerical / Admin',
#                        4: 'College / Grad Student',
#                        5: 'Customer Service',
#                        6: 'Doctor / Health Care',
#                        7: 'Executive / Managerial',
#                        8: 'Farmer',
#                        9: 'Homemaker',
#                        10: 'K-12 student',
#                        11: 'Lawyer',
#                        12: 'Programmer',
#                        13: 'Retired',
#                        14: 'Sales / Marketing',
#                        15: 'Scientist',
#                        16: 'Self-Employed',
#                        17: 'Technician / Engineer',
#                        18: 'Tradesman / Craftsman',
#                        19: 'Unemployed',
#                        20: 'Writer'}
#
#     gender_map = {'M': 'Male', 'F': 'Female'}
#
#     users['age'] = users['age'].map(ages_map)
#     users['occupation'] = users['occupation'].map(occupations_map)
#     users['gender'] = users['gender'].map(gender_map)
#
#     ratings_train['date'] = pd.to_datetime(ratings_train['date'], unit='s')
#     ratings_test['date'] = pd.to_datetime(ratings_test['date'], unit='s')
#
#     movies['release_year'] = movies['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)
#
#     return movies, users, ratings_train, ratings_test
#
# def create_user_item_matrix(train_set):
#     return train_set.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
#
# def normalize_matrix(matrix):
#     mean_user_rating = np.mean(matrix.values, axis=1).reshape(-1, 1)
#     matrix_demeaned_values = matrix.values - mean_user_rating
#     matrix = pd.DataFrame(data=matrix_demeaned_values, index=matrix.index, columns=matrix.columns)
#
#     return matrix, mean_user_rating
#
#
# def split_into_datasets(movies, users, ratings_train, ratings_test):
#     train_set = pd.merge(ratings_train, movies, on='movie_id', how='left').merge(users, on='user_id', how='left')
#     test_set = pd.merge(ratings_test, movies, on='movie_id', how='left').merge(users, on='user_id', how='left')
#
#     return train_set, test_set

In [13]:
# matrix