In [141]:
import numpy as np
import pandas as pd
import os

import surprise
from scipy.sparse.linalg import svds
from options import RecSysOptions
import datetime
from sklearn.metrics import mean_squared_error
import surprise
from surprise import Reader, Dataset, SVD

# import logging
# logging.basicConfig(filename='app.log', level=logging.INFO, format='%(asctime)s - %(message)s',
#                     datefmt='%d-%b-%y %H:%M:%S', filemode='a')
# logger = logging.getLogger(__name__)

In [270]:
class RecSysMF():
    def __init__(self, options: RecSysOptions):
        self.options = options
        self.model = None
        self.users_matrix = None
        self.items_matrix = None
        self.ratings_train = None
        self.ratings_test = None
        self.user_item_matrix = None
        self.mean_user_rating = None
        self.n_users = None
        self.n_items = None
        self.predicted_ratings_matrix = None
        self.trained = False
        self.surprise_matrix = None
        pass

    # def warmup(self, model_name: str='model', model_extension: str='csv'):
    def warmup(self):
        if self.__is_model_exists(self.options.model_path):
            self.model = self.__load_model(self.options.model_path)

        self.users_matrix, self.n_users = self.__load_users_data(self.options.users_data_path)
        self.items_matrix, self.n_items = self.__load_items_data(self.options.items_data_path)
        
        self.users_matrix = self.__proceed_users(self.users_matrix)
        self.items_matrix = self.__proceed_items(self.items_matrix)
        # logger.info(f"Model: {self.options.model} successfully loaded: {datetime.now()}")
        pass

    def train(self, train_data_path: str=None):
        self.warmup()

        if train_data_path == None:
            train_data_path = self.options.train_data_path

        self.ratings_train = self.__load_ratings(train_data_path)
        print('here we go')
        self.user_item_matrix = self.__create_user_item_matrix(self.users_matrix, self.items_matrix, self.ratings_train)

        # predict 
        # matrix demeaned
        self.user_item_matrix, self.mean_user_rating = self.__normalize_matrix(self.user_item_matrix)
        self.predicted_ratings_matrix = self.__predict_ratings(self.user_item_matrix, self.options.n_vectors, self.mean_user_rating)


        self.__save_model(self.predicted_ratings_matrix, self.options.model_name)
        self.trained = True

        return

    def __save_model(self, model: pd.DataFrame, name: str):
        if name != self.options.model_name:
            self.options.renew_model_name_and_path(name)
        model.to_csv(self.options.model_path, index=False)
        return
    
    def __is_model_exists(self, model: str):
        """
        все модели сохраняются в моделс сторе
        """
        return os.path.exists(os.path.join(self.options.model_store, model))

    def __create_user_item_matrix(self, users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
        user_item_rating_dataframe = self.__create_user_item_rating_dataframe(users, items, ratings)
        matrix = user_item_rating_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
        return matrix

    def __create_user_item_rating_dataframe(self, users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
        x = ratings['movie_id']
        y = items['movie_id']
        dataframe = pd.merge(ratings, items, on='movie_id', how='left').merge(users, on='user_id', how='left')
        return dataframe

    def __load_model(self, model_path: str):
        model_path_split = os.path.splitext(model_path)
        self.options.model_name = model_path_split[0]
        self.options.model_extention = model_path_split[1][1:]

        if self.options.model_extention == 'csv':
            self.model = pd.read_csv(self.options.model_store + f'{model_path}',encoding=self.options.encoding)
        else:
            raise Exception("Wrong model extension")

        return

    def __load_users_data(self, users_data: str):
        users = pd.read_csv(users_data, names=['user_id', 'gender', 'age', 'occupation', 'zip-code'],
                 sep=self.options.data_loading_sep, engine=self.options.data_loading_engine, encoding=self.options.encoding)
        n_users = users['user_id'].nunique()

        return users, n_users

    # вынести
    def __load_items_data(self, items_data: str):
        items = pd.read_csv(items_data, names=['movie_id', 'title', 'genres'],
                 sep=self.options.data_loading_sep, engine=self.options.data_loading_engine, encoding=self.options.encoding)
        n_items = items['movie_id'].nunique()

        return items, n_items

    # def __load_ratings(self, ratings_data_path: str, is_train: bool=True):    
    def __load_ratings(self, ratings_data_path: str):
        ratings = pd.read_csv(ratings_data_path, names=['user_id', 'movie_id', 'rating', 'date'],
                            sep=self.options.data_loading_sep, engine=self.options.data_loading_engine, encoding=self.options.encoding)
        # if is_train:
        #     self.ratings_train = ratings
        # else:
        #     self.ratings_test = ratings
        return ratings

    def __proceed_items(self, items_matrix: pd.DataFrame):
        items_matrix['release_year'] = items_matrix['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)
        return items_matrix
    
    def __proceed_users(self, users_matrix: pd.DataFrame):
        return users_matrix

    def __normalize_matrix(self, matrix: pd.DataFrame):
        mean_user_rating = np.mean(matrix.values, axis=1).reshape(-1, 1)
        matrix_demeaned_values = matrix.values - mean_user_rating
        matrix = pd.DataFrame(data=matrix_demeaned_values, index=matrix.index, columns=matrix.columns)
        
        return matrix, mean_user_rating

    def __predict_ratings(self, matrix: pd.DataFrame, n_vectors: int, mean_user_rating: np.ndarray):
        u, sigma, vt = svds(matrix, k=n_vectors)
        sigma_diag_matrix = np.diag(sigma)
        predicted_ratings = np.dot(np.dot(u, sigma_diag_matrix), vt) + mean_user_rating
        preds = pd.DataFrame(abs(predicted_ratings), columns = matrix.columns)

        return preds

    def __get_movies_ids(self, predictions: pd.DataFrame):
        ids = predictions.columns.values
        return ids

    def evaluate(self, test_data_path: str=None):
        if self.trained == False:
            raise Exception('Model not trained!')

        self.warmup()
        if test_data_path == None:
            test_data_path = self.options.test_data_path

        self.ratings_test = self.__load_ratings(test_data_path)
        test_dataset = self.__create_user_item_rating_dataframe(self.users_matrix, self.items_matrix, self.ratings_test)
        rmse = self.calculate_rmse(test_dataset, self.predicted_ratings_matrix)

        print(f'RMSE: {rmse}')
        return rmse

    def calculate_rmse(self, dataset: pd.DataFrame, prediction: pd.DataFrame):
        movies_ids = self.__get_movies_ids(self.predicted_ratings_matrix)
        real_marks = []
        predictions = []
        for index, row in dataset.iterrows():
            user_id = row['user_id'] - 1
            movie_id = row['movie_id'] - 1
            rating = row['rating']
            if movie_id in movies_ids:
                real_marks.append(rating)
                predictions.append(prediction.T[user_id][movie_id])

        return mean_squared_error(real_marks, predictions, squared=False)  




    #### surprise ####
    def surprise_train(self, train_data_path: str=None):
        if train_data_path == None:
            train_data_path = self.options.train_data_path

        self.ratings_train = self.__load_ratings(train_data_path)

        dataset = self.__surprise_get_dataset(self.ratings_train)
        self.__surprise_fit_model(dataset)
        self.trained = True
        return

    def __surprise_get_dataset(self, ratings: pd.DataFrame):
        return Dataset.load_from_df(ratings[['user_id', 'movie_id', 'rating']], reader=Reader(rating_scale=(1,5)))

    def __surprise_fit_model(self, dataset: surprise.Dataset):
        self.model = SVD(n_factors=50)
        self.model.fit(dataset.build_full_trainset())

    def __surprise_make_predictions(self, dataset: surprise.Dataset):
        real_marks = []
        predictions = []
        for row in dataset.build_full_trainset().build_testset():
            real_marks.append(row[2])
            predictions.append(self.model.predict(row[0], row[1]).est)

        return np.array(real_marks), np.array(predictions)

    def __surprise_calculate_rmse(self, real: np.matrix, pred: np.matrix):
        return mean_squared_error(real, pred, squared=False)

    def surprise_evaluate(self, test_data_path: str=None):
        if self.trained == False:
            raise Exception('Model not trained!')

        if test_data_path == None:
            test_data_path = self.options.test_data_path

        self.ratings_test = self.__load_ratings(test_data_path)

        dataset = self.__surprise_get_dataset(self.ratings_test)

        real_marks, predictions = self.__surprise_make_predictions(dataset)

        rmse = self.__surprise_calculate_rmse(real_marks, predictions)

        print(f'RMSE evaluate: {rmse}')


In [271]:
options = RecSysOptions()

In [273]:
recsys = RecSysMF(options)

In [274]:
options.test_data_path

'C:\\Users\\Acer\\Machine Learning\\recsys-mf\\model\\../data/test/ratings_test.dat'

In [239]:
recsys.train('../data/train/ratings_train.dat')

here we go


In [240]:
recsys.evaluate('./../data/test/ratings_test.dat')

RMSE: 3.2545500777654968


3.2545500777654968

In [276]:
recsys.surprise_train('../data/train/ratings_train.dat')

In [277]:
recsys.surprise_evaluate('./../data/test/ratings_test.dat')

RMSE evaluate: 0.8590168438202683


In [248]:
recsys.model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x235ac0d8940>

In [175]:
def __load_users_data(users_data: str):
        users = pd.read_csv(users_data, names=['user_id', 'gender', 'age', 'occupation', 'zip-code'],
                 sep=options.data_loading_sep, engine=options.data_loading_engine, encoding=options.encoding)
        n_users = users['user_id'].nunique()

        return users, n_users

# вынести
def __load_items_data(items_data: str):
    items = pd.read_csv(items_data, names=['movie_id', 'title', 'genres'],
             sep=options.data_loading_sep, engine=options.data_loading_engine, encoding=options.encoding)
    n_items = items['movie_id'].nunique()

    return items, n_items

def __load_ratings(ratings_data_path: str):
        ratings = pd.read_csv(ratings_data_path, names=['user_id', 'movie_id', 'rating', 'date'],
                            sep=options.data_loading_sep, engine=options.data_loading_engine, encoding=options.encoding)
        # if is_train:
        #     self.ratings_train = ratings
        # else:
        #     self.ratings_test = ratings
        return ratings

In [169]:
users, _ = __load_users_data(options.users_data_path)

In [171]:
items, _ = __load_items_data(options.items_data_path)

In [176]:
ratings = __load_ratings(options.train_data_path)

In [178]:
__create_user_item_rating_dataframe(users, items, ratings)

Unnamed: 0,user_id,movie_id,rating,date,title,genres,gender,age,occupation,zip-code
0,3539,2478,5,966882141,Three Amigos! (1986),Comedy|Western,F,25,4,77006
1,5795,377,2,958143844,Speed (1994),Action|Romance|Thriller,M,25,1,92688
2,3513,3526,4,966974170,Parenthood (1989),Comedy|Drama,M,25,16,01890
3,4176,924,5,965330955,2001: A Space Odyssey (1968),Drama|Mystery|Sci-Fi|Thriller,M,45,7,19008
4,4657,1960,4,975803771,"Last Emperor, The (1987)",Drama|War,M,45,14,55416
...,...,...,...,...,...,...,...,...,...,...
950193,4148,969,5,965343002,"African Queen, The (1951)",Action|Adventure|Romance|War,F,35,7,55021
950194,4115,2393,5,965363379,Star Trek: Insurrection (1998),Action|Sci-Fi,M,25,15,99353
950195,3721,1061,4,966225726,Sleepers (1996),Crime|Drama,M,25,15,94065
950196,4946,1172,4,962641424,Cinema Paradiso (1988),Comedy|Drama|Romance,F,35,1,91106


In [173]:
def __create_user_item_rating_dataframe(users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
        dataframe = pd.merge(ratings, items, on='movie_id', how='left').merge(users, on='user_id', how='left')
        return dataframe

In [None]:
def __create_user_item_matrix(users: pd.DataFrame, items: pd.DataFrame, ratings: pd.DataFrame):
        user_item_rating_dataframe = __create_user_item_rating_dataframe(users, items, ratings)
        matrix = user_item_rating_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)
        return matrix

In [42]:
# user_ids = np.array(test_set.user_id.values)

user_ids = matrix.index.values

movie_ids = matrix.T.index.values

In [80]:
preds = []
for user in user_ids:
    row = []
    for movie in movie_ids:
        row.append(svd.predict(user - 1, movie - 1).est)
    print(f'user {user} counted')
    preds.append(row)

user 1 counted
user 2 counted
user 3 counted
user 4 counted
user 5 counted
user 6 counted
user 7 counted
user 8 counted
user 9 counted
user 10 counted
user 11 counted
user 13 counted
user 14 counted
user 15 counted
user 16 counted
user 17 counted
user 18 counted
user 19 counted
user 21 counted
user 22 counted
user 23 counted
user 24 counted
user 25 counted
user 26 counted
user 27 counted
user 28 counted
user 29 counted
user 31 counted
user 32 counted
user 33 counted
user 34 counted
user 35 counted
user 36 counted
user 37 counted
user 38 counted
user 39 counted
user 40 counted
user 42 counted
user 43 counted
user 44 counted
user 45 counted
user 46 counted
user 47 counted
user 48 counted
user 49 counted
user 52 counted
user 53 counted
user 54 counted
user 55 counted
user 56 counted
user 57 counted
user 58 counted
user 59 counted
user 60 counted
user 61 counted
user 62 counted
user 63 counted
user 64 counted
user 65 counted
user 66 counted
user 68 counted
user 69 counted
user 70 counted
u

In [35]:
user_item_rating_dataframe = test_set
matrix = user_item_rating_dataframe.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

In [58]:
svd.predict(5, 4).est

1.9998207379392061

In [70]:
preds_numpy = np.array(preds)

In [74]:
preds_numpy.shape

(5628, 3097)

In [86]:
movie_ids

array([   1,    2,    3, ..., 3950, 3951, 3952], dtype=int64)

In [79]:
mean_squared_error(matrix.values, preds_numpy, squared=False)

3.3663903127986603

In [94]:
real_marks = []
predictions = []
for index, row in test_df.iterrows():
    user_id = row['user_id'] - 1
    movie_id = row['movie_id'] - 1
    rating = row['rating']
    if movie_id in movie_ids:
        real_marks.append(rating)
        predictions.append(svd.predict(user_id, movie_id).est)

In [98]:
mean_squared_error(np.array(real_marks), np.array(predictions), squared=False)

1.2489714830969187

In [99]:
svd.test(test_set)

[Prediction(uid=3842, iid=1645, r_ui=2.0, est=3.2607299996105548, details={'was_impossible': False}),
 Prediction(uid=3842, iid=1225, r_ui=5.0, est=5, details={'was_impossible': False}),
 Prediction(uid=3842, iid=919, r_ui=5.0, est=4.979811406077676, details={'was_impossible': False}),
 Prediction(uid=3842, iid=2455, r_ui=4.0, est=3.400370123479351, details={'was_impossible': False}),
 Prediction(uid=3842, iid=34, r_ui=5.0, est=4.693533166326759, details={'was_impossible': False}),
 Prediction(uid=3842, iid=1972, r_ui=1.0, est=2.1037555814656423, details={'was_impossible': False}),
 Prediction(uid=3842, iid=235, r_ui=5.0, est=4.5284490878507215, details={'was_impossible': False}),
 Prediction(uid=3842, iid=1589, r_ui=4.0, est=3.4232317089396713, details={'was_impossible': False}),
 Prediction(uid=3842, iid=2413, r_ui=3.0, est=3.333729805777336, details={'was_impossible': False}),
 Prediction(uid=3842, iid=2, r_ui=3.0, est=3.255046298133754, details={'was_impossible': False}),
 Predicti

0.8591616572462043

In [23]:
def import_data():
    movies = pd.read_csv( './../data/train/movies.dat', encoding='windows-1251',
                         names=['movie_id', 'title', 'genres'], sep='::', skiprows=0, engine='python')
    users = pd.read_csv( './../data/train/users.dat', names=['user_id', 'gender',
                                                       'age', 'occupation', 'zip-code'], sep='::', skiprows=0, engine='python')
    ratings_train = pd.read_csv( './../data/train/ratings_train.dat', names=['user_id', 'movie_id', 'rating', 'date'],
                                sep='::', skiprows=0, engine='python')
    ratings_test = pd.read_csv( './../data/test/ratings_test.dat', names=['user_id', 'movie_id', 'rating', 'date'],
                               sep='::', skiprows=0, engine='python')

    return split_into_datasets(*proceed_data(movies, users, ratings_train, ratings_test))

def proceed_data(movies, users, ratings_train, ratings_test):

    ages_map = {1: 'Under 18',
            18: '18 - 24',
            25: '25 - 34',
            35: '35 - 44',
            45: '45 - 49',
            50: '50 - 55',
            56: '56+'}

    occupations_map = {0: 'Not specified',
                       1: 'Academic / Educator',
                       2: 'Artist',
                       3: 'Clerical / Admin',
                       4: 'College / Grad Student',
                       5: 'Customer Service',
                       6: 'Doctor / Health Care',
                       7: 'Executive / Managerial',
                       8: 'Farmer',
                       9: 'Homemaker',
                       10: 'K-12 student',
                       11: 'Lawyer',
                       12: 'Programmer',
                       13: 'Retired',
                       14: 'Sales / Marketing',
                       15: 'Scientist',
                       16: 'Self-Employed',
                       17: 'Technician / Engineer',
                       18: 'Tradesman / Craftsman',
                       19: 'Unemployed',
                       20: 'Writer'}

    gender_map = {'M': 'Male', 'F': 'Female'}

    users['age'] = users['age'].map(ages_map)
    users['occupation'] = users['occupation'].map(occupations_map)
    users['gender'] = users['gender'].map(gender_map)

    ratings_train['date'] = pd.to_datetime(ratings_train['date'], unit='s')
    ratings_test['date'] = pd.to_datetime(ratings_test['date'], unit='s')

    movies['release_year'] = movies['title'].str.extract(r'(?:\((\d{4})\))?\s*$', expand=False)

    return movies, users, ratings_train, ratings_test

def create_user_item_matrix(train_set):
    return train_set.pivot(index='user_id', columns='movie_id', values='rating').fillna(0)

def normalize_matrix(matrix):
    mean_user_rating = np.mean(matrix.values, axis=1).reshape(-1, 1)
    matrix_demeaned_values = matrix.values - mean_user_rating
    matrix = pd.DataFrame(data=matrix_demeaned_values, index=matrix.index, columns=matrix.columns)

    return matrix, mean_user_rating


def split_into_datasets(movies, users, ratings_train, ratings_test):
    train_set = pd.merge(ratings_train, movies, on='movie_id', how='left').merge(users, on='user_id', how='left')
    test_set = pd.merge(ratings_test, movies, on='movie_id', how='left').merge(users, on='user_id', how='left')

    return train_set, test_set

In [46]:
matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6036,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6038,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
