In [7]:
import pandas as pd
import numpy as np
import random
import math

In [5]:
# Branje ocen
class UserItemData:
    """User ratings of movies, loaded from a tab-separated file.

    The methods below read the columns ``userID``, ``movieID`` and ``rating``.
    Date filtering additionally needs ``date_day``, ``date_month`` and
    ``date_year``.
    """

    def __init__(self, path, from_date=None, to_date=None, min_ratings=0):
        """Load the ratings and optionally restrict them.

        Args:
            path: file handed to ``pandas.read_csv`` (tab separated, ISO-8859-1).
            from_date: earliest rating date to keep, written ``'day.month.year'``
                (inclusive); None means no lower bound.
            to_date: latest rating date to keep, same format (inclusive);
                None means no upper bound.
            min_ratings: movies with fewer ratings than this (counted after the
                date filter) are dropped.

        Raises:
            ValueError: if a date string is not of the form ``'day.month.year'``.
        """
        # `sep` is passed by keyword: pandas 2.0 no longer accepts it positionally.
        self.df = pd.read_csv(path, sep="\t", encoding='ISO-8859-1')

        # Each bound is optional and applied on its own, so giving only one of
        # them neither crashes nor gets silently ignored.
        if from_date is not None or to_date is not None:
            date_parts = self.df[['date_year', 'date_month', 'date_day']].rename(
                columns={'date_year': 'year', 'date_month': 'month', 'date_day': 'day'}
            )
            dates = pd.to_datetime(date_parts)
            keep = pd.Series(True, index=self.df.index)
            if from_date is not None:
                keep &= dates >= self._parse_date(from_date)
            if to_date is not None:
                keep &= dates <= self._parse_date(to_date)
            # The assembled timestamp stays available as the 'dates' column.
            self.df = self.df.assign(dates=dates)[keep]

        # Keep only movies that have at least `min_ratings` ratings.
        ratings_per_movie = self.df.groupby("movieID")["movieID"].transform("size")
        self.df = self.df[ratings_per_movie >= min_ratings]

    @staticmethod
    def _parse_date(text):
        """Turn a ``'day.month.year'`` string into a ``pandas.Timestamp``."""
        day, month, year = (int(part) for part in text.split('.'))
        return pd.Timestamp(year=year, month=month, day=day)

    def nratings(self):
        """Return the number of ratings (rows) currently held."""
        return len(self.df['rating'])

    def get_watched_movie_list(self, userId):
        """Return an array with the movieID of every rating made by `userId`."""
        return self.df.loc[self.df["userID"] == userId, 'movieID'].values

    def get_all_movies_id(self):
        """Return the movieID column as an array (one entry per rating, so ids repeat)."""
        return self.df['movieID'].values

    def get_sum_rating_movie(self, movieId):
        """Return the sum of all ratings given to `movieId` (0 if it has none)."""
        return self.df.loc[self.df['movieID'] == movieId, 'rating'].sum()

    def get_sum_rating_all_movies(self):
        """Return the sum of every rating in the table."""
        return self.df['rating'].sum()

    def get_number_rating_movie(self, movieId):
        """Return how many ratings `movieId` received."""
        return int((self.df['movieID'] == movieId).sum())

    def get_rating_of_movie(self, movieId):
        """Return the ratings given to `movieId` as a list."""
        return list(self.df.loc[self.df['movieID'] == movieId, 'rating'].values)

    def get_all_users(self):
        """Return the distinct user ids as a list (order is that of a set)."""
        return list(set(self.df['userID']))

    def get_number_users_rated_movies(self, movie_id1, movie_id2):
        """Return how many users rated both `movie_id1` and `movie_id2`.

        The previous version OR-ed the two movie masks and therefore counted
        the ratings of either movie instead of the users they have in common.
        """
        users = self.df['userID']
        movies = self.df['movieID']
        rated_first = set(users[movies == movie_id1])
        rated_second = set(users[movies == movie_id2])
        return len(rated_first & rated_second)

    def return_numpy_df(self):
        """Return the ratings as a 2-D array with columns movieID, userID, rating."""
        return self.df[['movieID', 'userID', 'rating']].to_numpy()

    def get_rating_movie(self, user_id, movie_id):
        """Return an array of the rating(s) `user_id` gave `movie_id` (empty if none)."""
        mask = (self.df['userID'] == user_id) & (self.df['movieID'] == movie_id)
        return self.df.loc[mask, 'rating'].values


In [14]:
#Branje ocen

In [11]:
# Load every rating in the file and print how many there are.
uim = UserItemData('data/user_ratedmovies.dat')
print(uim.nratings())

# Load again, this time with a date window (dates are written day.month.year)
# and a minimum of 100 ratings per movie, then print the reduced count.
# NOTE: `uim` is rebound here, so the unfiltered table above is discarded.
uim = UserItemData('data/user_ratedmovies.dat', from_date = '12.1.2007', to_date='16.2.2008', min_ratings=100)
print(uim.nratings())

855598
73657


In [16]:
class MovieData:
    """Movie metadata table loaded from a tab-separated file.

    Title lookups read the ``id`` and ``title`` columns.
    """

    def __init__(self, path):
        """Read the movie table at `path` into ``self.df``."""
        # The encoding is set explicitly; per the original author's note the
        # file could not be read with the default encoding on their machine.
        self.df = pd.read_csv(path, sep='\t', encoding='ISO-8859-1')

    def get_title(self, movieID):
        """Return the title of the first row whose ``id`` equals `movieID`.

        Raises:
            IndexError: if no row has that id. The exception type is the one
                the bare ``values[0]`` lookup used to raise; only the message
                is new, so the failing id is visible.
        """
        matches = self.df.loc[self.df['id'] == movieID, 'title'].values
        if len(matches) == 0:
            raise IndexError(f"no movie with id {movieID!r}")
        return matches[0]

In [17]:
# Branje filmov

In [19]:
# Load the movie table and print the title stored for movie id 1.
md = MovieData('data/movies.dat')
print(md.get_title(1))

Toy story


In [21]:
# Naključni prediktor

In [24]:
class RandomPredictor:
    """Baseline predictor that gives every known movie a random integer rating."""

    def __init__(self, min, max):
        # Inclusive bounds later handed to random.randint.
        self.min, self.max = min, max

    def fit(self, user_item_data):
        """Remember the ratings table; nothing is learned from it."""
        self.user_item_data = user_item_data

    def predict(self, user_id):
        """Return ``{movieID: rating}`` for every distinct movie in the fitted data.

        Movies are visited in ascending id order and each gets one
        ``random.randint(self.min, self.max)`` draw. `user_id` is not used.
        """
        ordered_ids = sorted(set(self.user_item_data.get_all_movies_id()))
        return {
            movie_id: random.randint(self.min, self.max)
            for movie_id in ordered_ids
        }


In [26]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rp.fit(uim)
# Seed the generator so the ratings printed below are the same on every run;
# without it this cell shows different numbers each time it is executed.
random.seed(42)
pred = rp.predict(78)
print(type(pred))
# Print the random rating drawn for a handful of movie ids.
items = [1, 3, 20, 50, 100]
for item in items:
    print("Film: {}, ocena: {}".format(md.get_title(item), pred[item]))

<class 'dict'>
Film: Toy story, ocena: 2
Film: Grumpy Old Men, ocena: 4
Film: Money Train, ocena: 3
Film: The Usual Suspects, ocena: 5
Film: City Hall, ocena: 1


In [28]:
class Recommender:
    """Ranks movies for a user with the ratings predicted by a pluggable predictor."""

    def __init__(self, predicator):
        # Any object offering fit(user_item_data) and predict(user_id) -> dict.
        self.predicator = predicator

    def fit(self, user_item_data):
        """Store the ratings table and fit the wrapped predictor on it."""
        self.user_item_data = user_item_data
        self.predicator.fit(self.user_item_data)

    def recommend(self, userID, n=10, rec_seen=True):
        """Return the `n` best ``(movieID, predicted rating)`` pairs for `userID`.

        Args:
            userID: user to recommend for.
            n: maximum number of pairs returned.
            rec_seen: when True, movies the user already rated may be
                recommended as well; when False they are left out.
                (Previously True returned *only* the already-rated movies.)

        Returns:
            A list sorted by predicted rating, highest first. Ties keep the
            order in which the predictor returned them.
        """
        predicted_grade = self.predicator.predict(userID)
        if rec_seen:
            candidates = list(predicted_grade.items())
        else:
            # A set makes the membership test constant time per movie.
            watched = set(self.user_item_data.get_watched_movie_list(userID))
            candidates = [
                (movie_id, grade)
                for movie_id, grade in predicted_grade.items()
                if movie_id not in watched
            ]
        candidates.sort(key=lambda pair: pair[1], reverse=True)
        return candidates[:n]

In [29]:
# Priporočanje

In [31]:
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
rp = RandomPredictor(1, 5)
rec = Recommender(rp)
rec.fit(uim)
# Seed the generator so the recommendation list below is reproducible;
# the predictor draws a fresh random rating for every movie on each call.
random.seed(42)
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Film: Grumpy Old Men, ocena: 5
Film: Father of the Bride Part II, ocena: 5
Film: Ninja Assassin, ocena: 5
Film: Othello, ocena: 5
Film: Wings of Courage, ocena: 5


In [38]:
class AveragePredictor:
    """Predicts each movie's rating as an average pulled towards the global mean.

    For a movie with rating sum ``vs`` and ``n`` ratings the prediction is
    ``(vs + b * g_avg) / (n + b)``, where ``g_avg`` is the mean of all ratings.
    """

    def __init__(self, b):
        # Weight of the global average in the formula above.
        self.b = b

    def fit(self, user_item_data):
        """Compute the global average and the per-movie rating sum and count.

        The per-movie statistics are gathered in one grouped pass so that
        `predict` does not rescan the whole table for every movie.
        """
        self.user_item_data = user_item_data
        total = self.user_item_data.nratings()
        # With no ratings at all there is no average; 0.0 avoids a division by zero.
        self.g_avg = self.user_item_data.get_sum_rating_all_movies() / total if total else 0.0

        stats = self.user_item_data.df.groupby('movieID')['rating'].agg(['sum', 'size'])
        self._sum_by_movie = stats['sum'].to_dict()
        self._count_by_movie = stats['size'].to_dict()

    def calculate(self, movie_id):
        """Return the prediction for `movie_id`.

        A movie without ratings contributes ``vs = 0`` and ``n = 0``. If
        ``n + b`` is zero the global average is returned instead of dividing
        by zero.
        """
        vs = self._sum_by_movie.get(movie_id, 0)
        n = self._count_by_movie.get(movie_id, 0)
        denominator = n + self.b
        if denominator == 0:
            return self.g_avg
        return (vs + self.b * self.g_avg) / denominator

    def predict(self, user_id):
        """Return ``{movieID: prediction}`` for every rated movie, ascending by id.

        `user_id` is not used: the prediction is the same for every user.
        """
        return {
            movie_id: self.calculate(movie_id)
            for movie_id in sorted(self._count_by_movie)
        }

In [35]:
### Napovedovanje s povprečjem

In [40]:
# Recommend with the average-based predictor; b=100 is the weight given to
# the global average in AveragePredictor's formula.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
av = AveragePredictor(b=100)
rec = Recommender(av)
rec.fit(uim)
# Five recommendations for user 78, requested with rec_seen=False.
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))  

Film: The Usual Suspects, ocena: 4.225944245560473
Film: The Godfather: Part II, ocena: 4.146907937910189
Film: Cidade de Deus, ocena: 4.116538340205236
Film: The Dark Knight, ocena: 4.10413904093503
Film: 12 Angry Men, ocena: 4.103639627096175


In [45]:
class ViewsPredicator:
    """Scores every movie by the number of ratings it received."""

    def __init__(self):
        pass

    def fit(self, user_item_data):
        """Remember the ratings table; the counting happens in `predict`."""
        self.user_item_data = user_item_data

    def predict(self, user_id):
        """Return ``{movieID: number of ratings}``, ascending by movie id.

        `user_id` is not used. All movies are counted in a single pass over
        the id column instead of filtering the whole table once per movie.
        """
        movie_ids, counts = np.unique(
            self.user_item_data.get_all_movies_id(), return_counts=True
        )
        # tolist() yields plain Python numbers for the keys and counts.
        return dict(zip(movie_ids.tolist(), counts.tolist()))

In [46]:
# Priporočanje najbolj gledanih filmov

In [51]:
# Recommend the most-rated movies: ViewsPredicator scores each movie by how
# many ratings it has.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat')
vp = ViewsPredicator()
rec = Recommender(vp)
rec.fit(uim)
# Five recommendations for user 78, requested with rec_seen=False.
rec_items = rec.recommend(78, n=5, rec_seen=False)
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val)) 

Film: The Lord of the Rings: The Fellowship of the Ring, ocena: 1576
Film: The Lord of the Rings: The Two Towers, ocena: 1528
Film: The Lord of the Rings: The Return of the King, ocena: 1457
Film: The Silence of the Lambs, ocena: 1431
Film: Shrek, ocena: 1404


In [63]:
class ItemBasedPredictor:
    """Item-to-item similarity based on users' mean-centred ratings.

    The similarity of two movies is the cosine between their rating vectors,
    restricted to users who rated both, after each user's average rating has
    been subtracted.
    """

    def __init__(self, min_values=0, threshold=0):
        # Fewer common raters than `min_values` -> similarity is reported as 0.
        self.min_values = min_values
        # Similarities below `threshold` are reported as 0.
        self.threshold = threshold

    def fit(self, user_item_data):
        """Store the data and precompute per-user averages and per-movie ratings.

        One pass over the rating rows replaces a full-table filter per user.
        """
        self.user_item_data = user_item_data
        self.all_users = self.user_item_data.get_all_users()
        # Rows are [movieID, userID, rating].
        self.df_numpy = self.user_item_data.return_numpy_df()

        rating_sum = dict()
        rating_count = dict()
        # movie -> {user: rating}; the first rating wins if a pair repeats.
        self._ratings_by_movie = dict()
        for movie, user, rating in self.df_numpy.tolist():
            rating_sum[user] = rating_sum.get(user, 0) + rating
            rating_count[user] = rating_count.get(user, 0) + 1
            self._ratings_by_movie.setdefault(movie, dict()).setdefault(user, rating)

        # Keyed in the order of `all_users`; `similarity` iterates in that order.
        self.user_avg = {
            user: rating_sum[user] / rating_count[user]
            for user in self.all_users
            if user in rating_count
        }

    def calculate_all_sim(self):
        """Return ``{(movieId1, movieId2): similarity}`` for all ordered pairs of distinct movies.

        The measure is symmetric, so each unordered pair is computed once and
        the value is reused for the mirrored key.
        """
        movie_sim = dict()
        all_movies = np.unique(self.df_numpy[:, 0])
        for movieId1 in all_movies:
            for movieId2 in all_movies:
                if movieId1 == movieId2:
                    continue
                if movieId2 < movieId1:
                    # np.unique is sorted, so the mirrored pair was filled in earlier.
                    movie_sim[(movieId1, movieId2)] = movie_sim[(movieId2, movieId1)]
                else:
                    movie_sim[(movieId1, movieId2)] = self.similarity(movieId1, movieId2)
        return movie_sim

    def calcualte_all_sim_with_numpy(self):
        """Return an array of ``[movieId1, movieId2, similarity]`` rows with similarity > 0.

        The result always has shape ``(k, 3)``, also when no pair qualifies,
        so column indexing such as ``result[:, 0]`` keeps working.
        (The misspelled method name is kept for existing callers.)
        """
        rows = []
        positive = dict()
        all_movies = np.unique(self.df_numpy[:, 0])
        for movieId1 in all_movies:
            for movieId2 in all_movies:
                if movieId1 == movieId2:
                    continue
                if movieId2 < movieId1:
                    # Mirrored pair was evaluated earlier; absent means it was not positive.
                    sim = positive.get((movieId2, movieId1))
                    if sim is None:
                        continue
                else:
                    sim = self.similarity(movieId1, movieId2)
                    if not sim > 0:
                        continue
                    positive[(movieId1, movieId2)] = sim
                rows.append([movieId1, movieId2, sim])
        return np.array(rows, dtype=float).reshape(-1, 3)

    # Correctly spelled alias of the method above.
    calculate_all_sim_with_numpy = calcualte_all_sim_with_numpy

    def predict(self, user_id):
        """Not implemented yet: does nothing and returns None.

        Author's note on the intended formula: for every similarity > 0 take
        the rating the user gave that movie, then sum(sim * rating) / sum(sim).
        Because this returns None, the predictor cannot yet be used through
        ``Recommender.recommend``.
        """
        pass

    def similarity(self, p1, p2):
        """Return the similarity of movies `p1` and `p2`.

        Returns 0.0 when fewer than ``min_values`` users rated both movies,
        when the similarity is undefined (no common raters, or every common
        rater's deviation from their average is zero for one of the movies),
        or when the value is below ``threshold``.
        """
        ratings1 = self._ratings_by_movie.get(p1, {})
        ratings2 = self._ratings_by_movie.get(p2, {})
        common_users = [
            user for user in self.user_avg
            if user in ratings1 and user in ratings2
        ]
        # The requirement is about users who rated BOTH movies.
        if len(common_users) < self.min_values:
            return 0.0

        numerator = 0
        movie1_sq = 0
        movie2_sq = 0
        for user in common_users:
            avg_from_user = self.user_avg[user]
            dev1 = ratings1[user] - avg_from_user
            dev2 = ratings2[user] - avg_from_user
            numerator += dev1 * dev2
            movie1_sq += dev1 ** 2
            movie2_sq += dev2 ** 2

        denominator = math.sqrt(movie1_sq) * math.sqrt(movie2_sq)
        # A zero denominator used to raise ZeroDivisionError or yield NaN.
        self.similarity_result = numerator / denominator if denominator else 0.0
        if self.similarity_result < self.threshold:
            return 0.0
        return self.similarity_result

    def similarItems(self, item, n):
        """Return the `n` movies most similar to `item` as ``(movieId, similarity)`` pairs."""
        most_similar = [
            (movieId, self.similarity(item, movieId))
            for movieId in np.unique(self.df_numpy[:, 0])
            if movieId != item
        ]
        most_similar.sort(key=lambda pair: pair[1], reverse=True)
        return most_similar[:n]

In [65]:
# Keep only movies with at least 1000 ratings, which keeps the pairwise
# similarity computations below small.
md = MovieData('data/movies.dat')
uim = UserItemData('data/user_ratedmovies.dat', min_ratings=1000)
rp = ItemBasedPredictor()
rec = Recommender(rp)
# Recommender.fit forwards the data to rp.fit, so `rp` is fitted after this line.
rec.fit(uim)
# Similarity of 'Men in black' to three other movies (ids given in the messages).
print("Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716): ", rp.similarity(1580, 2716))
print("Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527): ", rp.similarity(1580, 527))
print("Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780): ", rp.similarity(1580, 780))

Podobnost med filmoma 'Men in black'(1580) in 'Ghostbusters'(2716):  0.2339552317675662
Podobnost med filmoma 'Men in black'(1580) in 'Schindler's List'(527):  0.0
Podobnost med filmoma 'Men in black'(1580) in 'Independence day'(780):  0.42466125844687624


In [59]:
movies = np.array(list(set(uim.get_all_movies_id())))

# Look each title up once; the earlier version scanned the movie table for
# every ordered pair (twice), although only the pair ids were needed.
titles = {movie: md.get_title(movie) for movie in movies}

# Every ordered pair of distinct movies, ascending by (movie1, movie2).
# Despite the name this is the full candidate list, not a top 20.
top_20_most_similar_movies_sorted = sorted(
    (movie1, movie2) for movie1 in movies for movie2 in movies if movie1 != movie2
)

# The similarity is symmetric, so it is computed once per unordered pair and
# reused for the mirrored pair. Both orderings are still listed in the output.
pair_similarity = dict()
similarty_all_movies = list()
for movieId1, movieId2 in top_20_most_similar_movies_sorted:
    pair = (movieId1, movieId2) if movieId1 < movieId2 else (movieId2, movieId1)
    if pair not in pair_similarity:
        pair_similarity[pair] = rp.similarity(pair[0], pair[1])
    name = f"Film1: {titles[movieId1]}, Film2: {titles[movieId2]}, podobnost:"
    similarty_all_movies.append((name, pair_similarity[pair]))

# Stable sort: pairs with equal similarity keep their ascending id order.
similarty_all_movies.sort(key=lambda x : x[1], reverse=True)

for name, sim in similarty_all_movies[:20]:
    print(f'{name} {sim}')

Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8439842148481421
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8439842148481421
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Two Towers, podobnost: 0.8231885401761893
Film1: The Lord of the Rings: The Two Towers, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8231885401761893
Film1: The Lord of the Rings: The Fellowship of the Ring, Film2: The Lord of the Rings: The Return of the King, podobnost: 0.8079374897442505
Film1: The Lord of the Rings: The Return of the King, Film2: The Lord of the Rings: The Fellowship of the Ring, podobnost: 0.8079374897442505
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381034
Film1: Kill Bill: Vol. 2, Film2: Kill Bill: Vol. 2, podobnost: 0.7372340224381034
Film1: Star Wars, Film2:

In [61]:
# Ten movies most similar to movie id 4993 (the title printed in the header line).
rec_items = rp.similarItems(4993, 10)
print('Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": ')
# Each entry is a (movie id, similarity) pair.
for idmovie, val in rec_items:
    print("Film: {}, ocena: {}".format(md.get_title(idmovie), val))

Filmi podobni "The Lord of the Rings: The Fellowship of the Ring": 
Film: The Lord of the Rings: The Two Towers, ocena: 0.8231885401761893
Film: The Lord of the Rings: The Return of the King, ocena: 0.8079374897442505
Film: Star Wars: Episode V - The Empire Strikes Back, ocena: 0.23961943073496456
Film: Star Wars, ocena: 0.21965586527074096
Film: The Matrix, ocena: 0.21515552706880264
Film: Raiders of the Lost Ark, ocena: 0.19944276706345018
Film: The Usual Suspects, ocena: 0.18321188451910747
Film: Blade Runner, ocena: 0.16399681315410283
Film: Schindler's List, ocena: 0.16105905138148705
Film: Monty Python and the Holy Grail, ocena: 0.15780453798519112


In [6]:
# Build the table of positive similarities; rows are [movieId1, movieId2, similarity].
# NOTE(review): this cell needs `rp` from the cell that fits ItemBasedPredictor;
# the recorded NameError comes from running it in a fresh kernel before that cell.
all_sim = rp.calcualte_all_sim_with_numpy()
print(all_sim[0])

NameError: name 'rp' is not defined

In [78]:
# Similarity value (column 2) of the second row of the table.
all_sim[1,2]

0.0825715477438452

In [81]:
# Split the similarity table into its three columns.
movieIds1 = all_sim[:, 0]
movieIds2 = all_sim[:, 1] 
# NOTE(review): despite the name, `sums` holds the similarity column, not sums.
sums = all_sim[:, 2]

In [None]:
def predict(user, movieId):
    """Predict the rating `user` would give `movieId` from item similarities.

    Reads two notebook globals: `all_sim`, whose rows are
    [movieId1, movieId2, similarity], and `uim`, the ratings table.

    The prediction is the similarity-weighted average of the ratings the user
    gave to the movies similar to `movieId`:
    sum(sim * rating) / sum(sim), taken over the similar movies the user rated.

    Returns:
        The predicted rating, or None when the user rated none of the similar
        movies (the average is undefined in that case).
    """
    neighbours = all_sim[all_sim[:, 0] == movieId]
    weighted_sum = 0.0
    similarity_sum = 0.0
    for other_movie, sim in neighbours[:, 1:3]:
        rating = uim.get_rating_movie(user, other_movie)
        if rating.size > 0:
            weighted_sum += sim * rating[0]
            similarity_sum += sim
    if similarity_sum == 0:
        return None
    return weighted_sum / similarity_sum

In [4]:
# Rows of the similarity table whose first movie is id 47.
# NOTE(review): the name suggests a prediction for user 78, but the value is
# the list of movies similar to movie 47. Needs `all_sim` from an earlier cell.
pred_for_user_78 = all_sim[np.where(all_sim[:, 0] == 47)]

NameError: name 'all_sim' is not defined

In [3]:
# Display the selected similarity rows.
pred_for_user_78

NameError: name 'pred_for_user_78' is not defined

In [2]:

# Prototype prediction for user 78 and the movie whose similarity rows are in
# `pred_for_user_78`: a similarity-weighted average of the user's ratings.
formula_first_line = 0
sum_sim = 0
for movieid2, sim in pred_for_user_78[:, 1:3]:
    rating = uim.get_rating_movie(78, movieid2)
    if rating.size > 0:
        formula_first_line += (sim*rating[0])
        # Only movies the user actually rated contribute to the denominator.
        # Summing every similarity would pull the average towards zero.
        sum_sim += sim

# Undefined (None) when the user rated none of the similar movies.
pred = formula_first_line/sum_sim if sum_sim else None
pred

NameError: name 'pred_for_user_78' is not defined