# 🤖 **MOVIE LENS 10M - DATA GENERATOR**

Generate feature vectors for genuine or attack profiles!

In [1]:
import pandas as pd
import numpy as np
import json
from copy import deepcopy

In [2]:
class ML10M_FVG:

    def load_ratings(self):
        """
        Reads the ratings file of the dataset.

        Returns
        -------
        pd.DataFrame
            One row per rating, with the columns
            userID, movieID, rating and timestamp.
        """

        column_names = ['userID', 'movieID', 'rating', 'timestamp']

        # The file carries no header row and separates fields with '::'.
        ratings = pd.read_csv(
            './dataset/ratings.dat',
            names=column_names,
            header=None,
            sep='::',
            engine='python')

        return ratings
            
        
    def load_movies(self):
        """
        Reads the movies file of the dataset.

        Returns
        -------
        pd.DataFrame
            One row per movie, with the columns
            movieID, Title and Genre.
        """

        column_names = ['movieID', 'Title', 'Genre']

        # The file carries no header row and separates fields with '::'.
        movies = pd.read_csv(
            './dataset/movies.dat',
            names=column_names,
            header=None,
            sep='::',
            engine='python')

        return movies


    def get_movies_global_statistics(self, items):
        """
        Parameters
        -------
        List containing the desired items. If none, the
        whole database is used.

        Returns
        -------
        Tuple
            Tuple containing the mean of the mean rating 
            of all movies and its standard desviation.
        """

        data = self.ratings
        means = []

        for item in items:
            ratings_movie = data.loc[data["movieID"] == item].values
            if len(ratings_movie) > 0:
                means.append(np.mean(ratings_movie[:, 2]))

        return np.mean(means), np.std(means)


    def __init__(self):

        self.ratings = self.load_ratings()
        self.movies = self.load_movies()
        
        self.movies_ids = self.movies.values[:, 0]
        self.users_ids = set(self.ratings['userID'])

        self.n_movies = len(self.movies_ids)
        self.n_users = len(self.users_ids)
        self.n_ratings = self.ratings.shape[0]

        self.windows = None
        self.n_windows = 0

        self.rd = np.random.RandomState(5)
        self.range_ratings = (0, 5)
        self.range_mean = ((self.range_ratings[1] - self.range_ratings[0]) / 2)

        self.most_popular_items = None
        self.most_popular_items_data = None
        self.n_most_popular_items = 0

        self.global_mean, self.global_std = self.get_movies_global_statistics(self.movies_ids)


    def dump_vectors_json(self, vectors, file):

        with open(file, 'w') as f:
            json_dumps_str = json.dumps(vectors, indent=2)
            print(json_dumps_str, file=f)
            f.close()


    def dump_vectors_csv(self, vectors, tag, file):
        """
        Dumps the vectors to a csv and tags them.
        """

        values = list(vectors.values())

        for value in values:
            value.append(tag) 

        np.savetxt(file, np.array(values), fmt = '%.10f', delimiter=",")


    def get_most_popular_items(self, number=50):
        """
        Returns
        -------
        np.array
            Array the most popular (voted) ratings and
            its mean rating.
            (film, number of ratings, mean of the ratings)
        """

        if self.most_popular_items is not None and self.n_most_popular_items == number:
            pass

        data = self.ratings
        movies_info = []

        for film in self.movies_ids:
            ratings_movie = data.loc[data["movieID"] == film].values

            if len(ratings_movie) > 0:
                movies_info.append( (film, len(ratings_movie), np.mean(ratings_movie[:, 2])) )
            
        ordered = sorted(movies_info, key = lambda x:x[1], reverse=True)
        
        self.most_popular_items_data = ordered[:number]
        self.most_popular_items = [i[0] for i in self.most_popular_items_data]


    def get_movies_particular_statistics(self, films_ids):
        """
        Returns individual statistics for each
        film in the array.

        Parameters
        -------
        films_ids: list with a bunch of ids.

        Returns
        -------
        Tuple
            Tuple containing the means and
            the desviations (in order) for the iems.
        """

        data = self.ratings
        means = []
        stds = []

        for film in films_ids:
            ratings_movie = data.loc[data["movieID"] == film].values

            if len(ratings_movie) > 0:
                means.append(np.mean(ratings_movie[:, 2]))
                stds.append(np.std(ratings_movie[:, 2]))

            else:
                means.append(None)
                stds.append(None)

        return means, stds


    def get_windows(self, n_windows):
        """
        Returns the items partitions made (by index).
        First item is included, while second one is not.

        Returns:
        -------
            List of tuples, where the tuples are 
            (row_first_item_window_INCLUDED, row:last_item_window_EXCLUDED)
        """

        if self.n_windows == n_windows and self.windows is not None:
            pass

        Q, q = divmod(self.n_movies, n_windows)
        index = 0
        windows = []

        for j in range(n_windows):

            if j < q: # No = here since range starts in 0
                n_items_window = Q + 1

            else:
                n_items_window = Q

            windows.append((index, index + n_items_window))
            index = index + n_items_window 

        self.windows = windows
        self.n_windows = n_windows
        

    def obtain_array_user(self, ratings_user, J):
        """
        Returns the feature vector of a user given
        its ratings and desired number of windows.

        Returns:
        -------
            np.array
                feature vector
        """
        
        self.get_windows(J) 

        n_ratings_user = len(ratings_user)
        user_vector = []

        for window in self.windows:
            movies_window = self.movies_ids[window[0] : window[1]]
            ratings_user_window = [rating for rating in ratings_user if int(rating[1]) in movies_window]
            n_ratings_user_window = len(ratings_user_window)

            user_vector.append(n_ratings_user_window) #NRW
            user_vector.append(n_ratings_user_window/n_ratings_user) #NRWR

        return user_vector


    def discard_elements(self, all_elements, excluded):

        all_elements = set(all_elements)
        
        if excluded is not None:
            for item in excluded:
                all_elements.discard(item)

        return list(all_elements)


    def obtain_genuine_vectors(self, number, J=40, filename='genuine', excluded=None):
        """
        Returns the feature vectors of random users in 
        the database given the desired number of windows
        and users.

        Returns:
        -------
            np.array
                array of features vectors
        """

        data = self.ratings
        available_users_ids = self.discard_elements(deepcopy(self.users_ids), excluded)

        users_ids = self.rd.choice(available_users_ids, replace = False, size=number)

        vectors = { **dict.fromkeys([int(i) for i in users_ids], [])} 

        for user_id in users_ids:
            ratings_user = data.loc[data["userID"] == user_id].values
            vectors[user_id] = self.obtain_array_user(ratings_user, 40)

        self.dump_vectors_json(vectors, filename + '.json')
        self.dump_vectors_csv(vectors, 0, filename + '.csv')

        return users_ids


    def get_filler_items(self, number, excluded):
        """
        Returns a random array of items ids.

        Returns:
        -------
            np.array
                vector of random items ids.
        """

        available_moviles = self.discard_elements(deepcopy(self.movies_ids), excluded)
        return self.rd.choice(available_moviles, replace = False, size=number)


    def rating_correction(self, rating):
        """
        Guarantees rating is inside a correct 
        interval.

        Returns:
        -------
            rating
                the rating if it was correct, max or
                min if not.
        """

        if rating > self.range_ratings[1]:
            return self.range_ratings[1]

        elif rating < self.range_ratings[0]:
            return self.range_ratings[0]

        return rating


    def generate_random_ratings(self, number_users, filler_number, target_items, rating_target):
        """
        For each user, generates a list of ratings applying
        random model.

        Returns:
        -------
            [ [[]], ]
        """

        ratings = []

        for i in range(number_users):

            id_usuario = -1 * i
            filler_items = self.get_filler_items(filler_number, target_items)
            filler_items_ratings = self.rd.normal(loc=self.global_mean, scale=self.global_std, size=len(filler_items))

            ratings_user = [ ]

            for item, rating in zip(filler_items, filler_items_ratings):
                ratings_user.append([id_usuario, item, self.rating_correction(rating)])

            for item in target_items:
                ratings_user.append([id_usuario, item, rating_target])

            ratings.append(ratings_user)

        return(ratings)


    # WIP marker (left off here): self.rating_correction(rating)
    def generate_average_ratings(self, number_users, filler_number, target_items, rating_target):
        """
        For each user, generates a list of ratings applying
        average model.

        Returns:
        -------
            [ [[]], ]
        """

        ratings = []

        for i in range(number_users):

            id_usuario = -1 * i
            filler_items = self.get_filler_items(filler_number, target_items)
            means, distributions = self.get_movies_particular_statistics(filler_items)

            ratings_user = [ ]

            for i in range(filler_number):

                if means[i] is None:
                    rating = self.rd.normal(loc=self.range_mean, scale=0, size=1) 

                else:
                    rating = self.rd.normal(loc=means[i], scale=distributions[i], size=1)

                ratings_user.append([id_usuario, filler_items[i], self.rating_correction(rating)])

            for item in target_items:
                ratings_user.append([id_usuario, item, rating_target])

            ratings.append(ratings_user)

        return(ratings)


    def generate_bandwagon_ratings(self, number_users, filler_number, popular_number, target_items, rating_target):
        """
        For each user, generates a list of ratings applying
        bandwagon model.

        Used average bandwagon attack as in the paper

        Returns:
        -------
            [ [[]], ]
        """

        ratings = []

        self.get_most_popular_items(popular_number)

        for i in range(number_users):
            id_usuario = -1 * i
            filler_items = self.get_filler_items(filler_number, target_items + self.most_popular_items)
            filler_items_ratings = self.rd.normal(loc=self.global_mean, scale=self.global_std, size=filler_number)

            ratings_user = [ ]

            for item, rating in zip(filler_items, filler_items_ratings):
                ratings_user.append([id_usuario, item, self.rating_correction(rating)])

            for item in target_items:
                ratings_user.append([id_usuario, item, rating_target])

            for item in self.most_popular_items_data:

                if item[2] < self.range_mean:
                    ratings_user.append([id_usuario, item[0], self.range_ratings[0]])

                else:
                    ratings_user.append([id_usuario, item[0], self.range_ratings[1]])

            ratings.append(ratings_user)

        return(ratings)

    def generate_attack_vectors_ratings(self, ratings):
        """
        Given the ratings of certain users, returns the
        feature vectors.

        Returns:
        -------
            np.array
                array of features vectors
        """

        users_ids = [i[0][0] for i in ratings]
        vectors = { **dict.fromkeys([i for i in users_ids], [])} 

        for ratings_user in ratings:
            vectors[ratings_user[0][0]] = self.obtain_array_user(ratings_user, 40)

        return vectors


    def generate_attack_vectors(self, number, filler_number, target_items, popular_number=50, type='Random', push=True, filename='vectores'):
        """
        Returns the feature vectors of a certain number
        of attackers given the target items, type of attack,
        punctuation rate and push/nuke.

        Returns:
        -------
            Array of vectors
        """

        if (push):
            rating_target = self.range_ratings[1]
        else:
            rating_target = self.range_ratings[0]


        if type == 'Random':
            ratings = self.generate_random_ratings(number, filler_number, target_items, rating_target)

        elif type == 'Average':
            ratings = self.generate_average_ratings(number, filler_number, target_items, rating_target)
            
        elif type == 'Bandwagon':
            ratings = self.generate_bandwagon_ratings(number, filler_number, popular_number, target_items, rating_target)

        vectors = self.generate_attack_vectors_ratings(ratings)
        
        #dump_vectors_json(vectors, filename + '.json')
        self.dump_vectors_csv(vectors, 1, filename + '.csv')

        return vectors

In [5]:
# generator = ML10M_FVG()
# rd = np.random.RandomState(5)
# total = generator.n_movies
# used_users = np.array([], dtype=int)
# generator.get_most_popular_items()
# generator.get_windows(40)

# for i in [0.01, 0.03, 0.05, 0.1]:
#     target = rd.choice(generator.movies_ids, replace = False, size=1)
#     generator.generate_attack_vectors(10, int(i*total-1), target, type='Random', filename="train-random-{}".format(i))
#     generator.generate_attack_vectors(10, int(i*total-1), target, type='Average', filename="train-average-{}".format(i))
#     generator.generate_attack_vectors(10, int(i*total-1-50), target, type='Bandwagon', filename="train-bandwagon-{}".format(i))

# iteration_used_users = generator.obtain_genuine_vectors(1000, filename='train-genuine')
# used_users = np.concatenate((used_users, iteration_used_users))

# #Generate test sets
# for p in range(1, 11):
#     iteration_used_users = generator.obtain_genuine_vectors(1000, filename='test-{}-genuine'.format(p), excluded=used_users)
#     used_users = np.concatenate((used_users, iteration_used_users))

#     for i in [0.01, 0.03, 0.05, 0.1]:
#         for k in [10, 20, 50, 100]:
#             target = rd.choice(generator.movies_ids, replace = False, size=1)
#             generator.generate_attack_vectors(k, int(i*total-1), target, type='Random', filename="test-{}-random-{}-{}".format(p, i, k))
#             generator.generate_attack_vectors(k, int(i*total-1), target, type='Average', filename="test-{}-average-{}-{}".format(p, i, k))
#             generator.generate_attack_vectors(k, int(i*total-1-50), target, type='Bandwagon', filename="test-{}-bandwagon-{}-{}".format(p, i, k))


# Number of ratings of every user, as an absolute count and as a
# percentage of `total`, listed from the most to the least active user.
n_r = []
data = generator.ratings

for user_id in generator.users_ids:
    n_ratings_user = len(data.loc[data["userID"] == user_id].values)
    n_r.append((n_ratings_user, n_ratings_user / total * 100, user_id))

# Stable sort on the count only: users with equal counts keep
# the order in which they were visited.
ordered = sorted(n_r, key = lambda entry: entry[0], reverse=True)

for count, percentage, user in ordered:
    print("{} - {} - {}".format(count, percentage, user))

7359 - 68.89804325437693 - 59269
7047 - 65.97696844864713 - 67385
5169 - 48.39434509877352 - 14463
4483 - 41.971725493867616 - 68259
4449 - 41.65340323939706 - 27468
4165 - 38.99447617264301 - 3817
4165 - 38.99447617264301 - 19635
3755 - 35.15588428049808 - 63134
3697 - 34.612863964048316 - 58357
3479 - 32.5718565677371 - 27584
3414 - 31.96329931654339 - 6757
3225 - 30.19380207845707 - 56707
3202 - 29.978466435726993 - 19379
3187 - 29.83803014698998 - 7795
3164 - 29.6226945042599 - 8811
3027 - 28.340043067128544 - 30723
2909 - 27.23527759573074 - 30687
2886 - 27.019941953000654 - 31327
2827 - 26.46755921730175 - 30500
2812 - 26.32712292856474 - 47046
2806 - 26.270948413069938 - 42791
2801 - 26.224136316824264 - 47345
2766 - 25.89645164310458 - 62332
2753 - 25.774740192865835 - 14134
2697 - 25.250444714914334 - 57126
2644 - 24.754236494710234 - 30158
2634 - 24.660612302218894 - 51033
2608 - 24.41718940174141 - 59659
2601 - 24.35165246699747 - 59598
2555 - 23.92098118153731 - 38928
2533 