# 🤖 **MOVIE LENS 10M - DATA GENERATOR**

Generate feature vectors for genuine or attack profiles!

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
def dump_vectors_json(vectors, file):
    """
    Dumps the vectors to a JSON file (indented by 2 spaces).

    Parameters
    -------
    vectors: JSON-serialisable object holding the feature vectors.
    file: path of the output file; it is overwritten if it exists.
    """

    # The context manager closes the file; no explicit close() is needed.
    with open(file, 'w') as f:
        json.dump(vectors, f, indent=2)
        f.write('\n')  # Trailing newline, as print() used to emit


def dump_vectors_csv(vectors, tag, file):
    """
    Dumps the vectors to a csv and tags them.

    Every row of the file is one feature vector followed by the tag
    as its last column, written with 10 decimals.

    Parameters
    -------
    vectors: dict whose values are the feature vectors.
    tag: numeric label appended to every row.
    file: path of the output csv.
    """

    # Build fresh rows instead of appending to the caller's lists, so the
    # vectors are not modified and can be dumped again without growing.
    rows = [list(vector) + [tag] for vector in vectors.values()]

    np.savetxt(file, np.array(rows), fmt='%.10f', delimiter=",")


def load_ratings():
    """
    Returns
    -------
    pd.DataFrame
        Contents of ./dataset/ratings.dat, one row per rating, with
        the columns userID, movieID, rating and timestamp.
    """

    column_names = ['userID', 'movieID', 'rating', 'timestamp']

    # The file has no header row and separates its fields with '::'.
    ratings = pd.read_csv(
        './dataset/ratings.dat',
        names=column_names,
        header=None,
        sep='::',
        engine='python')

    return ratings
        
      
def load_movies():
    """
    Returns
    -------
    pd.DataFrame
        Contents of ./dataset/movies.dat, one row per movie, with
        the columns movieID, Title and Genre.
    """

    column_names = ['movieID', 'Title', 'Genre']

    # The file has no header row and separates its fields with '::'.
    movies = pd.read_csv(
        './dataset/movies.dat',
        names=column_names,
        header=None,
        sep='::',
        engine='python')

    return movies


def get_movies_id_list():
    """
    Reads ./dataset/movies.dat and extracts the movie ids.

    Returns
    -------
    np.array
        Ids of all the movies listed in the file, in file order.
    """

    column_names = ['movieID', 'Title', 'Genre']

    catalogue = pd.read_csv(
        './dataset/movies.dat',
        names=column_names,
        header=None,
        sep='::',
        engine='python')

    # The ids are the first column of the raw table.
    raw_table = catalogue.values
    return raw_table[:, 0]


def get_most_popular_items(k=0.005):
    """
    Returns the movies with the largest number of ratings.

    Reads the module-level `data` (ratings DataFrame) and `movies_ids`
    (array of movie ids); both must be defined before calling.

    Parameters
    -------
    k: fraction of the movies to keep; int(k * len(movies_ids))
       movies are returned.

    Returns
    -------
    list
        Tuples (film, number of ratings, mean of the ratings) ordered
        by decreasing number of ratings. Movies without ratings are
        reported with 0 ratings and a mean of 2.5.
    """

    n_selected = int(k * len(movies_ids))

    # Aggregate the ratings once per movie instead of scanning the whole
    # ratings table for every single movie.
    ratings_by_movie = data.groupby("movieID")["rating"]
    counts = ratings_by_movie.size().to_dict()
    means = ratings_by_movie.mean().to_dict()

    number_ratings_movie = [
        (film, counts[film], means[film]) if film in counts
        else (film, 0, 2.5)  # No ratings, no use
        for film in movies_ids
    ]

    # Ordered by the number of votes; the sort is stable, so ties keep
    # the order of movies_ids.
    ordered = sorted(number_ratings_movie, key=lambda x: x[1], reverse=True)
    return ordered[:n_selected]


def get_movies_global_statistics(items=None):
    """
    Computes statistics over the per-movie mean ratings.

    Reads the module-level `data` (ratings DataFrame).

    Parameters
    -------
    items: list containing the desired items. If None, the
        whole database is used (ids read with get_movies_id_list()).

    Returns
    -------
    Tuple
        Tuple containing the mean of the mean rating
        of all movies and its standard deviation. Movies without
        ratings are ignored; if no requested movie has ratings both
        values are nan.
    """

    # Resolve the default lazily: a call in the signature would read the
    # movies file once, at definition time, and freeze that result.
    if items is None:
        items = get_movies_id_list()

    # One aggregation for all movies instead of one scan per movie.
    mean_by_movie = data.groupby("movieID")["rating"].mean().to_dict()
    means = [mean_by_movie[item] for item in items if item in mean_by_movie]

    return np.mean(means), np.std(means)


def get_movies_particular_statistics(films_ids):
    """
    Returns individual statistics for each
    film in the array.

    Reads the module-level `data` (ratings DataFrame).

    Parameters
    -------
    films_ids: list with a bunch of ids.

    Returns
    -------
    Tuple
        Tuple containing the means and the deviations (in the order
        of films_ids) for the items. Films without ratings get None
        in both lists.
    """

    # Keep only the ratings of the requested films and aggregate them in
    # one pass, instead of scanning the whole table once per film.
    wanted = data.loc[data["movieID"].isin(films_ids), ["movieID", "rating"]]
    ratings_by_movie = wanted.groupby("movieID")["rating"]

    mean_by_movie = ratings_by_movie.mean().to_dict()
    # ddof=0: population deviation, the same definition np.std uses.
    std_by_movie = ratings_by_movie.std(ddof=0).to_dict()

    means = [mean_by_movie.get(film) for film in films_ids]
    stds = [std_by_movie.get(film) for film in films_ids]

    return means, stds


def get_windows(n_items, n_windows=40):
    """
    Splits n_items consecutive positions into n_windows contiguous
    partitions whose sizes differ by at most one (the larger ones
    come first).

    Returns:
    -------
        List of tuples (first_index_INCLUDED, last_index_EXCLUDED),
        one per window.
    """

    base_size, n_larger = divmod(n_items, n_windows)

    partitions = []
    start = 0

    for position in range(n_windows):
        # The first n_larger windows take one of the leftover items each.
        size = base_size + 1 if position < n_larger else base_size
        stop = start + size

        partitions.append((start, stop))
        start = stop

    return partitions
    

def obtain_array_user(ratings_user, J=40):
    """
    Returns the feature vector of a user given
    its ratings and desired number of windows.

    Reads the module-level `movies_ids`; the windows are partitions
    of that array.

    Parameters
    -------
    ratings_user: sequence of ratings; position 1 of each rating
        is the movie id.
    J: number of windows.

    Returns:
    -------
        list
            feature vector with two entries per window: the number
            of ratings in the window (NRW) and that number divided by
            the total number of ratings of the user (NRWR). A user
            without ratings gets 0.0 for every NRWR.
    """

    n_movies = len(movies_ids)
    windows = get_windows(n_movies, J) #Indexes

    n_ratings_user = len(ratings_user)

    # Count the ratings per movie once, so each window only needs
    # dictionary lookups instead of a scan of the window per rating.
    ratings_per_movie = {}
    for rating in ratings_user:
        movie = int(rating[1])
        ratings_per_movie[movie] = ratings_per_movie.get(movie, 0) + 1

    user_vector = []

    for first, last in windows:
        movies_window = set(movies_ids[first:last])
        n_ratings_user_window = sum(
            ratings_per_movie.get(movie, 0) for movie in movies_window)

        user_vector.append(n_ratings_user_window) #NRW

        # Guard against empty profiles instead of dividing by zero.
        if n_ratings_user:
            user_vector.append(n_ratings_user_window / n_ratings_user) #NRWR
        else:
            user_vector.append(0.0) #NRWR

    return user_vector


def obtain_genuine_vectors(number=-1, J=40, filename='genuine', rd=np.random.RandomState(5)):
    """
    Computes the feature vectors of users in the database and
    dumps them, tagged with 0, to filename + '.csv'.

    Reads the module-level `data` (ratings DataFrame).

    Parameters
    -------
    number: number of users to sample without replacement;
        -1 selects every user.
    J: number of windows of each feature vector.
    filename: output path without extension.
    rd: random generator used for the sampling. The default instance
        is created once, so successive calls that rely on it continue
        the same random stream.

    Returns:
    -------
        dict
            feature vector of each selected user, keyed by user id
    """

    all_users_ids = list(set(data['userID']))

    if number != -1:
        users_ids = rd.choice(all_users_ids, replace = False, size=number)
    else:
        users_ids = all_users_ids

    # Filter and group the ratings once instead of scanning the whole
    # table for every selected user.
    ratings_by_user = data.loc[data["userID"].isin(users_ids)].groupby("userID")

    vectors = {}

    for user_id in users_ids:
        ratings_user = ratings_by_user.get_group(user_id).values
        # J is forwarded: the number of windows used to be fixed to 40.
        vectors[int(user_id)] = obtain_array_user(ratings_user, J)

    dump_vectors_csv(vectors, 0, filename + '.csv')
    return vectors


def get_filler_items(filler_size, excluded_items, rd=np.random.RandomState(5)):
    """
    Samples, without replacement, a fraction of the movie ids that
    are not excluded.

    Parameters
    -------
    filler_size: fraction of the non-excluded movies to sample.
    excluded_items: ids that must not be sampled.
    rd: random generator used for the sampling.

    Returns:
    -------
        np.array
            vector of random items ids.
    """

    candidates = set(get_movies_id_list())

    # Excluded ids are removed in place, one at a time.
    for excluded in excluded_items:
        candidates.discard(excluded)

    pool = list(candidates)
    n_fillers = int(filler_size * len(pool))

    return rd.choice(pool, replace = False, size=n_fillers)


def rating_correction(rating, range_ratings):
    """
    Clamps a rating to the interval
    (range_ratings[0], range_ratings[1]).

    Returns:
    -------
        rating
            the rating itself when it lies inside the interval,
            otherwise the bound it exceeded.
    """

    upper = range_ratings[1]
    if rating > upper:
        return upper

    lower = range_ratings[0]
    return lower if rating < lower else rating


def generate_random_ratings(number_users, filler_size, target_items, rating_target, range_ratings, rd):
    """
    Builds the ratings of number_users attackers following the
    random model: filler items are rated with values drawn from a
    normal distribution fitted to the global movie statistics
    (clamped to range_ratings) and every target item receives
    rating_target.

    Returns:
    -------
        [ [[]], ]
            one list per attacker of [user id, item, rating] entries.
            Attacker ids are 0, -1, -2, ...
    """

    global_mean, global_std = get_movies_global_statistics()
    profiles = []

    for index in range(number_users):
        attacker_id = -index

        fillers = get_filler_items(filler_size, target_items, rd)
        drawn = rd.normal(loc=global_mean, scale=global_std, size=len(fillers))

        # Filler ratings first, then the targets.
        profile = [
            [attacker_id, item, rating_correction(value, range_ratings)]
            for item, value in zip(fillers, drawn)
        ]
        profile.extend([attacker_id, item, rating_target] for item in target_items)

        profiles.append(profile)

    return profiles



def generate_average_ratings(number_users, filler_size, target_items, rating_target, range_ratings, rd):
    """
    For each user, generates a list of ratings applying
    average model: every filler item is rated with a value drawn
    from a normal distribution with the mean and deviation of that
    item's own ratings (clamped to range_ratings), and every target
    item receives rating_target.

    Filler items without ratings receive the midpoint of
    range_ratings.

    Returns:
    -------
        [ [[]], ]
            one list per attacker of [user id, item, rating] entries.
            Attacker ids are 0, -1, -2, ...
    """

    ratings = []

    # Midpoint of the interval: (min + max) / 2. The previous
    # (max - min) / 2 only matched it for intervals starting at 0.
    midpoint = (range_ratings[0] + range_ratings[1]) / 2

    for user_index in range(number_users):

        id_usuario = -1 * user_index
        filler_items = get_filler_items(filler_size, target_items, rd)
        means, distributions = get_movies_particular_statistics(filler_items)

        ratings_user = [ ]

        for item, item_mean, item_std in zip(filler_items, means, distributions):

            # Scalars are drawn (no size argument) so the stored ratings
            # are plain numbers, as in the other attack models.
            if item_mean is None:
                # Zero spread: the draw always yields the midpoint.
                rating = rd.normal(loc=midpoint, scale=0)

            else:
                rating = rd.normal(loc=item_mean, scale=item_std)

            ratings_user.append([id_usuario, item, rating_correction(rating, range_ratings)])

        for item in target_items:
            ratings_user.append([id_usuario, item, rating_target])

        ratings.append(ratings_user)

    return ratings


def generate_bandwagon_ratings(number_users, filler_size, target_items, rating_target, range_ratings, k, rd):
    """
    For each user, generates a list of ratings applying
    bandwagon model.

    Used average bandwagon attack as in the paper: filler items are
    rated with values drawn from a normal distribution fitted to the
    global movie statistics (clamped to range_ratings), target items
    receive rating_target and the most popular items receive the
    bound of range_ratings closest to their mean rating.

    Parameters
    -------
    k: fraction of the movies considered "most popular"
       (see get_most_popular_items).

    Returns:
    -------
        [ [[]], ]
            one list per attacker of [user id, item, rating] entries.
            Attacker ids are 0, -1, -2, ...
    """

    ratings = []
    mean, distribution = get_movies_global_statistics()

    k_most_popular_data = get_most_popular_items(k)
    k_most_popular = [i[0] for i in k_most_popular_data]

    # Concatenate as lists: with a numpy array of targets, `+` would add
    # the ids element-wise instead of joining both collections.
    excluded_items = list(target_items) + k_most_popular

    mean_range = (range_ratings[0] + range_ratings[1]) / 2

    for i in range(number_users):
        id_usuario = -1 * i
        filler_items = get_filler_items(filler_size, excluded_items, rd)
        filler_items_ratings = rd.normal(loc=mean, scale=distribution, size=len(filler_items))

        ratings_user = [ ]

        for item, rating in zip(filler_items, filler_items_ratings):
            ratings_user.append([id_usuario, item, rating_correction(rating, range_ratings)])

        for item in target_items:
            ratings_user.append([id_usuario, item, rating_target])

        # Original author's note: the usual bandwagon model always gives
        # popular items the maximum rating regardless of the attack type;
        # here each popular item gets the extreme closest to its mean.
        for item in k_most_popular_data:

            if item[2] < mean_range:
                ratings_user.append([id_usuario, item[0], range_ratings[0]])

            else:
                ratings_user.append([id_usuario, item[0], range_ratings[1]])

        ratings.append(ratings_user)

    return ratings

def generate_attack_vectors_ratings(ratings, rd=np.random.RandomState(5), J=40):
    """
    Given the ratings of certain users, returns the
    feature vectors.

    Parameters
    -------
    ratings: list with one non-empty list of [user id, item, rating]
        entries per user; the user id is read from the first entry.
    rd: unused, kept so existing callers keep working.
    J: number of windows of each feature vector (40 by default, the
        value that used to be hard-coded).

    Returns:
    -------
        dict
            feature vector of each user, keyed by user id
    """

    vectors = {}

    for ratings_user in ratings:
        user_id = ratings_user[0][0]
        vectors[user_id] = obtain_array_user(ratings_user, J)

    return vectors


def generate_attack_vectors(number, filler_size, target_items, type='Random', puntuation_range=(0, 5), push=True, most_popular_size=0.005, rd=np.random.RandomState(5), filename='vectores'):
    """
    Returns the feature vectors of a certain number
    of attackers given the target items, type of attack,
    punctuation rate and push/nuke. The vectors are also dumped,
    tagged with 1, to filename + '.csv'.

    Parameters
    -------
    number: number of attackers.
    filler_size: fraction of the non-excluded movies used as fillers.
    target_items: ids of the attacked items.
    type: 'Random', 'Average' or 'Bandwagon'.
    puntuation_range: (minimum, maximum) valid rating.
    push: True rates the targets with the maximum, False (nuke)
        with the minimum.
    most_popular_size: fraction of popular movies, only used by the
        'Bandwagon' attack.
    rd: random generator. The default instance is created once, so
        successive calls that rely on it continue the same stream.
    filename: output path without extension.

    Returns:
    -------
        dict
            feature vector of each attacker, keyed by attacker id

    Raises:
    -------
        ValueError
            if type is not one of the supported attacks.
    """

    if (push):
        rating_target = puntuation_range[1]
    else:
        rating_target = puntuation_range[0]


    if type == 'Random':
        ratings = generate_random_ratings(number, filler_size, target_items, rating_target, puntuation_range, rd)

    elif type == 'Average':
        ratings = generate_average_ratings(number, filler_size, target_items, rating_target, puntuation_range, rd)

    elif type == 'Bandwagon':
        ratings = generate_bandwagon_ratings(number, filler_size, target_items, rating_target, puntuation_range, most_popular_size, rd)

    else:
        # Fail with a clear message instead of an unbound `ratings` below.
        raise ValueError(
            "Unknown attack type {!r}: expected 'Random', 'Average' or 'Bandwagon'".format(type))

    vectors = generate_attack_vectors_ratings(ratings, rd)

    dump_vectors_csv(vectors, 1, filename + '.csv')

    return vectors

In [3]:
# Module-level state read by the functions defined above: the ratings
# table and the array of movie ids.
data = load_ratings()
movies_ids = get_movies_id_list()
# Generator used below only to pick the target item of each attack; the
# generate_attack_vectors calls do not pass it on, so they use their own
# default generator.
rd=np.random.RandomState(5)


# One csv of 100 attack profiles per attack model and filler size. Each
# call attacks a single, randomly chosen movie with a push attack.
for i in [0.01, 0.03, 0.05, 0.1]:
    generate_attack_vectors(100, i, rd.choice(movies_ids, replace = False, size=1), type='Random', puntuation_range=(0, 5), push=True, filename="./train-test-sets/test-random-{}".format(i))
    generate_attack_vectors(100, i, rd.choice(movies_ids, replace = False, size=1), type='Average', puntuation_range=(0, 5), push=True, filename="./train-test-sets/test-average-{}".format(i))
    # For the bandwagon model the fraction of popular items is set to the
    # filler size.
    generate_attack_vectors(100, i, rd.choice(movies_ids, replace = False, size=1), type='Bandwagon', puntuation_range=(0, 5), push=True, most_popular_size = i, filename="./train-test-sets/test-bandwagon-{}".format(i))

# Feature vectors of 1000 randomly sampled genuine users, 40 windows each.
obtain_genuine_vectors(1000, 40, filename='./train-test-sets/test-genuine')
print("Files generated.")

Files generated.
