# MOVIE LENS 10M

In [26]:
import pandas as pd
import numpy as np
import json

In [27]:
def load_ratings():
    """Read the ratings file into a DataFrame.

    The file './dataset/ratings_cortito.dat' has no header row and uses
    '::' as field separator; a multi-character separator is why the
    'python' parsing engine is requested.

    Returns:
        DataFrame with columns 'userID', 'movieID', 'rating', 'timestamp'.
    """
    column_names = ['userID', 'movieID', 'rating', 'timestamp']
    ratings = pd.read_csv(
        './dataset/ratings_cortito.dat',
        sep='::',
        engine='python',
        header=None,
        names=column_names,
    )
    return ratings
      
def load_movies():
    """Read the movies catalogue into a DataFrame.

    The file './dataset/movies.dat' has no header row and uses '::' as
    field separator; a multi-character separator is why the 'python'
    parsing engine is requested.

    Returns:
        DataFrame with columns 'movieID', 'Title', 'Genre'.
    """
    column_names = ['movieID', 'Title', 'Genre']
    movies = pd.read_csv(
        './dataset/movies.dat',
        sep='::',
        engine='python',
        header=None,
        names=column_names,
    )
    return movies


def get_movies_id_list():
    """Return the ids of every movie in the catalogue.

    Reuses load_movies() instead of repeating the same read_csv call, so
    the file path and column layout are defined in a single place.

    Returns:
        1-D NumPy array holding the first column ('movieID') of the movies
        table, in file order.
    """
    movies = load_movies()
    return movies.values[:, 0]


def get_movies_statistics(items=None):
    """Return the mean and standard deviation of per-movie mean ratings.

    Args:
        items: iterable of movie ids to consider. When None, every movie
            returned by get_movies_id_list() is used.

    Returns:
        Tuple (mean, std) computed over the mean rating of each requested
        movie that has at least one rating. Movies without ratings are
        skipped. If no requested movie has ratings, NumPy returns nan for
        both values.
    """
    # Identity check, not `== None`: callers pass NumPy arrays, and
    # `array == None` is evaluated element-wise, which cannot be used as
    # an `if` condition (raises ValueError for arrays with several items).
    if items is None:
        items = get_movies_id_list()

    data = load_ratings()

    # Compute every movie's mean rating in one pass over the ratings
    # instead of re-filtering the whole table once per movie.
    mean_by_movie = data.groupby('movieID')['rating'].mean().to_dict()

    # Mean rating of each requested movie; unrated movies are left out.
    means = [mean_by_movie[item] for item in items if item in mean_by_movie]

    return np.mean(means), np.std(means)


def get_windows(n_items, n_windows=40):
    """
    Split the index range [0, n_items) into n_windows consecutive windows.

    The window sizes differ by at most one: the first
    (n_items mod n_windows) windows receive one extra item.

    Returns:
        List of tuples, where the tuples are
        (first_item_window_INCLUDED, last_item_window_EXCLUDED)
    """
    base_size, remainder = divmod(n_items, n_windows)

    windows = []
    start = 0
    for position in range(n_windows):
        # Strict '<' on purpose: the original note says the '<=' used in
        # the reference paper was changed because it made no sense here.
        extra = 1 if position < remainder else 0
        stop = start + base_size + extra
        windows.append((start, stop))
        start = stop

    return windows
    

def obtain_array_user(ratings_user, J=40):
    """Build the per-window rating-count vector of one user.

    The movie catalogue (in file order) is split into J windows with
    get_windows(); for each window two values are appended to the result:

      * NRW:  number of the user's ratings whose movie is in the window
      * NRWR: NRW divided by the user's total number of ratings

    Args:
        ratings_user: sequence of rating rows; element [1] of each row is
            the movie id (it is converted with int()).
        J: number of windows.

    Returns:
        List of length 2 * J: [NRW_0, NRWR_0, NRW_1, NRWR_1, ...].
        A user with no ratings gets 0 and 0.0 for every window instead of
        failing with a division by zero.
    """
    movies_ids = get_movies_id_list()
    windows = get_windows(len(movies_ids), J)

    # Convert the rated movie ids once, not once per window.
    rated_ids = [int(rating[1]) for rating in ratings_user]
    n_ratings_user = len(rated_ids)

    user_vector = []

    for first, last in windows:
        # Set membership is O(1); testing against the array slice would
        # scan the whole window for every rating.
        window_ids = set(movies_ids[first:last])
        n_ratings_user_window = sum(
            1 for movie_id in rated_ids if movie_id in window_ids)

        user_vector.append(n_ratings_user_window)  # NRW
        if n_ratings_user:
            user_vector.append(n_ratings_user_window / n_ratings_user)  # NRWR
        else:
            user_vector.append(0.0)  # NRWR is defined as 0 without ratings

    return user_vector

def dump_vectors_json(vectors, file):
    """Write `vectors` to `file` as JSON indented by 2 spaces.

    Args:
        vectors: JSON-serializable object (here: dict of user id -> list).
        file: path of the output file; it is overwritten.

    Raises:
        TypeError: if `vectors` is not JSON-serializable. In that case the
            target file is left untouched.
    """
    # Serialize before opening the file: opening with 'w' truncates it, so
    # a serialization error must happen first to keep any previous content.
    json_dumps_str = json.dumps(vectors, indent=2)

    # The context manager closes the file; no explicit close() is needed.
    with open(file, 'w') as f:
        f.write(json_dumps_str + '\n')

def obtain_genuine_vectors(J=40):
    """Compute and store the window vector of every user in the ratings file.

    Args:
        J: number of windows passed to obtain_array_user() for each user.

    Returns:
        Dict mapping each user id to the list returned by
        obtain_array_user(). The same dict is also written to
        'vectors-genuine-users-ML10M.json'.
    """
    data = load_ratings()

    vectors = {}

    for user_id in set(data['userID']):
        # The column is named 'userID' (see load_ratings); selecting
        # 'userId' raises KeyError.
        ratings_user = data.loc[data['userID'] == user_id].values
        # Honour the caller's J instead of a hard-coded 40.
        vectors[user_id] = obtain_array_user(ratings_user, J)

    dump_vectors_json(vectors, 'vectors-genuine-users-ML10M.json')
    return vectors


def get_filler_items(filler_size, rd=np.random.RandomState(5)):
    """Draw a random sample of movie ids, without repetition.

    Args:
        filler_size: fraction of the catalogue to sample; the sample holds
            int(filler_size * number_of_movies) ids.
        rd: random generator used for the draw. NOTE: the default
            RandomState(5) is created once, when the function is defined,
            so successive calls relying on the default share its state.

    Returns:
        Array with the sampled movie ids.
    """
    candidates = get_movies_id_list()
    n_selected = int(filler_size * len(candidates))
    return rd.choice(candidates, size=n_selected, replace=False)


def rating_correction(rating, range_ratings):
    """Clamp `rating` to the interval given by `range_ratings`.

    Args:
        rating: value to clamp.
        range_ratings: indexable pair (lowest_allowed, highest_allowed).

    Returns:
        range_ratings[1] if rating is above it, range_ratings[0] if rating
        is below it, otherwise rating unchanged.
    """
    lowest = range_ratings[0]
    highest = range_ratings[1]

    # The upper bound is checked first, as in the original implementation.
    if rating > highest:
        return highest
    if rating < lowest:
        return lowest
    return rating

def generate_random_average_ratings(number_users, filler_size, target_items, rating_target, range_ratings, type, rd):
    """Generate the ratings of synthetic attack profiles.

    Each profile rates a random set of filler items with values drawn from
    a normal distribution (clamped to `range_ratings`) and rates every
    target item with `rating_target`. No timestamp is added to the rows;
    it is not needed.

    Args:
        number_users: number of profiles to generate.
        filler_size: fraction of the catalogue used as filler items
            (passed to get_filler_items()).
        target_items: iterable of movie ids that receive `rating_target`.
        rating_target: rating given to every target item.
        range_ratings: pair (lowest, highest) used to clamp filler ratings.
        type: 'Random' takes the normal parameters from
            get_movies_statistics() over all movies, once; 'Average' takes
            them from get_movies_statistics() over each profile's own
            filler items.
        rd: random generator used for sampling items and ratings.

    Returns:
        List with one entry per profile; each entry is a list of
        [user_id, movie_id, rating] rows. Profile ids are 0, -1, -2, ...

    Raises:
        ValueError: if `type` is neither 'Random' nor 'Average'.
    """
    # Fail early with a clear message; otherwise an unsupported type only
    # surfaces later as an unbound `mean` variable.
    if type not in ('Random', 'Average'):
        raise ValueError(
            "type must be 'Random' or 'Average', got {!r}".format(type))

    if type == 'Random':
        # Same distribution for every profile: compute it only once.
        mean, distribution = get_movies_statistics()

    ratings = []

    for i in range(number_users):

        # Non-positive ids for the generated profiles.
        id_usuario = -1 * i
        filler_items = get_filler_items(filler_size, rd)

        if type == 'Average':
            mean, distribution = get_movies_statistics(filler_items)

        filler_items_ratings = rd.normal(
            loc=mean, scale=distribution, size=len(filler_items))

        ratings_user = [
            [id_usuario, item, rating_correction(rating, range_ratings)]
            for item, rating in zip(filler_items, filler_items_ratings)
        ]
        ratings_user.extend(
            [id_usuario, item, rating_target] for item in target_items)

        ratings.append(ratings_user)

    return ratings


def generate_attack_vectors_ratings(ratings, rd):
    """Turn attack-profile ratings into window vectors and store them.

    Args:
        ratings: list with one entry per profile, each a non-empty list of
            rows whose first element is the profile's user id.
        rd: not used by this function.

    Returns:
        Dict mapping each profile's user id to the vector computed by
        obtain_array_user() with 40 windows. The same dict is also written
        to 'vectors-random-attack-ML10M.json'.
    """
    # The id of a profile is read from the first field of its first row.
    owner_ids = [profile[0][0] for profile in ratings]

    vectors = {
        owner_id: obtain_array_user(profile, 40)
        for owner_id, profile in zip(owner_ids, ratings)
    }

    dump_vectors_json(vectors, 'vectors-random-attack-ML10M.json')
    return vectors


def generate_attack_vectors(number, filler_size, target_items, type='Random', puntuation_range=(0, 5), push=True, rd=np.random.RandomState(5)):
    """Generate attack profiles and return their window vectors.

    Args:
        number: number of attack profiles to generate.
        filler_size: fraction of the catalogue used as filler items.
        target_items: iterable of movie ids under attack.
        type: 'Random' or 'Average' (see generate_random_average_ratings);
            'Bandwagon' is only a stub.
        puntuation_range: pair (lowest, highest) of valid ratings.
        push: when true the target items get the highest rating, otherwise
            the lowest.
        rd: random generator. NOTE: the default RandomState(5) is created
            once, when the function is defined, so successive calls relying
            on the default share its state.

    Returns:
        The dict returned by generate_attack_vectors_ratings() for
        'Random' and 'Average'; the integer 5 for 'Bandwagon'.

    Raises:
        ValueError: if `type` is not one of the values listed above.
    """
    rating_target = puntuation_range[1] if push else puntuation_range[0]

    if type == 'Bandwagon':
        # NOTE(review): placeholder result kept from the original code;
        # the Bandwagon attack is not implemented.
        return 5

    # Fail with a clear message instead of reaching the final call with
    # `ratings` never assigned.
    if type not in ('Random', 'Average'):
        raise ValueError(
            "type must be 'Random', 'Average' or 'Bandwagon', "
            "got {!r}".format(type))

    ratings = generate_random_average_ratings(
        number, filler_size, target_items, rating_target,
        puntuation_range, type, rd)

    return generate_attack_vectors_ratings(ratings, rd)


In [28]:
# Disabled debugging aid: prints the raw ratings of three 'Random' profiles.
#print(generate_random_average_ratings(3, 0.1, [1,2,3], 5, (0,5) , type='Random', rd=np.random.RandomState(5)))

# Generate 4 'Random' push-attack profiles (filler_size 0.01, target movies
# 1, 2 and 3, ratings in 0..5) with a RandomState seeded with 5. The returned
# dict of window vectors is displayed as the cell output.
generate_attack_vectors(4, 0.01, [1,2,3], type='Random', puntuation_range=(0, 5), push=True, rd=np.random.RandomState(5))

{0: [9,
  0.08256880733944955,
  3,
  0.027522935779816515,
  4,
  0.03669724770642202,
  6,
  0.05504587155963303,
  2,
  0.01834862385321101,
  5,
  0.045871559633027525,
  1,
  0.009174311926605505,
  3,
  0.027522935779816515,
  2,
  0.01834862385321101,
  1,
  0.009174311926605505,
  3,
  0.027522935779816515,
  1,
  0.009174311926605505,
  1,
  0.009174311926605505,
  2,
  0.01834862385321101,
  1,
  0.009174311926605505,
  4,
  0.03669724770642202,
  4,
  0.03669724770642202,
  2,
  0.01834862385321101,
  2,
  0.01834862385321101,
  4,
  0.03669724770642202,
  3,
  0.027522935779816515,
  3,
  0.027522935779816515,
  3,
  0.027522935779816515,
  4,
  0.03669724770642202,
  0,
  0.0,
  1,
  0.009174311926605505,
  3,
  0.027522935779816515,
  4,
  0.03669724770642202,
  4,
  0.03669724770642202,
  3,
  0.027522935779816515,
  0,
  0.0,
  3,
  0.027522935779816515,
  2,
  0.01834862385321101,
  3,
  0.027522935779816515,
  3,
  0.027522935779816515,
  0,
  0.0,
  1,
  0.0091743119