# MOVIE LENS 10M

In [74]:
import pandas as pd
import numpy as np
import json

In [75]:
def load_ratings():

    return pd.read_csv(
        './dataset/ratings_cortito.dat',
        sep='::',
        header=None,
        names=['userId', 'movieId', 'rating', 'timestamp'])

      
def load_movies():

    return pd.read_csv(
        './dataset/movies.dat',
        sep='::',
        header=None,
        names=['movieID', 'Title', 'Genre'])


def get_movies_id_list():

        movies = pd.read_csv(
        './dataset/movies.dat',
        sep='::',
        header=None,
        names=['movieID', 'Title', 'Genre'])

        retorno = movies.values[:, 0]
        return retorno


def get_global_ratings_statistics():
    ratings = load_ratings()
    return np.mean(ratings["rating"]), np.std(ratings["rating"])


def get_windows(n_items, n_windows=40):
    """
    Returns the items partitions made.
    Both first item and last item are included.

    Returns:
        List of tuples, where the tuples are 
        (first_item_window_INCLUDED, last_item_window_EXCLUDED)
    """

    Q, q = divmod(n_items, n_windows)
    index = 0
    windows = []

    for j in range(n_windows):

        if j < q: #Cambiado de su paper porque no sense el =
          n_items_window = Q + 1

        else:
          n_items_window = Q

        windows.append((index, index + n_items_window))
        index = index + n_items_window 

    return windows
    

def obtain_array_user(ratings_user, J=40):
    """Entrada: array con las ratings
    """

    movies_ids = get_movies_id_list()
    n_movies = len(movies_ids)
    windows = get_windows(n_movies, J)

    n_ratings_user = len(ratings_user)
    user_vector = []

    for window in windows:

        movies_window = movies_ids[window[0] : window[1]]
        ratings_user_window = [rating for rating in ratings_user if int(rating[1]) in movies_window]
        n_ratings_user_window = len(ratings_user_window)

        user_vector.append(n_ratings_user_window) #NRW
        user_vector.append(n_ratings_user_window/n_ratings_user) #NRWR

    return user_vector



def dump_vectors_json(vectors, file):

    with open(file, 'w') as f:
        json_dumps_str = json.dumps(vectors, indent=2)
        print(json_dumps_str, file=f)
        f.close()


def obtain_genuine_vectors(J=40):

    data = load_ratings()

    users_ids = list(set(data['userId']))
    n_users = len(users_ids)
    vectors = { **dict.fromkeys([i for i in users_ids], [])} 

    for user_id in users_ids:
        ratings_user = data.loc[data["userId"] == user_id].values
        vectors[user_id] = obtain_array_user(ratings_user, 40)

    dump_vectors_json(vectors, 'vectors-genuine-users-ML10M.json')
    return vectors


def get_filler_items(filler_size, rd=np.random.RandomState(5)):

    all_items = get_movies_id_list()
    return rd.choice(all_items, replace = False, size=int(filler_size * len(all_items)))

def generate_random_attack_ratings(number_users, filler_size, target_items, puntuation_range=(0, 5), type='push', rd=np.random.RandomState(5)):
    """
    Filler_size : percentage
    No añaden time stamp, es innecesario
    """

    if type == 'push':
        rating_target = puntuation_range[1]
    else:
        rating_target = puntuation_range[0]

    global_mean, global_distibution = get_global_ratings_statistics()

    ratings = [ ]

    for i in range(number_users):

        id_usuario = -1 * i
        filler_items = get_filler_items(filler_size, rd)
        filler_items_ratings = rd.normal(loc=global_mean, scale=global_distibution, size=len(filler_items))

        ratings_user = [ ]

        for item, rating in zip(filler_items, filler_items_ratings):
            ratings_user.append([id_usuario, item, rating])

        for item in target_items:
            ratings_user.append([id_usuario, item, rating_target])

        ratings.append(ratings_user)

    return(ratings)


def generate_random_attack_vectors(number_users, filler_size, target_items, puntuation_range=(0, 5), type='push', rd=np.random.RandomState(5)):

    ratings = generate_random_attack_ratings(number_users, filler_size, target_items, puntuation_range, type, rd)

    users_ids = [i[0][0] for i in ratings]
    n_users = len(users_ids)
    vectors = { **dict.fromkeys([i for i in users_ids], [])} 

    for ratings_user in ratings:
        vectors[ratings_user[0][0]] = obtain_array_user(ratings_user, 40)

    dump_vectors_json(vectors, 'vectors-random-attack-ML10M.json')
    return vectors



def generate_attack_vectors(number, filler_size, target_items, type='Random', puntuation_range=(0, 5), push=True, rd=np.random.RandomState(5)):

    if type == 'Random':
        return 1

    elif type == 'Average':
        return 0

    elif type == 'Bandwagon':
        return 5


In [76]:
#print(get_filler_items(0.001))

random_attack_vectors = generate_random_attack_vectors(3, 0.1, [1,2,3,4])

  return pd.read_csv(
  movies = pd.read_csv(
  movies = pd.read_csv(
  movies = pd.read_csv(
  movies = pd.read_csv(
  movies = pd.read_csv(
  movies = pd.read_csv(


{0: [37,
  0.03451492537313433,
  26,
  0.024253731343283583,
  28,
  0.026119402985074626,
  31,
  0.028917910447761194,
  26,
  0.024253731343283583,
  30,
  0.027985074626865673,
  28,
  0.026119402985074626,
  26,
  0.024253731343283583,
  23,
  0.021455223880597014,
  23,
  0.021455223880597014,
  32,
  0.029850746268656716,
  22,
  0.020522388059701493,
  21,
  0.01958955223880597,
  25,
  0.02332089552238806,
  24,
  0.022388059701492536,
  27,
  0.025186567164179104,
  26,
  0.024253731343283583,
  26,
  0.024253731343283583,
  24,
  0.022388059701492536,
  35,
  0.03264925373134328,
  23,
  0.021455223880597014,
  37,
  0.03451492537313433,
  15,
  0.013992537313432836,
  24,
  0.022388059701492536,
  27,
  0.025186567164179104,
  27,
  0.025186567164179104,
  21,
  0.01958955223880597,
  29,
  0.027052238805970148,
  30,
  0.027985074626865673,
  25,
  0.02332089552238806,
  25,
  0.02332089552238806,
  38,
  0.03544776119402985,
  29,
  0.027052238805970148,
  21,
  0.019589