## Dummy entries

Run the scoring function on dummy entries while varying the number of common movies, the rating noise, and the timestamp noise.

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from random import sample, randint

In [None]:
# MovieLens ratings; after the two steps below the table is indexed by movieId
# and its user column is called "uM".
dbb = pd.read_csv("time_fixed_DB.csv", encoding="UTF-8", sep=";")  # import MovieLens dataset

# `inplace` has to be a real bool: recent pandas versions reject `inplace=1`.
dbb.rename(columns={"userId": "uM"}, inplace=True)
dbb.set_index(["movieId"], inplace=True)

# Every distinct movie id of the MovieLens table.
list_movies = list(dbb.index.unique())

# Only the movieId column is read from the Netflix dataset, so load nothing else (RAM).
dff = pd.read_csv(
    "time_fixed_DF.csv", encoding="UTF-8", sep=";", usecols=["movieId"]
)  # import Netflix dataset to compute weight

# Per-movie weight: 1 / log10(number of rows for that movie in the Netflix dataset).
# NOTE(review): a movie that appears exactly once gets log10(1) == 0 and hence an
# infinite weight — confirm this cannot happen with the real data.
weight = 1 / np.log10(dff.loc[:, "movieId"].value_counts())

del dff  # RAM plz

In [None]:
def single_computeScore(db, df, weigth, r0=1.5, d0=30, com_movies=None):
    """Return the summed eccentricity of the best matches between two rating tables.

    Every (uF, uM) pair that shares a movie gets a score: the weighted sum,
    over the shared movies, of ``exp(-|d_rating| / r0) + exp(-|d_timestamp| / d0)``.
    For each ``uF`` the eccentricity is the gap between its best and second
    best score divided by the standard deviation of all its scores.

    Parameters
    ----------
    db : pandas.DataFrame
        Indexed by movie id, with columns ``uM``, ``rating`` and ``timestamp``.
    df : pandas.DataFrame
        Indexed by movie id, with columns ``uF``, ``rating`` and ``timestamp``.
    weigth : pandas.Series
        Per-movie weight indexed by movie id. (The misspelled name is kept so
        existing callers are not broken.)
    r0, d0 : float
        Decay scales applied to the rating and timestamp differences.
    com_movies : list-like, optional
        Movie ids to score on. Defaults to the movies present in both tables.

    Returns
    -------
    float
        Sum of the eccentricities over all ``uF`` users. A ``uF`` with a single
        candidate contributes nothing (its standard deviation is NaN).

    Raises
    ------
    KeyError
        If a scored movie has no entry in ``weigth``.

    Notes
    -----
    Differences are cast to ``int64`` before the absolute value is taken, so
    fractional rating differences are truncated toward zero.
    """
    # `is None` rather than `== None`: comparing an Index/array to None is
    # element-wise and cannot be used as a condition.
    if com_movies is None:
        com_movies = df.index.unique().intersection(db.index.unique())  # compute movies in common

    # ordering
    db = db.loc[com_movies, :]
    df = df.loc[com_movies, :]

    # Columns present on one side only (uF, uM) are completed with 0, so after
    # the absolute value they hold the plain user ids again.
    tmp = np.abs(df.subtract(db, fill_value=0).astype(np.int64))

    # compute score
    # Use the `weigth` argument (not a module-level global) and look the weight
    # up once per row; working positionally avoids label alignment on an index
    # that contains each movie id several times.
    row_weight = weigth.loc[tmp.index].to_numpy()
    closeness = np.exp(-tmp.loc[:, "rating"] / r0) + np.exp(-tmp.loc[:, "timestamp"] / d0)
    tmp["score"] = row_weight * closeness.to_numpy()

    tmp.drop(["rating", "timestamp"], axis=1, inplace=True)  # RAM

    tmp.set_index(["uF", "uM"], inplace=True)  # better representation

    R = tmp.loc[:, "score"].groupby(level=[0, 1]).sum()

    # compute eccentricity

    # `Series.std(level=...)` no longer exists in pandas 2; group explicitly.
    R_std = R.groupby(level=0).std()  # lonely match → NaN (will be discarded anyway)

    R_2best = R.groupby(level=0, group_keys=False).nlargest(2)  # take each 2 best matches (each df_user get 2 db_user)

    R_ecc = R_2best.groupby(level=0).diff(-1).dropna() / R_std  # eccentricity

    return R_ecc.sum()  # we only want the sum of, not each

## Movies

In [None]:
# Experiment 1: a fake user with identical ratings on both sides, for a growing
# number of common movies. result_movies[k] is the mean score for k movies.
nb_movies = 30
iteration = 100

result_movies = [0.0]  # 0 movie → no match
# A dedicated loop variable keeps the upper bound `nb_movies` from being shadowed.
for n_common in range(1, nb_movies + 1):
    sumecc = 0

    # create fake user (one id per side, repeated once per rated movie)
    new_NF_userId = np.full(n_common, 2649430)
    new_ML_userId = np.full(n_common, 138494)

    for i in range(iteration):
        selected_movies = sample(list_movies, n_common)  # take random list of movies

        # gen fake user's random attributes
        new_rates = np.random.randint(1, 6, n_common)  # ratings in 1..5
        new_timestamp = np.random.randint(0, 2649, n_common)

        fake_ml_rows = pd.DataFrame(
            {"uM": new_ML_userId, "rating": new_rates, "timestamp": new_timestamp},
            index=selected_movies,
        )
        df = pd.DataFrame(
            {"uF": new_NF_userId, "rating": new_rates, "timestamp": new_timestamp},
            index=selected_movies,
        )

        # `DataFrame.append` is gone in pandas 2; `pd.concat` replaces it. Only
        # the rows of the selected movies are kept, so slice the big table first
        # instead of copying all of it on every iteration.
        db = pd.concat([dbb.loc[selected_movies, :], fake_ml_rows]).loc[selected_movies, :]

        sumecc += single_computeScore(db, df, weight, com_movies=selected_movies)

    result_movies.append(sumecc / iteration)

In [None]:
# result_movies[k] is the averaged score for k common movies (index 0 is the
# "no movie" baseline), so the list position is already the right x value.
plt.plot(result_movies)
plt.xlabel("number of common movies")
plt.ylabel("mean summed eccentricity")
plt.show()

## Noisy Rating

In [None]:
# Experiment 2: same fake user on both sides, but one side's ratings are
# perturbed by a growing amount of noise. One mean score per noise level.
nb_movies = 30
iteration = 100


result_rating = []

new_NF_userId = np.full(nb_movies, 2649430)
new_ML_userId = np.full(nb_movies, 138494)

for noise in range(1, 11):
    sumecc = 0
    for i in range(iteration):
        selected_movies = sample(list_movies, nb_movies)

        new_rates = np.random.randint(1, 6, nb_movies)  # ratings in 1..5
        # Noise is drawn in half-point steps from [-noise/2, +noise/2] and the
        # result is clamped back into the 1..5 range.
        # NOTE(review): single_computeScore casts differences to int64, so a
        # half-point difference is truncated before scoring — confirm intended.
        new_ratesNF = np.clip(
            new_rates + np.random.randint(-noise, noise + 1, nb_movies) / 2, 1, 5
        )
        new_timestamp = np.random.randint(0, 2649, nb_movies)

        # Despite the "NF" suffix the noisy ratings go into the MovieLens-side
        # rows; the score only uses the absolute difference between both sides.
        fake_ml_rows = pd.DataFrame(
            {"uM": new_ML_userId, "rating": new_ratesNF, "timestamp": new_timestamp},
            index=selected_movies,
        )
        df = pd.DataFrame(
            {"uF": new_NF_userId, "rating": new_rates, "timestamp": new_timestamp},
            index=selected_movies,
        )

        # `DataFrame.append` is gone in pandas 2; `pd.concat` replaces it. Only
        # the rows of the selected movies are kept, so slice the big table first
        # instead of copying all of it on every iteration.
        db = pd.concat([dbb.loc[selected_movies, :], fake_ml_rows]).loc[selected_movies, :]

        sumecc += single_computeScore(db, df, weight, com_movies=selected_movies)

    result_rating.append(sumecc / iteration)

In [None]:
# result_rating holds one value per noise level, starting at level 1, so plot
# against the level instead of the zero-based list position.
plt.plot(range(1, len(result_rating) + 1), result_rating)
plt.xlabel("rating noise level (max deviation = level / 2)")
plt.ylabel("mean summed eccentricity")
plt.show()

## Noisy Timestamp

In [None]:
# Experiment 3: same fake user on both sides, but one side's timestamps are
# shifted by a growing amount of noise. One mean score per noise level.
nb_movies = 30
iteration = 100

result_timestamp = []

new_NF_userId = np.full(nb_movies, 2649430)
new_ML_userId = np.full(nb_movies, 138494)

for noise in [3, 7, 14, 21, 31, 45, 60, 90]:
    sumecc = 0
    for i in range(iteration):
        selected_movies = sample(list_movies, nb_movies)

        new_rates = np.random.randint(1, 6, nb_movies)  # ratings in 1..5
        new_timestamp = np.random.randint(0, 2649, nb_movies)
        # Shift every timestamp by up to ±noise and clamp into [0, 2649].
        new_timestampNF = np.clip(
            new_timestamp + np.random.randint(-noise, noise + 1, nb_movies), 0, 2649
        )

        # Despite the "NF" suffix the noisy timestamps go into the MovieLens-side
        # rows; the score only uses the absolute difference between both sides.
        fake_ml_rows = pd.DataFrame(
            {"uM": new_ML_userId, "rating": new_rates, "timestamp": new_timestampNF},
            index=selected_movies,
        )
        df = pd.DataFrame(
            {"uF": new_NF_userId, "rating": new_rates, "timestamp": new_timestamp},
            index=selected_movies,
        )

        # `DataFrame.append` is gone in pandas 2; `pd.concat` replaces it. Only
        # the rows of the selected movies are kept, so slice the big table first
        # instead of copying all of it on every iteration.
        db = pd.concat([dbb.loc[selected_movies, :], fake_ml_rows]).loc[selected_movies, :]

        sumecc += single_computeScore(db, df, weight, com_movies=selected_movies)

    result_timestamp.append(sumecc / iteration)

In [None]:
# The noise levels are not evenly spaced, so plot against the levels themselves
# rather than the list position. Keep this list identical to the one iterated
# by the timestamp experiment loop.
noise_levels = [3, 7, 14, 21, 31, 45, 60, 90]
plt.plot(noise_levels, result_timestamp)
plt.xlabel("timestamp noise (max shift, in timestamp units)")
plt.ylabel("mean summed eccentricity")
plt.show()

In [None]:
import csv

# Each experiment's series is written to its own text file with numpy.
_input = [
    ('nb_movies.csv', result_movies),
    ('noisy_rating.csv', result_rating),
    ('noisy_day.csv', result_timestamp),
]
for entry in _input:
    name, data = entry
    np.savetxt(name, data)