In [53]:
import pandas as pd                                          #importo libreria pandas per gestire dati in formato DataFrame
import numpy as np                                           # importo libreria numpy per operare su array multidimensionali
from sklearn.metrics.pairwise import cosine_similarity       #importo funzione cosine_similarity per calcolare similarity tra due users
from sklearn.metrics import pairwise_distances               #importo funzione pairwise_distance per calcolare la distanza di matrici
import random                                                #permette di generare numeri casuali

In [54]:
#Creo due dataFrame movies e ratings

movies = pd.read_csv(r'/content/movies.csv', sep=',', names=['movieId', 'title', 'genres'],  encoding="Latin1")

# pd.read_csv: Utilizza la funzione read_csv della libreria pandas per leggere i dati da un file CSV.
# "movies.csv": Specifica il percorso del file CSV.
# sep=',': Specifica che il separatore tra le colonne nel file CSV è la virgola.
# names=['movieId', 'title', 'genres']: Assegna i nomi alle colonne del DataFrame.
# encoding="Latin1": Specifica l'encoding del file. Latin1 è comune per gestire caratteri speciali in testi in lingue europee.

ratings = pd.read_csv(r'/content/ratings.csv', sep=',', names =['userId', 'movieId', 'rating', 'timestamp'], encoding = "Latin1")

# pd.read_csv: Utilizza la funzione read_csv della libreria pandas per leggere i dati da un file CSV.
# "ratings.csv": Specifica il percorso del file CSV.
# sep=',': Specifica che il separatore tra le colonne nel file CSV è la virgola.
# names=['userId', 'moviedId', 'rating', 'timestamp']: Assegna i nomi alle colonne del DataFrame.
# encoding="Latin1": Specifica l'encoding del file. Latin1 è comune per gestire caratteri speciali in testi in lingue europee.


In [55]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [56]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [57]:
#Elimino le colonne che non interessano (genres),(timestamp)
movies.drop(['genres'], axis=1, inplace=True)
#['genres']: Specifica il nome della colonna da rimuovere, che è 'genres' in questo caso.
#axis=1: Indica che la rimozione deve avvenire lungo l'asse delle colonne.
#inplace=True: Indica che la modifica deve essere effettuata direttamente sul DataFrame
ratings.drop(['timestamp'], axis=1, inplace=True)

In [58]:
movies.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [59]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [60]:
#numero di utenti unici nel file ratings
print(ratings['userId'].nunique())


610


In [61]:
#verifico del formato della colonna rating
print(ratings['rating'].dtypes)


float64


In [62]:
#effettuo la media dei voti per ogni singolo user
mean = ratings.groupby(by="userId",as_index=False)['rating'].mean()
mean.head()


Unnamed: 0,userId,rating
0,1,4.366379
1,2,3.948276
2,3,2.435897
3,4,3.555556
4,5,3.636364


In [63]:
#Normalizzazione dei voti dei singoli utenti
rating_avg = pd.merge(ratings,mean,on='userId')
rating_avg.head()
#faccio la fusione dei due dataframe di ratings e mean sulla base dell'userId

Unnamed: 0,userId,movieId,rating_x,rating_y
0,1,1,4.0,4.366379
1,1,3,4.0,4.366379
2,1,6,4.0,4.366379
3,1,47,5.0,4.366379
4,1,50,5.0,4.366379


In [64]:
rating_avg = rating_avg.rename(columns={"rating_y": "rating_avg"})
rating_avg['adg_rating']=rating_avg['rating_x']-rating_avg['rating_avg']
#Creo una nuova colonna adg_rating che deve essere la differenza tra il voto dato ad un movie e la media dei voti dati dall'utente
rating_avg.head()

Unnamed: 0,userId,movieId,rating_x,rating_avg,adg_rating
0,1,1,4.0,4.366379,-0.366379
1,1,3,4.0,4.366379,-0.366379
2,1,6,4.0,4.366379,-0.366379
3,1,47,5.0,4.366379,0.633621
4,1,50,5.0,4.366379,0.633621


In [65]:
#Creo una tabella pivot con indice l'user id e righe movieId, andando ad inserire come valori i voti e ottenendo NaN per i film che non sono stati votati dagli n utenti
check = pd.pivot_table(rating_avg,values='rating_x',index='userId',columns='movieId')
check.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [66]:
final=pd.pivot_table(rating_avg,values='adg_rating',index='userId',columns='movieId')
final
#Creo una tabella pivot con i valori della colonna 'adg_rating' usando userId come indice e movieId come colonna
#a differenza della tabella precedente invece dei voti abbiamo i voti normalizzati

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,,-0.366379,,,-0.366379,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,0.363636,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,,,,,,-1.157399,,,,...,,,,,,,,,,
607,0.213904,,,,,,,,,,...,,,,,,,,,,
608,-0.634176,-1.134176,-1.134176,,,,,,,0.865824,...,,,,,,,,,,
609,-0.270270,,,,,,,,,0.729730,...,,,,,,,,,,


In [67]:

#Sostituisco i valori dulli con i voti medi dei film
final_movie = final.fillna(final.mean(axis=0))
#axis=0 indica che la media va calcolata sull'asse delle righe
final_movie


movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.366379,-0.053158,-0.366379,-1.096045,-0.522626,-0.366379,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
2,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
3,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
4,0.312167,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
5,0.363636,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-1.157399,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
607,0.213904,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,-0.056326,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
608,-0.634176,-1.134176,-1.134176,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,0.865824,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024
609,-0.270270,-0.053158,-0.234798,-1.096045,-0.522626,0.378461,-0.400728,-0.625024,-0.455446,0.729730,...,-0.205224,-0.705224,0.294776,0.294776,-0.205224,0.294776,-0.205224,-0.205224,-0.205224,0.372024


In [68]:
#Uso Cosine Similarity per trovare i vicini
final_user = final.apply(lambda row: row.fillna(row.mean()), axis=1)
final_user

#final.apply(...): Applica una funzione a ogni riga della tabella final.
#lambda row: row.fillna(row.mean()): Per ogni riga, sostituisci i valori mancanti (NaN) con la media dei valori non mancanti della stessa riga.
#axis=1: Specifica che l'applicazione della funzione deve avvenire lungo le colonne (cioè, lungo le righe originali).
#final_user: Il risultato di questa operazione è un nuovo DataFrame chiamato final_user, che contiene la tabella final con i valori mancanti sostituiti dalla media della riga corrispondente.

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-3.663793e-01,1.837611e-16,-3.663793e-01,1.837611e-16,1.837611e-16,-3.663793e-01,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,...,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16,1.837611e-16
2,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,...,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16,2.143879e-16
3,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,...,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16,1.821904e-16
4,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,...,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16,2.055969e-16
5,3.636364e-01,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,...,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16,1.110223e-16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,-1.157399e+00,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.157399e+00,-1.656871e-16,-1.656871e-16,-1.656871e-16,...,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16,-1.656871e-16
607,2.139037e-01,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,...,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17,-1.899847e-17
608,-6.341757e-01,-1.134176e+00,-1.134176e+00,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,8.658243e-01,...,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16,1.539082e-16
609,-2.702703e-01,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,7.297297e-01,...,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17,9.601929e-17


In [69]:
#Applico il cosine similarity, creando una nuova tabella in cui sia sulle righe che sulle colonne abbiamo l'userId e i valori corrispondono alla similarità tra i due utenti che varia tra 0 e 1

cosine = cosine_similarity(final_movie)
#Calcola la similarità coseno tra le righe di final_movie. cosine sarà una matrice quadrata dove ogni elemento (i, j) rappresenta la similarità coseno tra la riga i e la riga j di

np.fill_diagonal(cosine, 0 )
#Imposta a zero tutti gli elementi sulla diagonale di cosine. Questo è fatto perché la similarità coseno di un vettore con se stesso è sempre

similarity_with_movie =pd.DataFrame(cosine,index=final_movie.index)
#Crea un DataFrame similarity_with_movie utilizzando la matrice di similarità coseno cosine come dati. Le righe e le colonne di questo DataFrame sono indicate dagli indici di final_movie

similarity_with_movie.columns=final_user.index
#Rinomina le colonne di similarity_with_movie con gli indici di final_user.

similarity_with_movie.head()
#Restituisce le prime righe del DataFrame risultante.

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.988283,0.978406,0.96422,0.986819,0.970456,0.971643,0.987468,0.986382,0.973397,...,0.987335,0.978916,0.917922,0.983978,0.978638,0.959693,0.97612,0.932806,0.98938,0.952774
2,0.988283,0.0,0.987141,0.971166,0.995793,0.979893,0.981852,0.995168,0.995108,0.981285,...,0.996067,0.988455,0.929086,0.993014,0.988206,0.968868,0.983619,0.940224,0.997957,0.963114
3,0.978406,0.987141,0.0,0.961237,0.985179,0.970773,0.971932,0.98514,0.985263,0.971464,...,0.986072,0.978562,0.921433,0.983193,0.978363,0.957067,0.974114,0.930653,0.988086,0.954265
4,0.96422,0.971166,0.961237,0.0,0.968638,0.955187,0.958876,0.97009,0.969158,0.959626,...,0.970625,0.964815,0.903118,0.967106,0.963962,0.942701,0.958891,0.911591,0.9721,0.935866
5,0.986819,0.995793,0.985179,0.968638,0.0,0.978368,0.980011,0.992905,0.993494,0.979161,...,0.994448,0.986028,0.928126,0.991066,0.98609,0.96723,0.982366,0.938353,0.996584,0.960854


In [70]:
#calcolo la similarità coseno tra le righe della tabella final_user e creare un DataFrame chiamato similarity_with_user

b = cosine_similarity(final_user)
#Calcola la similarità coseno tra le righe di final_user. b sarà una matrice quadrata

np.fill_diagonal(b, 0 )
#Imposta a zero tutti gli elementi sulla diagonale di cosine

similarity_with_user = pd.DataFrame(b,index=final_user.index)
#Crea un DataFrame similarity_with_user utilizzando la matrice di similarità coseno b come dati

similarity_with_user.columns=final_user.index
#Rinomina le colonne di similarity_with_user con gli indici di final_user.

similarity_with_user.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.001264516,0.0005525772,0.048419,0.021847,-0.045497,-0.006199672,0.047013,0.01950985,-0.008754088,...,0.018127,-0.017172,-0.015221,-0.03705875,-0.02912138,0.012016,0.055261,0.075224,-0.02571255,0.010932
2,0.001265,0.0,4.958179e-30,-0.017164,0.021796,-0.021051,-0.01111357,-0.048085,1.029595e-29,0.003011629,...,-0.050551,-0.031581,-0.001688,1.163171e-30,-4.534179e-30,0.006226,-0.020504,-0.006001,-0.060091,0.024999
3,0.000553,4.958179e-30,0.0,-0.01126,-0.031539,0.0048,4.903098e-31,-0.032471,2.3241849999999997e-30,8.791995000000001e-31,...,-0.004904,-0.016117,0.017749,2.836948e-31,-0.001430628,-0.037289,-0.007789,-0.013001,4.3435379999999996e-30,0.01955
4,0.048419,-0.01716402,-0.01125978,0.0,-0.02962,0.013956,0.05809139,0.002065,-0.005873603,0.05159032,...,-0.037687,0.063122,0.02764,-0.01378212,0.04003747,0.02059,0.014628,-0.037569,-0.01788358,-0.000995
5,0.021847,0.02179571,-0.03153892,-0.02962,0.0,0.009111,0.01011715,-0.012284,2.7419629999999997e-30,-0.03316512,...,0.015964,0.012427,0.027076,0.01246135,-0.03627206,0.026319,0.031896,-0.001751,0.09382892,-0.000278


In [71]:
#Definisco una funzione che trova n vicini di un determinato id
def find_n_neighbours(df,n):
    order = np.argsort(df.values, axis=1)[:, :n]
    #Per ogni riga di df, restituisce gli indici ordinati degli elementi in modo crescente, selezionando solo i primi n indici per ogni riga

    # Applica una funzione lambda per ottenere gli indici dei primi "n" vicini ordinati per similarità decrescente
    df = df.apply(lambda x: pd.Series(x.sort_values(ascending=False)
           .iloc[:n].index,
          index=['top{}'.format(i) for i in range(1, n+1)]), axis=1)
    #Per ogni riga x, crea una Serie che contiene gli indici degli elementi ordinati in modo decrescente.
    return df



In [72]:
#Top 10 neighbours for each user
sim_user_10_u = find_n_neighbours(similarity_with_user,10)
sim_user_10_u

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,301,597,414,477,57,369,206,535,590,418
2,189,246,378,209,227,326,393,332,196,528
3,441,496,549,231,527,537,313,518,244,246
4,75,137,590,391,43,128,462,250,290,85
5,145,35,565,134,58,444,446,347,530,142
...,...,...,...,...,...,...,...,...,...,...
606,590,286,182,434,18,474,387,489,354,288
607,164,183,78,126,498,588,176,557,370,574
608,45,307,387,428,599,91,580,414,316,425
609,565,242,455,102,530,444,54,512,589,566


In [73]:
#Top 10 neighbours for each movie
sim_movie_10_m = find_n_neighbours(similarity_with_movie,10)
sim_movie_10_m

Unnamed: 0_level_0,top1,top2,top3,top4,top5,top6,top7,top8,top9,top10
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,49,54,72,53,515,550,189,133,26,513
2,49,189,515,53,25,145,54,26,87,194
3,515,49,25,53,496,54,442,72,26,87
4,581,54,189,49,25,515,300,53,472,251
5,145,49,515,53,609,26,189,550,35,87
...,...,...,...,...,...,...,...,...,...,...
606,54,609,289,49,550,53,515,25,87,26
607,54,164,53,163,25,87,293,515,49,550
608,431,158,289,316,293,26,53,54,13,157
609,54,53,49,515,25,87,194,550,26,189


Find neighbors in common from two person

In [74]:
#Funzione che seleziona casualmente due persone diverse dal dataframe ratings
def candidate_group_person():
    people_tot = []           #lista vuota
    found = True              #Variabile booleano

    #Loop di selezione casuale
    while found:
        person_one = random.randint(1, 610)
        person_two = random.randint(1, 610)
        #controllo che le due persone devono essere diverse, e devono esistere nel dataframe ratings, se esisto imposto found su false ed esco dal ciclo
        if(person_one != person_two and (ratings['userId'] == person_one).any() and (ratings['userId'] == person_two).any()):
            found = False
            people_tot.append(person_one)
            people_tot.append(person_two)
            #Aggiungo le persone alla lista

    return people_tot

In [75]:
#Funzione che trova elementi comuni all'interno di due serie
def find_common_elements(series1, series2):
    common_elements = series1[series1.isin(series2)]
    return common_elements

In [76]:
people=[0,1]
sim_user_30_u1 = sim_user_10_u.iloc[0]
sim_user_30_u2 = sim_user_10_u.iloc[1]
#Inizializzo delle persone e seleziono le prime due righe di similitudini

find_neighbors = find_common_elements(sim_user_30_u1, sim_user_30_u2)
#Ricerca degli elementi in comune tra i due vettori

while find_neighbors.empty :
    people = candidate_group_person()
    sim_user_10_u1 = sim_user_10_u.iloc[people[0]]
    sim_user_10_u2 = sim_user_10_u.iloc[people[1]]
    find_neighbors = find_common_elements(sim_user_10_u1, sim_user_10_u2)
#Loop finche non si trovano elementi in comune

In [77]:
people

[296, 234]

In [78]:
find_neighbors

top5    512
Name: 297, dtype: int64

In [79]:
sim_user_10_u1

top1     588
top2     368
top3     372
top4     102
top5     512
top6      32
top7     437
top8     574
top9     112
top10    481
Name: 297, dtype: int64

In [80]:
sim_user_10_u2

top1     107
top2      46
top3     512
top4     179
top5     498
top6     347
top7     455
top8     394
top9     117
top10    229
Name: 235, dtype: int64

In [81]:
#Creazione di un dataFrame neighbors_dataframe
neighbors_dataFrame = pd.DataFrame(columns=['i_u1', 'i_u2', 'value', 'weighted_average']) #con colonne quei nomi li

#Itero per gli elementi di find_neightbors
#i_u1 e i_u2: Estraggono gli indici da sim_user_10_u1 e sim_user_10_u2, eseguendo alcune operazioni di formattazione sulla stringa per ottenere l'indice desiderato.
for index, value in find_neighbors.iteritems():
    i_u1 = sim_user_10_u1.index[sim_user_10_u1 == value][0]
    i_u1 = i_u1.split("top")
    i_u1 = i_u1[1]
    i_u2 = sim_user_10_u2.index[sim_user_10_u2 == value][0]
    i_u2 = i_u2.split("top")
    i_u2 = i_u2[1]
    weighted_average = int(i_u1) + int(i_u2)    # Calcola la somma degli indici convertiti in interi
    new_rows = pd.DataFrame([[ i_u1, i_u2, value, weighted_average]], columns=['i_u1', 'i_u2', 'value', 'weighted_average'])  #Crea un nuovo DataFrame con le informazioni estratte e calcolate
    neighbors_dataFrame = pd.concat([neighbors_dataFrame, new_rows], ignore_index=True)       #Concatena il nuovo DataFrame a neighbors_dataFrame.
    find_neighbors = find_neighbors.drop(index=index)     # Rimuove l'elemento corrente da find_neighbors

neighbors_dataFrame['weighted_average'] = neighbors_dataFrame['weighted_average'].astype(float)   #Converte la colonna 'weighted_average' in tipo float.
neighbors_dataFrame

  for index, value in find_neighbors.iteritems():


Unnamed: 0,i_u1,i_u2,value,weighted_average
0,5,3,512,8.0


In [82]:
#Funzione che cerca film che sono stati valutati da due utenti
def get_user_similar_movies( user1, user2 ):
  #Filtra le righe del DataFrame rating_avg per ottenere solo le valutazioni dell'utente user1
    common_movies = rating_avg[rating_avg.userId == user1].merge(
    rating_avg[rating_avg.userId == user2],
    on = "movieId",
    how = "inner" )
    #Esegue una fusione interna tra le valutazioni dell'utente user1 e user2 basandosi sulla colonna 'movie_id'.

    return common_movies.merge( movies, on = 'movieId' )
    #Ritorna la fusione tra il DataFrame risultante (common_movies) e il DataFrame movies

In [83]:
a = get_user_similar_movies(370,116)
a = a.loc[ : , ['rating_x_x','rating_x_y','title']]
a.head()

Unnamed: 0,rating_x_x,rating_x_y,title
0,3.5,3.5,Seven (a.k.a. Se7en) (1995)
1,3.5,5.0,"Mask, The (1994)"
2,3.5,5.0,Blade Runner (1982)
3,4.0,4.0,Terminator 2: Judgment Day (1991)
4,4.0,2.0,"Silence of the Lambs, The (1991)"


Film da raccomandare

In [84]:
#nsmallest per ottenere la riga con il valore minimo nella colonna 'weighted_average' nel DataFrame neighbors_dataFrame
neighbor_similar_row = neighbors_dataFrame.nsmallest(1, 'weighted_average')
neighbor_similar_row

Unnamed: 0,i_u1,i_u2,value,weighted_average
0,5,3,512,8.0


In [85]:

df_filtered_u1 = ratings.loc[ratings['userId'] == people[0]]
df_filtered_u1
#filtrare il DataFrame ratings per ottenere solo le righe dove la colonna 'userId' è uguale al primo elemento dell'array people

Unnamed: 0,userId,movieId,rating
44436,296,50,5.0
44437,296,110,5.0
44438,296,296,4.5
44439,296,318,5.0
44440,296,356,5.0
44441,296,527,4.5
44442,296,1201,5.0
44443,296,1704,5.0
44444,296,2028,4.5
44445,296,2324,5.0


In [86]:
df_filtered_u2 = ratings.loc[ratings['userId'] == people[1]]
df_filtered_u2
#filtrare il DataFrame ratings per ottenere solo le righe dove la colonna 'userId' è uguale al secondo elemento dell'array people

Unnamed: 0,userId,movieId,rating
34910,234,1,5.0
34911,234,24,5.0
34912,234,34,3.0
34913,234,42,4.0
34914,234,48,5.0
...,...,...,...
35107,234,4351,4.0
35108,234,4477,1.0
35109,234,4519,4.0
35110,234,4571,4.0


In [87]:
# filtrare il DataFrame ratings per ottenere solo le righe dove la colonna 'user_id' è uguale al valore nella colonna 'value' della riga più simile nel DataFrame neighbors_dataFrame
df_filtered_neighbor = ratings.loc[ratings['userId'] == neighbor_similar_row['value'].iloc[0]]   #Il metodo iloc[0] restituisce il valore della prima riga.
df_filtered_neighbor


Unnamed: 0,userId,movieId,rating
81000,512,2,3.0
81001,512,32,5.0
81002,512,39,4.0
81003,512,47,5.0
81004,512,50,5.0
81005,512,110,5.0
81006,512,150,4.0
81007,512,151,5.0
81008,512,160,4.0
81009,512,161,4.0


In [88]:
#unisco i 3 dataFrame
merged_df = df_filtered_neighbor.merge(df_filtered_u2, on="movieId", how="outer", suffixes=("_neighbor", "_u2"))
merged_df = merged_df.merge(df_filtered_u1, on="movieId", how="outer")

# Found missing values
missing_movies = merged_df.loc[~merged_df["movieId"].isin(df_filtered_u2["movieId"]) & ~merged_df["movieId"].isin(df_filtered_u1["movieId"]), ["movieId", "rating_neighbor"]]
# cerca i film mancanti che sono stati valutati dall'utente vicino (df_filtered_neighbor) ma non dall'utente 1 (df_filtered_u1) o dall'utente 2 (df_filtered_u2)

# Rename column
missing_movies.rename(columns={"rating_neighbor": "rating"}, inplace=True)


missing_movies = missing_movies.sort_values(by=['rating'], ascending=False)

missing_movies

Unnamed: 0,movieId,rating
14,253,5.0
34,457,5.0
3,47,5.0
7,151,5.0
1,32,5.0
10,165,5.0
47,593,5.0
15,288,5.0
49,597,4.0
35,474,4.0


In [89]:
#restituisco solo i film con tating >3
result = missing_movies.loc[missing_movies['rating'] >= 3]
result


Unnamed: 0,movieId,rating
14,253,5.0
34,457,5.0
3,47,5.0
7,151,5.0
1,32,5.0
10,165,5.0
47,593,5.0
15,288,5.0
49,597,4.0
35,474,4.0


In [90]:
id_movie = int(result.iloc[0]['movieId'])
movie_row = movies.loc[movies['movieId'] == id_movie]
#estraggo l'ID del primo film dal DataFrame result, convertirlo in un intero, quindi utilizzare questo ID per ottenere la riga corrispondente dal DataFrame movies.

In [91]:
print("Ciao, il film consigliato per gli utenti: " , people[0], " e ", people [1], " è: " , movie_row['title'].item())

Ciao, il film consigliato per gli utenti:  296  e  234  è:  Interview with the Vampire: The Vampire Chronicles (1994)
