# Imports

In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity

# User Vectors

Let us build the user vectors using the following info:

- age (int)
- sex (boolean)
- occupation (one-hot encoded)
- genre favor vector (sum of `user's rating to a movie * one-hot vector of the movie genres` for all the user ratings)

### Read the tables

In [2]:
df_users = pd.read_csv(
    '../data/interim/preprocessed/users.csv',
    index_col=0
)

print(df_users.shape)
df_users.head()

(943, 3)


Unnamed: 0_level_0,age,sex,occupation
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,24,M,technician
1,53,F,other
2,23,M,writer
3,24,M,technician
4,33,F,other


In [3]:
df_items = pd.read_csv(
    '../data/interim/preprocessed/items.csv',
    index_col=0
)

print(df_items.shape)
df_items.head()

(1680, 20)


Unnamed: 0_level_0,title,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0


In [4]:
genres = ['Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime',
          'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
          'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

df_ratings = pd.read_csv(
    '../data/interim/preprocessed/ratings.csv',
    index_col=0
)

df_ratings = df_ratings.merge(df_items[genres], how='inner', left_on='movie_id', right_index=True)

print(df_ratings.shape)
df_ratings.head()

(99990, 20)


Unnamed: 0_level_0,movie_id,rating,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
195,242,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
62,242,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
225,242,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
153,242,3,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
305,242,5,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### Add genre favor data

In [5]:
for genre in genres:
    df_ratings[genre] = df_ratings['rating'] * df_ratings[genre]

user_genre_sum = df_ratings.groupby('user_id')[genres].sum()#.reset_index()

# normalize each user's genre fevor vector independently
user_genre_sum[genres] = user_genre_sum[genres].div(user_genre_sum[genres].max(axis=1), axis=0)

# merge the genre favors into df_users
df_users = pd.merge(df_users, user_genre_sum, how='left', on='user_id')

# fill nans with 0 (users without ratings for a genre)
df_users[genres] = df_users[genres].fillna(0)

df_users.head()

Unnamed: 0_level_0,age,sex,occupation,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,24,M,technician,0.595238,0.292857,0.095238,0.130952,0.752381,0.204762,0.057143,...,0.016667,0.011905,0.107143,0.090476,0.042857,0.411905,0.409524,0.447619,0.219048,0.052381
1,53,F,other,0.283582,0.097015,0.029851,0.089552,0.455224,0.253731,0.0,...,0.022388,0.067164,0.044776,0.022388,0.104478,0.492537,0.11194,0.320896,0.08209,0.0
2,23,M,writer,0.609375,0.21875,0.0,0.0,0.484375,0.46875,0.078125,...,0.0,0.078125,0.1875,0.0625,0.546875,0.265625,0.34375,0.828125,0.21875,0.0
3,24,M,technician,0.72093,0.325581,0.0,0.0,0.465116,0.44186,0.116279,...,0.0,0.0,0.093023,0.116279,0.465116,0.302326,0.534884,1.0,0.209302,0.0
4,33,F,other,0.715447,0.434959,0.215447,0.288618,1.0,0.142276,0.0,...,0.020325,0.020325,0.288618,0.162602,0.036585,0.178862,0.471545,0.227642,0.182927,0.020325


# Encode age, sex, and occupation

In [6]:
print(df_users['age'].max())

# take as a maximum age
max_age = 100
df_users['age'] /= max_age

73


In [7]:
# one-hot encode the sex column
df_users = pd.get_dummies(df_users, columns=['sex'], prefix='sex').drop(columns=['sex_F'])

In [8]:
# one-hot encode the occupation column
df_users = pd.get_dummies(df_users, columns=['occupation'], prefix='occ')

In [9]:
df_users

Unnamed: 0_level_0,age,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,...,occ_marketing,occ_none,occ_other,occ_programmer,occ_retired,occ_salesman,occ_scientist,occ_student,occ_technician,occ_writer
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.24,0.595238,0.292857,0.095238,0.130952,0.752381,0.204762,0.057143,1.000000,0.016667,...,False,False,False,False,False,False,False,False,True,False
1,0.53,0.283582,0.097015,0.029851,0.089552,0.455224,0.253731,0.000000,1.000000,0.022388,...,False,False,True,False,False,False,False,False,False,False
2,0.23,0.609375,0.218750,0.000000,0.000000,0.484375,0.468750,0.078125,1.000000,0.000000,...,False,False,False,False,False,False,False,False,False,True
3,0.24,0.720930,0.325581,0.000000,0.000000,0.465116,0.441860,0.116279,0.627907,0.000000,...,False,False,False,False,False,False,False,False,True,False
4,0.33,0.715447,0.434959,0.215447,0.288618,1.000000,0.142276,0.000000,0.292683,0.020325,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.26,0.890244,0.426829,0.048780,0.048780,0.829268,0.158537,0.000000,1.000000,0.048780,...,False,False,False,False,False,False,False,True,False,False
939,0.32,0.456647,0.208092,0.052023,0.121387,0.855491,0.173410,0.000000,1.000000,0.000000,...,False,False,False,False,False,False,False,False,False,False
940,0.20,1.000000,0.710526,0.368421,0.236842,0.763158,0.078947,0.000000,0.552632,0.000000,...,False,False,False,False,False,False,False,True,False,False
941,0.48,0.540146,0.379562,0.138686,0.408759,0.656934,0.000000,0.000000,1.000000,0.058394,...,False,False,False,False,False,False,False,False,False,False


# Cosine similarity

Let us compute the cosine similarities between user vectors. Then for each user save the list of other user ids sorted by descending of their similarity.

In [10]:
# extract user vectors as numpy arrays
user_vectors = df_users[df_users.columns].to_numpy()

# compute cosine similarity
similarity_matrix = cosine_similarity(user_vectors, user_vectors)

# dict to store the similarity info
users_similarity = {'user_id': [], 'similar_ids': []}

for i in range(len(df_users)):
    user_ids = range(len(similarity_matrix[i]))

    # get the list of user indices sorted by the cosine similarity with user i
    similar_indices = sorted(user_ids, key=lambda j: similarity_matrix[i][j], reverse=True)
    similar_indices = [(j, similarity_matrix[i][j]) for j in similar_indices if j != i]

    # save the similarity list
    users_similarity['user_id'].append(i)
    users_similarity['similar_ids'].append(similar_indices)

df_users_similarity = pd.DataFrame(users_similarity).set_index('user_id')
df_users_similarity

Unnamed: 0_level_0,similar_ids
user_id,Unnamed: 1_level_1
0,"[(888, 0.9931382418006969), (310, 0.9894742240..."
1,"[(272, 0.9851447001360526), (459, 0.9849620550..."
2,"[(444, 0.9855481657176935), (832, 0.9721014780..."
3,"[(293, 0.972894605130872), (811, 0.95356116078..."
4,"[(416, 0.931826885347274), (37, 0.931367324520..."
...,...
938,"[(31, 0.9885999638136528), (653, 0.98607859350..."
939,"[(499, 0.9916445868018551), (451, 0.9909002287..."
940,"[(520, 0.9760246519310486), (471, 0.9750222721..."
941,"[(279, 0.9735855785281248), (343, 0.9713593657..."


Max and min cosine similarity values:

In [11]:
print(similarity_matrix.max(), similarity_matrix.min())

1.0000000000000007 0.06602377848067785


Let us choose the thresholds for the cosine similarity:

In [15]:
threshold_similarity = 0.9
threshold_count = 2

np.sum(np.count_nonzero(similarity_matrix > threshold_similarity, axis=1) < threshold_count)

17

As you can see, there are only 17 out of 943 users who have less than 2 users with similarity score above 0,9. We can use this 0,9 threshold to filter the similar users that will influence the rating recommended by SVD algorithm.

We can now save the user vectors and similarity table:

In [13]:
df_users.to_csv('../data/interim/users/user_vectors.csv')
df_users_similarity.to_csv('../data/interim/users/users_similarity.csv')