In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the pickled DataFrame from the working directory and preview its first rows.
# NOTE(review): unpickling can execute arbitrary code -- only load 'spectral.pkl'
# if it comes from a trusted source.
df = pd.read_pickle('spectral.pkl')
df.head()

Unnamed: 0,location.lat,location.lon,minority_serving.historically_black,act_scores.midpoint.cumulative,sat_scores.average.overall,demographics.race_ethnicity.white,net_price_less_30000,net_price_30001_48000,net_price_48001_75000,net_price_75001_110000,...,outcome_cohort.part_time.first_time.6yr,outcome_cohort.part_time.first_time.8yr,outcome_cohort.full_time.not_first_time.6yr,outcome_cohort.full_time.not_first_time.8yr,outcome_cohort.part_time.not_first_time.6yr,outcome_cohort.part_time.not_first_time.8yr,division_D1,division_D2,division_D3,division_non_ncaa
0,34.783368,-86.568502,1.0,18.0,850.0,0.034,13075.0,12458.0,15857.0,16022.0,...,2.0,2.0,110.0,110.0,16.0,16.0,0,0,0,1
1,33.50223,-86.80917,0.0,25.0,1147.0,0.5863,13614.0,14746.0,17601.0,18873.0,...,30.0,30.0,686.0,686.0,248.0,248.0,1,0,0,0
2,34.722818,-86.63842,0.0,27.0,1221.0,0.7024,15252.0,17228.0,19178.0,20842.0,...,31.0,31.0,483.0,483.0,131.0,131.0,0,1,0,0
3,32.364317,-86.295677,1.0,18.0,844.0,0.0165,7519.0,2611.0,9831.0,11446.0,...,49.0,49.0,177.0,177.0,36.0,36.0,1,0,0,0
4,33.2144,-87.545766,0.0,27.0,1181.0,0.7807,17263.0,19279.0,21309.0,22594.0,...,82.0,82.0,1131.0,1131.0,210.0,210.0,0,0,0,1


In [4]:
# Read final_df.csv and keep its 'name' column.
# NOTE(review): `names` is not referenced by any cell visible in this section --
# confirm it is still needed.
name=pd.read_csv('final_df.csv')
names=name['name']

Now that we have loaded our data, let's add an extra column called "like" that is always 1.
This seems pointless right now (what information can we gain from a column that always takes the same value?) but it will be useful in the next step.

In [2]:
# Add a constant indicator column; it supplies the cell values for the pivot below.
df['like'] = 1.0
df.head()

Unnamed: 0,userId,movieId,like
0,412,512,1.0
1,458,770,1.0
2,185,37,1.0
3,137,701,1.0
4,190,870,1.0


Now we take this (sparse) dataset, and convert it into a dataframe with 
- a row for each  user (i.e. the user id is the index)
- a column for each movie (the movie id is the name of the column)
- a 1 if the user in that row recommended the movie in that column, or a zero otherwise.
We use `DataFrame.pivot`, taking the cell values from the constant `like` column. Any (user, movie) pair that does not appear in the data comes out as `NaN`, which we replace with zero (i.e. the user hasn't seen that movie).

In [5]:
# Build the user x movie indicator matrix: one row per userId, one column per
# movieId, 1.0 where the pair appears in `df` and 0.0 everywhere else.
# Every later cell reads this frame as `movie_matrix`, so bind that name here;
# `matrix` is kept as an alias so anything using the old name still works.
# NOTE(review): this needs `df` to have 'userId', 'movieId' and 'like' columns.
# The recorded KeyError suggests `df` was a different frame when this cell ran -- confirm.
movie_matrix = df.pivot(index='userId', columns='movieId', values='like').fillna(0)
matrix = movie_matrix
movie_matrix.head()

KeyError: 'userId'

Test the cosine similarity approach. Grab the list of 1s/0s for user 0, and get cosine similarity with every other user:

$$
\cos\theta_j = \frac{u_0 \cdot u_j}{\|u_0\| \, \|u_j\|} = \frac{u_0 \cdot u_j}{\sqrt{(u_0 \cdot u_0)\,(u_j \cdot u_j)}}
$$

where $\theta_j$ is the angle between the vector for user 0 and the vector for user $j$.

Then sort by the cosine similarity, biggest to smallest.

In [4]:
def cosine_similarity(vec1, vec2):
    """Return the cosine of the angle between two equal-length 1-D vectors.

    Computed as ``vec1 . vec2 / sqrt((vec1 . vec1) * (vec2 . vec2))``.

    Parameters
    ----------
    vec1, vec2 : array-like
        Numeric vectors of the same length.

    Returns
    -------
    float
        The cosine similarity. If either vector has zero length the angle is
        undefined; 0.0 is returned instead of dividing 0 by 0 (which would
        give NaN and a RuntimeWarning).
    """
    norm_product = np.sqrt(np.dot(vec1, vec1) * np.dot(vec2, vec2))
    if norm_product == 0:
        # At least one vector is all zeros: treat it as "no similarity".
        return 0.0
    return np.dot(vec1, vec2) / norm_product

# Row of 1s/0s for user 0, compared against every user's row (user 0 included,
# which is why the top score is 1.0).
user_vector = movie_matrix.loc[0]
cosine = movie_matrix.apply(cosine_similarity, axis=1, args=(user_vector,))
# Show the six highest similarities, largest first.
cosine.sort_values(ascending=False).iloc[:6]

userId
0      1.000000
474    0.534522
83     0.520266
378    0.507093
89     0.478091
451    0.462910
dtype: float64

Wrap this behavior up into a function:

In [5]:
def similar_users(user_id, num_users = 5, df = movie_matrix):
    """Return the users whose rows in `df` are most similar to `user_id`'s row.

    Parameters
    ----------
    user_id : label
        Row label of the reference user in `df`.
    num_users : int, default 5
        Maximum number of similar users to return.
    df : pandas.DataFrame, default movie_matrix
        Matrix with one row per user and one column per movie.

    Returns
    -------
    pandas.Series
        Cosine similarity to `user_id`, indexed by user label and sorted from
        most to least similar. The reference user is not included.

    Raises
    ------
    KeyError
        If `user_id` is not a row label of `df`.
    """
    user_vector = df.loc[user_id].values
    # Score against the frame that was passed in; the global matrix is only
    # the default for `df`, not something to read directly.
    cosine = df.apply(lambda row: cosine_similarity(row, user_vector), axis=1)
    # Remove the reference user by label rather than assuming it sorts first:
    # another user with an identical row would tie at 1.0.
    cosine = cosine.drop(user_id).sort_values(ascending=False)
    return cosine.head(num_users)

# Four users most similar to user 0 (the second argument is num_users).
similar_users(0, 4)
    
    

userId
474    0.534522
83     0.520266
378    0.507093
89     0.478091
dtype: float64

Do the same thing, but now use the movie vectors instead.

In [6]:
def similar_movies(movie_id, num_movies = 5, df = movie_matrix):
    """Return the movies whose columns in `df` are most similar to `movie_id`'s.

    Parameters
    ----------
    movie_id : label
        Column label of the reference movie in `df`.
    num_movies : int, default 5
        Maximum number of similar movies to return.
    df : pandas.DataFrame, default movie_matrix
        Matrix with one row per user and one column per movie.

    Returns
    -------
    pandas.Series
        Cosine similarity to `movie_id`, indexed by movie label and sorted
        from most to least similar. The reference movie is not included.

    Raises
    ------
    KeyError
        If `movie_id` is not a column label of `df`.
    """
    movie_vector = df.loc[:, movie_id].values
    # Score against the frame that was passed in; the global matrix is only
    # the default for `df`, not something to read directly.
    cosine = df.apply(lambda col: cosine_similarity(col, movie_vector), axis=0)
    # Remove the reference movie by label rather than assuming it sorts first:
    # another movie with an identical column would tie at 1.0.
    cosine = cosine.drop(movie_id).sort_values(ascending=False)
    return cosine.head(num_movies)

# Four movies most similar to movie 37 (the second argument is num_movies).
similar_movies(37,4)

movieId
451    0.410112
80     0.374352
245    0.367841
856    0.364947
dtype: float64

We can also look at raw counts: among the users who have seen movie `movie_id`, how many have also liked each of the other movies? This gives slightly different results from the cosine similarity.

In [7]:
def similar_movies_by_count(movie_id, num_movies = 5, df=movie_matrix):
    """Rank movies by their column totals among the users who have seen `movie_id`.

    Parameters
    ----------
    movie_id : label
        Column label of the reference movie in `df`.
    num_movies : int, default 5
        Number of entries to return.
    df : pandas.DataFrame, default movie_matrix
        Matrix with one row per user and one column per movie.

    Returns
    -------
    pandas.Series
        Per-movie column sums over the rows where `movie_id` is positive,
        largest first, truncated to `num_movies` entries. The reference movie
        is part of the sum, so it appears in the result as well.
    """
    # Restrict to the rows (users) with a positive entry for the reference movie.
    seen_rows = df.loc[df.loc[:, movie_id] > 0]
    # Column totals over those rows, ordered from largest to smallest.
    ranked = seen_rows.sum(axis=0).sort_values(ascending=False)
    return ranked.iloc[:num_movies]

In [8]:
# Top co-occurrence counts for movie 37, using the default num_movies of 5.
similar_movies_by_count(37)

movieId
37     46.0
856    22.0
451    21.0
327    21.0
234    20.0
dtype: float64

In [9]:
# Keep only the rows (users) whose entry for movie 2 is positive,
# i.e. the users who have seen movie 2.
users = movie_matrix[movie_matrix.loc[:,2] > 0]

In [26]:
# Among the users who have seen movie 2, count how many have also seen movie 803.
# Entries are 0/1, so the column sum is that count.
users.loc[:,803].sum()

16.0