In [1]:
import numpy as np
import pandas as pd

In [2]:
def read_movie_titles():
    rows = []
    keys = ['movie_id', 'release_year', 'movie_title']
    with open('movie_titles.csv', encoding='iso-8859-1') as f:
      for line in f.read().splitlines():
        rows.append(dict(zip(keys, line.split(',', 2))))
    return pd.DataFrame(rows)


In [3]:
df_movie = read_movie_titles()
df_movie = df_movie.astype({'movie_id': 'int32', 'movie_title': 'str'})
df_movie

Unnamed: 0,movie_id,release_year,movie_title
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW
...,...,...,...
17765,17766,2002,Where the Wild Things Are and Other Maurice Se...
17766,17767,2004,Fidel Castro: American Experience
17767,17768,2000,Epoch
17768,17769,2003,The Company


In [4]:
import time
def get_ratings_data():
    start_time = time.time()

    sorted_user_ratings_file = "sorted_user_ratings.csv"
    if not os.path.isfile(sorted_user_ratings_file):

        unsorted_user_ratings_file = "unsorted_user_ratings.csv"
        if not os.path.isfile(unsorted_user_ratings_file):
            print("combining movie ratings data...")
            files = ['combined_data_1.txt', 'combined_data_2.txt', 'combined_data_3.txt', 'combined_data_4.txt']
            movie_id=-1

            with open(unsorted_user_ratings_file, "w+", encoding='iso-8859-1') as f_out:
                f_out.write("user_id,movie_id,rating\n")
                for file in files:
                    print("{}".format(file))
                    with open(file, 'r', encoding='iso-8859-1') as f:
                        for line in f.read().splitlines():
                            if line.endswith(':'):
                                movie_id = line.split(':')[0]
                            else:
                                fields = line.split(',')
                                f_out.write("{},{},{}\n".format(fields[0], movie_id, fields[1]))

        print("saving sorted user ratings to disk...")
        df = pd.read_csv(unsorted_user_ratings_file, encoding='iso-8859-1')
        
        df.sort_values('user_id', ascending=True, inplace=True, kind='quicksort')
        df.to_csv(sorted_user_ratings_file, index=False)
    else:
        df = pd.read_csv(sorted_user_ratings_file, encoding='iso-8859-1')


    end_time = time.time() 
    print("... took {:.3f} seconds".format(end_time - start_time))
    return df

In [5]:
import os
df = get_ratings_data()
df.head(n=20)

... took 15.377 seconds


Unnamed: 0,user_id,movie_id,rating
0,6,14358,2
1,6,6134,4
2,6,5926,4
3,6,6797,3
4,6,3905,3
5,6,17560,3
6,6,13651,3
7,6,7230,5
8,6,3962,4
9,6,12560,3


In [6]:
unique_users = df['user_id'].unique()
unique_movies = df['movie_id'].unique()
print("unique users: {}, unique movies: {}".format(len(unique_users), len(unique_movies)))
print("total records: {}".format(len(df)))

unique users: 480189, unique movies: 17770
total records: 100480507


In [7]:
from collections import defaultdict
from itertools import islice

def read_qualifying():
    movie_id = user_id = None
    user_ids = set()
    movie_user_mapping = defaultdict(list)
    with open("./qualifying.txt", 'r') as f:
        for line in f.read().splitlines():
            if line.endswith(':'):
                movie_id = line
            else:
                user_id, rating_date = line.split(',')
                user_ids.add(user_id)
                movie_user_mapping[movie_id].append(user_id)
    print(len(user_ids))
    print(len(movie_user_mapping))
    print(list(islice(movie_user_mapping.items(), 5)))
    return user_ids, movie_user_mapping
    

In [8]:
import random

def get_random_users():
    users_per_chunk=1000
    c = list(unique_users)
    return random.sample(c, users_per_chunk)


In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
import heapq

def get_recommended_movie_ids(user_id, other_users_ids):
    rating_threshold = 3    # only consider ratings >= 3 for movie suggestion.
    rated_by_me = set(df_subset[ (df_subset['user_id']==user_id)]['movie_id'])
    
#     print("rated by {}: {}".format(user_id, rated_by_me))
    rated_by_others = list()
    for uid in other_users_ids:
        rated_by_others.append(set(df_subset[ (df_subset['user_id']==uid) & (df_subset['rating'] >= rating_threshold) ]['movie_id']))

#     print("rated by {}: {}".format(other_users_ids, rated_by_others))

    rated_by_others_common = set.union(*rated_by_others)
#     print("common movies rated by {}: {}".format(other_users_ids, rated_by_others_common))

    # these movies will be rated now, based on similar user's ratings.
    not_rated_by_me = set.difference(rated_by_others_common, rated_by_me)
#     print("not rated by {}: {}".format(user_id, not_rated_by_me))

    new_ratings = dict()
    for movie_id in not_rated_by_me:
        # add new rating by this user = mean of nearest user's ratings.
        others_ratings = []
        for ouid in other_users_ids:
#             rated = df[ (df_subset['user_id']== ouid ) & (df_subset['movie_id'] == movie_id ) ]
            rated = df_subset[ (df_subset['user_id']== ouid ) & (df_subset['movie_id'] == movie_id ) ]

            if not rated.empty:
                others_ratings.append(rated.iloc[0]['rating'])
#         print("ratings for movie_id {} by {}: {}".format(movie_id, other_users_ids, others_ratings))
        new_rating = np.mean(others_ratings)

        # also save this in a temp dict for top 5 picks.
        df_subset[ (df_subset['user_id']==user_id) & (df_subset['movie_id']==movie_id) ]['rating'] = new_rating
        new_ratings[movie_id] = new_rating
#         print("new rating for movie {} : {:.1f}".format(movie_id, new_rating))

    # pick top 5 movies from the not_rated_by_me 
    max5 = heapq.nlargest(5, new_ratings, key=new_ratings.get)
    
    return max5


    

In [20]:
def get_movie_titles(movie_ids):
    return [ df_movie[ df_movie['movie_id']==x ].iloc[0]['movie_title'] for x in movie_ids ]
    


In [21]:
def get_similarity_matrix(user_ratings_normalized):
    sparse_matrix = sparse.csr_matrix(user_ratings_normalized)
    similarities_sparse = cosine_similarity(sparse_matrix.transpose(), dense_output=False)
    similarities_sparse.setdiag(-1)   # set diagonal entries to lowest values to exclude self similarity.
    return similarities_sparse

In [22]:
def get_similar_user_ids(i, user_id, similarities_sparse):
    N = 2
    col_vals = similarities_sparse.getrow(i).toarray().tolist()[0]

    # N most similar users (last element is the most similar)
    similar_records = np.argsort(col_vals)[-2:]
    similar_user_ids = [user_ids[similar_records[j]] for j in range(len(similar_records)-1, -1, -1)]
    similarity_score = heapq.nlargest(2, col_vals)
    return similar_user_ids, similarity_score


In [23]:
while True:
# user_ids, movie_user_mapping = read_qualifying()
#     df_subset = df[ df['user_id'].isin(user_ids)]
    user_ids = get_random_users()
    df_subset = df[ df['user_id'].isin(user_ids)]

    # get user ratings
    # user_ratings = df_subset.pivot(columns=['user_id'], index='movie_id', values='rating')
    user_ratings = df_subset.pivot(columns=['user_id'], index='movie_id', values='rating')
    print("--- user ratings.")

    # normalize user ratings, drop any users that have rated only 4 or less movies.
    user_ratings_normalized = user_ratings.subtract(user_ratings.mean(axis=0), axis=1)
    user_ratings_normalized = user_ratings_normalized.dropna(thresh=4,axis=1).fillna(0)
    print("randomly chosen {} users".format(user_ratings_normalized.shape[1]))

    # compute similarity matrix using cosine similarity, obtain a sparse matrix.
    similarities_sparse = get_similarity_matrix(user_ratings_normalized)
    print("--- similarity sparse matrix {}".format(user_ratings_normalized.shape))

    # user_ids = df_subset['user_id'].unique()

    # iterate the similarity matrix and find N most similar users for each user.
    for i in range(similarities_sparse.shape[1]):
        user_id = user_ids[i]
        print("user_id:{}".format(user_id))
        similar_user_ids, similarity_score = get_similar_user_ids(i, user_id, similarities_sparse)
        print("similar_user_ids:{}, similarity_score:{}".format(similar_user_ids, similarity_score))

        recommended_movies = get_recommended_movie_ids(user_id, similar_user_ids)
        print("recommended_movies:{}".format(recommended_movies))

        recommended_movie_titles = get_movie_titles(recommended_movies)

        print("user_id: {}, similar users: {}, score: {}".format(user_id, similar_user_ids, similarity_score))
        print("recommended movies: {}".format(recommended_movie_titles))
        print("---")

    response = input("continue? ")
    if response == 'n' or response == 'N':
        break

--- user ratings.
randomly chosen 986 users


  self._set_arrayXarray(i, j, x)


--- similarity sparse matrix (11522, 986)
user_id:793369
similar_user_ids:[2440014, 813022], similarity_score:[0.19501030861788082, 0.1941819549674001]
recommended_movies:[3605, 8226, 12834, 10808, 11323]
user_id: 793369, similar users: [2440014, 813022], score: [0.19501030861788082, 0.1941819549674001]
recommended movies: ["The Wizard of Oz: Collector's Edition", 'Buffy the Vampire Slayer: Season 5', 'Family Guy: Vol. 2: Season 3', 'Being There', 'Shanghai Knights']
---
user_id:719110
similar_user_ids:[856261, 1945447], similarity_score:[0.22342051276816188, 0.1770548649341509]
recommended_movies:[11283, 9279, 2112, 3151, 7249]
user_id: 719110, similar users: [856261, 1945447], score: [0.22342051276816188, 0.1770548649341509]
recommended movies: ['Forrest Gump', 'Sexy Beast', 'Identity', 'Napoleon Dynamite', 'Alfie']
---
user_id:1065289
similar_user_ids:[1367796, 2112694], similarity_score:[0.2189858978664031, 0.19044965913729223]
recommended_movies:[16384, 11277, 3610, 12834, 14890]


KeyboardInterrupt: 

In [None]:
### 