In [1]:
import pandas as pd
import numpy as np
import math

# Locations of the input CSV files, relative to the working directory.
ratings_path = "./ml-latest-small/ratings.csv"
movies_path = "./ml-latest-small/movies.csv"
# NOTE(review): tags_path is not referenced by any cell visible in this notebook.
tags_path = "./ml-latest-small/tags.csv"
# Cache files: item_similarity_matrix() writes them and load_precalculated()
# tries to read them back before recomputing.
top_similar_items_path = "./top_similar_items.csv"
top_movie_coefficients_path = "./top_item_coefficients.csv"

# Empty placeholders so the names exist before the loading cell rebinds them.
Ratings = pd.DataFrame()
Movies = pd.DataFrame()
TopSimilarItems = pd.DataFrame()
TopMovieCoefficients = pd.DataFrame()

In [2]:
def load_dataset(dataset):
    """Read a CSV file into a DataFrame and report the outcome on stdout.

    Parameters
    ----------
    dataset : sequence
        Two items: ``dataset[0]`` is the display name used in the status
        message and ``dataset[1]`` is the path handed to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame or None
        The parsed frame, or ``None`` when reading or parsing fails.
    """
    dataset_name, dataset_path = dataset[0], dataset[1]
    try:
        data = pd.read_csv(dataset_path)
    except Exception:
        # Best-effort load: callers treat None as "not available". Catching
        # Exception (not a bare except) lets KeyboardInterrupt/SystemExit through.
        print("CAUTION: {} cannot be read from memory.".format(dataset_name))
        return None
    # Format the message *before* printing; calling .format on the result of
    # print() fails on Python 3 because print() returns None.
    print("{} is successfully read from memory.".format(dataset_name))
    return data

In [3]:
def unskewed_pearson_similarity(movie1, movie2):
    """Return the normalised dot product of two rating vectors.

    The result is ``movie1 . movie2`` divided by both vector lengths.
    ``0`` is returned instead when either vector has (near-)zero length or
    when the two vectors are element-wise identical, which also makes an
    item's similarity to itself 0.
    """
    tolerance = 0.0000001
    numerator = movie1.T.dot(movie2)
    first_norm = math.sqrt(movie1.T.dot(movie1))
    second_norm = math.sqrt(movie2.T.dot(movie2))

    # Guard clauses, checked in the same order as the original condition.
    if first_norm < tolerance:
        return 0
    if second_norm < tolerance:
        return 0
    if (movie1 == movie2).all():
        return 0

    return numerator / first_norm / second_norm

In [4]:
def process_ratings(Ratings):
    """Add per-user adjusted rating columns and build the user x movie matrix.

    Parameters
    ----------
    Ratings : pandas.DataFrame
        Must contain ``userId``, ``movieId`` and ``rating`` columns. The
        input frame is not modified; a merged copy is returned.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        * The ratings with four extra columns: ``rating_mean`` (the user's
          mean rating), ``rating_adjusted`` (rating minus that mean),
          ``mean_skewness`` (the adjusted rating divided by the distance
          from the user's mean to 5 for positive adjustments, or to 0.5 for
          negative ones) and ``rating_unskewed``
          (``rating_adjusted * sqrt(1 + 2 * mean_skewness**2)``).
        * A pivot table of ``rating_unskewed`` indexed by ``userId`` with one
          column per ``movieId``; missing cells are filled with 0.

    Notes
    -----
    The constants 5 and 0.5 presumably are the bounds of the rating scale --
    TODO confirm against the dataset.
    """
    # Average only the rating column, so non-numeric columns cannot break the mean.
    MeanUserRating = (
        Ratings.groupby(['userId'], as_index=False, sort=False)['rating']
        .mean()
        .rename(columns={'rating': 'rating_mean'})
    )
    Ratings = pd.merge(Ratings, MeanUserRating, on='userId', how='left', sort=False)
    Ratings['rating_adjusted'] = Ratings['rating'] - Ratings['rating_mean']

    adjusted = Ratings['rating_adjusted']
    above_mean = adjusted > 0
    below_mean = adjusted < 0
    # Select each ratio only where it applies. Multiplying by a 0/1 mask, as
    # the previous expression did, turns 0/0 into NaN for users whose mean
    # sits on a bound; those NaNs later made pivot_table drop whole movies.
    with np.errstate(divide='ignore', invalid='ignore'):
        upward = adjusted / (5 - Ratings['rating_mean'])
        downward = -adjusted / (Ratings['rating_mean'] - 0.5)
        Ratings['mean_skewness'] = np.where(
            above_mean, upward, np.where(below_mean, downward, 0.0)
        )

    Ratings['rating_unskewed'] = adjusted * np.sqrt(1 + (Ratings['mean_skewness'] ** 2) * 2)
    PivotedMoviesMatrix = Ratings.pivot_table(
        index='userId', columns='movieId', values='rating_unskewed', fill_value=0
    )
    return Ratings, PivotedMoviesMatrix

In [5]:
def item_similarity_matrix(max_items=4501, top_n=1000):
    """Compute, per item, its most similar items and their similarity values.

    Reads the module-level ``distinct_movies`` (sorted unique movie ids) and
    ``PivotedMoviesMatrix`` (one column per movie), and scores each pair with
    ``unskewed_pearson_similarity``. Both result frames are also written to
    ``top_similar_items_path`` and ``top_movie_coefficients_path`` as CSV.

    Parameters
    ----------
    max_items : int or None, optional
        Only the first ``max_items`` entries of ``distinct_movies`` are
        processed; rows of the remaining items stay all-zero. ``None``
        processes every item. Defaults to the previously hard-coded 4501.
    top_n : int, optional
        Number of neighbours kept per item (number of result columns).
        Defaults to the previously hard-coded 1000.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``TopSimilarItems`` and ``TopMovieCoefficients``, each with one row
        per item and ``top_n`` columns, ordered by descending similarity.
        ``TopSimilarItems`` stores *positions* into ``distinct_movies``
        (the output of ``argsort``), not movie ids. Slots without a strictly
        positive similarity hold 0 in both frames, so a positive coefficient
        is the only reliable marker of a real neighbour.
    """
    item_count = distinct_movies.size
    # With fewer items than top_n, fill only the leading columns instead of
    # failing on a shape mismatch.
    kept = min(top_n, item_count)

    # Plain arrays replace the DataFrame scratch space: chained assignment
    # (frame[col][row] = x) and writes through ``.values`` are not guaranteed
    # to reach the frame.
    top_items = np.zeros((item_count, top_n), dtype='float')
    top_coefficients = np.zeros((item_count, top_n), dtype='float')

    for movieIndex in range(len(distinct_movies[:max_items])):
        movie_column = PivotedMoviesMatrix.iloc[:, movieIndex]  # loop-invariant
        similarities = np.zeros(item_count, dtype='float')
        for movie2Index in range(item_count):
            similarities[movie2Index] = unskewed_pearson_similarity(
                movie_column, PivotedMoviesMatrix.iloc[:, movie2Index]
            )

        # Positions ordered by descending similarity, with the matching values.
        order = np.argsort(similarities)[::-1]
        ranked = similarities[order]
        positive = ranked > 0

        top_coefficients[movieIndex, :kept] = np.where(positive, ranked, 0)[:kept]
        top_items[movieIndex, :kept] = np.where(positive, order, 0)[:kept]
        print("Calculated for {} items out of {}.".format(movieIndex + 1, item_count))

    TopSimilarItems = pd.DataFrame(
        top_items, index=np.arange(item_count), columns=np.arange(top_n)
    )
    TopMovieCoefficients = pd.DataFrame(
        top_coefficients, index=np.arange(item_count), columns=np.arange(top_n)
    )

    TopSimilarItems.to_csv(top_similar_items_path, index=False)
    TopMovieCoefficients.to_csv(top_movie_coefficients_path, index=False)
    return TopSimilarItems, TopMovieCoefficients

In [6]:
def load_precalculated(data, name, path, recalculator, arg):
    """Return a cached dataset, recomputing it when the cache cannot be read.

    Parameters
    ----------
    data : object
        Ignored; kept so existing positional calls keep working.
    name : str
        Display name used in status messages.
    path : str
        CSV file passed to ``load_dataset``.
    recalculator : callable
        Zero-argument function called only when loading returns ``None``.
    arg : int or key
        Selects the wanted element from ``recalculator()``'s result.

    Returns
    -------
    The loaded dataset, or ``recalculator()[arg]`` when loading failed.
    """
    cached = load_dataset((name, path))
    if cached is not None:
        return cached
    # Format first, then print: ``print(...).format(...)`` fails on Python 3.
    print("{} data is being recalculated... It might take a while...".format(name))
    return recalculator()[arg]

In [7]:
# Raw ratings -> per-user adjusted columns plus the user x movie matrix.
Ratings = load_dataset(("Ratings", ratings_path))
Ratings, PivotedMoviesMatrix = process_ratings(Ratings)

# Sorted unique ids; other cells map an id to a matrix position with searchsorted.
distinct_movies = np.unique(Ratings['movieId'])
distinct_users = np.unique(Ratings['userId'])

Movies = load_dataset(("Movies", movies_path))

# Use the cached similarity tables when present, otherwise recompute them.
TopSimilarItems = load_precalculated(
    data=TopSimilarItems,
    name="TopSimilarItems",
    path=top_similar_items_path,
    recalculator=item_similarity_matrix,
    arg=0,
)
TopMovieCoefficients = load_precalculated(
    data=TopMovieCoefficients,
    name="TopMovieCoefficients",
    path=top_movie_coefficients_path,
    recalculator=item_similarity_matrix,
    arg=1,
)

Ratings is successfully read from memory.
Movies is successfully read from memory.
CAUTION: TopSimilarItems cannot be read from memory.
TopSimilarItems data is being recalculated... It might take a while...
Calculated for 1 items out of 9066.
Calculated for 2 items out of 9066.
Calculated for 3 items out of 9066.
Calculated for 4 items out of 9066.
Calculated for 5 items out of 9066.
Calculated for 6 items out of 9066.
Calculated for 7 items out of 9066.
Calculated for 8 items out of 9066.
Calculated for 9 items out of 9066.
Calculated for 10 items out of 9066.
Calculated for 11 items out of 9066.
Calculated for 12 items out of 9066.
Calculated for 13 items out of 9066.
Calculated for 14 items out of 9066.
Calculated for 15 items out of 9066.
Calculated for 16 items out of 9066.
Calculated for 17 items out of 9066.
Calculated for 18 items out of 9066.
Calculated for 19 items out of 9066.
Calculated for 20 items out of 9066.
Calculated for 21 items out of 9066.
Calculated for 22 items o

In [39]:
def accumulate_item_recommendations(userId, user_preferences, user_rates):
    """Sum similarity-weighted scores over the neighbours of a user's items.

    Reads the module-level ``distinct_movies``, ``TopSimilarItems`` and
    ``TopMovieCoefficients``.

    Parameters
    ----------
    userId : object
        Unused; kept for call compatibility.
    user_preferences : iterable
        Movie ids the user rated, paired element-wise with ``user_rates``.
    user_rates : iterable of float
        Weight of each preferred movie.

    Returns
    -------
    numpy.ndarray
        One score per entry of ``distinct_movies``: the sum of
        ``rate * coefficient`` over every (preferred movie, neighbour) pair
        that points at that entry.
    """
    recommendations = np.zeros(distinct_movies.size)

    for preference, rate in zip(user_preferences, user_rates):
        preferenceIndex = np.searchsorted(distinct_movies, preference)
        # item_similarity_matrix fills TopSimilarItems with argsort output,
        # i.e. positions into distinct_movies, so index with them directly
        # rather than running searchsorted on them as if they were movie ids.
        twinPositions = TopSimilarItems.values[preferenceIndex].astype(int)
        twinsProximity = TopMovieCoefficients.values[preferenceIndex]

        # Padding slots hold position 0 *and* coefficient 0, while position 0
        # is also a real item, so only the coefficient marks a real neighbour.
        valid = twinsProximity > 0
        np.add.at(recommendations, twinPositions[valid], rate * twinsProximity[valid])

    return recommendations

In [40]:
def item_collaborative_recommendations(userId):
    """Rank movies for a user from the items that user rated above their mean.

    Reads the module-level ``distinct_users``, ``distinct_movies`` and
    ``PivotedMoviesMatrix``, and scores candidates with
    ``accumulate_item_recommendations``.

    Parameters
    ----------
    userId : int
        Id looked up in ``distinct_users``.

    Returns
    -------
    numpy.ndarray
        Movie ids with a strictly positive accumulated score, best first.
        Empty when ``userId`` is not in ``distinct_users``.
    """
    userIndex = np.searchsorted(distinct_users, userId)
    # searchsorted returns an insertion point even for unknown ids, which
    # previously selected another user's row or ran past the matrix.
    if userIndex >= distinct_users.size or distinct_users[userIndex] != userId:
        return distinct_movies[:0]

    # Keep only positive matrix entries, strongest first.
    user_rates = PivotedMoviesMatrix.values[userIndex]
    user_rates = np.where(user_rates > 0, user_rates, 0)
    preference_order = np.argsort(user_rates)[::-1]
    ranked_rates = user_rates[preference_order]
    liked = ranked_rates > 0
    user_preferences = distinct_movies[preference_order][liked]
    user_rates = ranked_rates[liked]

    acc_recommendations = accumulate_item_recommendations(userId, user_preferences, user_rates)
    recommendation_order = np.argsort(acc_recommendations)[::-1]
    scored = acc_recommendations[recommendation_order] > 0
    return distinct_movies[recommendation_order][scored]

In [49]:
# Catalogue rows for the first ten recommended movie ids of one user.
# The rows appear in Movies order, not in recommendation rank.
userId = 5
res = item_collaborative_recommendations(userId=userId)
Movies.loc[Movies['movieId'].isin(res[:10])]

Unnamed: 0,movieId,title,genres


In [9]:
# Inspect the first 4502 rows of the similarity table.
TopSimilarItems.head(4502)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,990,991,992,993,994,995,996,997,998,999
0,2506.0,7532.0,1866.0,955.0,419.0,232.0,966.0,7316.0,7605.0,8519.0,...,1132.0,968.0,2651.0,5468.0,730.0,64.0,4914.0,1755.0,1844.0,1952.0
1,193.0,1706.0,2897.0,2907.0,1939.0,3244.0,447.0,2713.0,2559.0,1594.0,...,3593.0,6570.0,6684.0,3893.0,8065.0,6441.0,8943.0,6561.0,1937.0,5227.0
2,418.0,1223.0,2741.0,338.0,441.0,3161.0,36.0,379.0,3705.0,3057.0,...,3222.0,8745.0,3861.0,319.0,4246.0,4318.0,166.0,2927.0,1745.0,2874.0
3,1772.0,2057.0,1163.0,505.0,710.0,1447.0,2230.0,1469.0,400.0,1222.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,8143.0,5922.0,7183.0,36.0,124.0,398.0,491.0,418.0,77.0,3251.0,...,813.0,370.0,2099.0,1801.0,4059.0,3011.0,2436.0,614.0,3729.0,2530.0
5,2001.0,1255.0,1215.0,1236.0,1880.0,1664.0,2917.0,2462.0,2297.0,458.0,...,7746.0,1529.0,5532.0,5727.0,7623.0,5915.0,2883.0,4087.0,1585.0,7841.0
6,5503.0,4295.0,3616.0,3747.0,1557.0,3316.0,3111.0,3273.0,6244.0,1534.0,...,1140.0,8345.0,5655.0,635.0,5821.0,221.0,2011.0,3215.0,2266.0,158.0
7,2280.0,3451.0,2561.0,2413.0,3775.0,177.0,215.0,1110.0,125.0,459.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,613.0,92.0,158.0,340.0,77.0,3640.0,3590.0,4077.0,3390.0,4076.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2623.0,4564.0,5319.0,2252.0,447.0,383.0,2401.0,258.0,2362.0,1.0,...,82.0,7989.0,184.0,3764.0,6530.0,5106.0,2516.0,8426.0,685.0,6981.0
