# prep and read data

Credit : https://github.com/topspinj/recommender-tutorial/blob/master/part-1-item-item-recommender.ipynb


In [2]:
import numpy as np 
import pandas as pd 

import os


from datetime import datetime

import matplotlib
matplotlib.use('nbagg')

import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
%matplotlib inline

import seaborn as sns
sns.set_style('whitegrid')
import os
from scipy import sparse
from scipy.sparse import csr_matrix

from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
import random

In [3]:
for dirname, _, filenames in os.walk('/Users/luwu/Downloads/archive'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/Users/luwu/Downloads/archive/movie_titles.csv
/Users/luwu/Downloads/archive/qualifying.txt
/Users/luwu/Downloads/archive/combined_data_2.txt
/Users/luwu/Downloads/archive/combined_data_3.txt
/Users/luwu/Downloads/archive/combined_data_1.txt
/Users/luwu/Downloads/archive/combined_data_4.txt
/Users/luwu/Downloads/archive/README
/Users/luwu/Downloads/archive/probe.txt


In [70]:
title=pd.read_csv('/Users/luwu/Downloads/archive/movie_titles.csv',encoding='latin1')

In [71]:
title=title[['year', 'title']]

In [73]:
title=title.reset_index()

In [74]:
title

Unnamed: 0,index,year,title
0,0,2003.0,Dinosaur Planet
1,1,2004.0,Isle of Man TT 2004 Review
2,2,1997.0,Character
3,3,1994.0,Paula Abdul's Get Up & Dance
4,4,2004.0,The Rise and Fall of ECW
...,...,...,...
17765,17765,2002.0,Where the Wild Things Are and Other Maurice Se...
17766,17766,2004.0,Fidel Castro: American Experience
17767,17767,2000.0,Epoch
17768,17768,2003.0,The Company


In [4]:
if not os.path.isfile('data.csv'):
    # Create a file 'data.csv' before reading it
    # Read all the files in netflix and store them in one big file('data.csv')
    # reading from each of the four files and appendig each rating to a global file 'train.csv'
    data = open('data.csv', mode='w')
    
    row = list()
    files=['/Users/luwu/Downloads/archive/combined_data_1.txt']
    for file in files:
        print("Reading ratings from {}...".format(file))
        with open(file) as f:
            for line in f: 
                del row[:] 
                line = line.strip()
                if line.endswith(':'):
                    # All below are ratings for this movie, until another movie appears.
                    movie_id = line.replace(':', '')
                else:
                    row = [x for x in line.split(',')]
                    row.insert(0, movie_id)
                    data.write(','.join(row))
                    data.write('\n')
        print("Done.\n")
    data.close()

In [75]:

df = pd.read_csv('data.csv', sep=',', 
                       names=['movie', 'user','rating','date'])
df.date = pd.to_datetime(df.date)


# we are arranging the ratings according to time.
print('Sorting the dataframe by date..')
df.sort_values(by='date', inplace=True)
print('Done..')

Sorting the dataframe by date..
Done..


In [46]:
df

Unnamed: 0,movie,user,rating,date,movie_id,title
0,1969,204439,3,1999-12-10,1969.0,The Libertine
1,1969,204439,3,1999-12-10,1969.0,Guns of the Magnificent Seven
2,1969,204439,3,1999-12-10,1969.0,The Prime of Miss Jean Brodie
3,1969,204439,3,1999-12-10,1969.0,My Side of the Mountain
4,1969,204439,3,1999-12-10,1969.0,Benny Hill: Complete and Unadulterated: The Na...
...,...,...,...,...,...,...
124074097,1979,660764,3,2005-12-31,1979.0,Macbeth
124074098,1979,660764,3,2005-12-31,1979.0,Don Giovanni
124074099,1979,660764,3,2005-12-31,1979.0,Going in Style
124074100,1979,660764,3,2005-12-31,1979.0,The Europeans


In [6]:
def missing_df(df):
    missing_df=pd.DataFrame(df.apply(lambda x:sum(x.isnull())/len(df)))
    missing_df.columns=['pct_missing']
    missing_df=missing_df.sort_values(by='pct_missing', ascending=False)
    return missing_df


#drop duplicates
def drop_dup(df):
    df=df.drop_duplicates()
    return df

In [7]:
df=drop_dup(df) #remove duplicates

In [76]:
#join df and title to get movie titles 

df=pd.merge(df, title, left_on='movie', right_on='index')

In [77]:
sum(df['title'].isna())

0

In [78]:
print(f"number of unique movies: {df['movie'].nunique()}")
print(f"number of unique users:{df['user'].nunique()}")

number of unique movies: 4499
number of unique users:470758


In [79]:
#how many movies each user rated?

user_movies=df.groupby('user').count()

In [80]:
user_movies['movie'].describe() # on average users rate 51 movies 

count    470758.000000
mean         51.095816
std          74.405055
min           1.000000
25%           8.000000
50%          24.000000
75%          64.000000
max        4467.000000
Name: movie, dtype: float64

In [81]:
#for each movie, how many reviews?
movies_reviews=df.groupby('movie')['user'].count()


In [82]:
movies_reviews.describe() # on average, movies gaterh 4450 reviews

count      4499.000000
mean       5346.468993
std       16176.313851
min          36.000000
25%         192.000000
50%         552.000000
75%        2538.000000
max      193941.000000
Name: user, dtype: float64

# transform data to matrix

## user-movie matrix where each row is a user and each feature is a movie

In [12]:
from scipy.sparse import csr_matrix

In [57]:
def create_matrix(df):
    N=df['user'].nunique()
    M=df['movie'].nunique()
    user_mapper = dict(zip(np.unique(df["user"]), list(range(N))))
    movie_mapper = dict(zip(np.unique(df["movie"]), list(range(M))))
    
    user_inv_mapper = dict(zip(list(range(N)), np.unique(df["user"])))
    movie_inv_mapper = dict(zip(list(range(M)), np.unique(df["movie"])))
    
    user_index = [user_mapper[i] for i in df['user']]
    movie_index = [movie_mapper[i] for i in df['movie']]

    X = csr_matrix((df["rating"], (movie_index, user_index)), shape=(M, N))
    
    return X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper

In [41]:
np.unique(df["user"])

array([      6,       7,      10, ..., 2649409, 2649426, 2649429])

In [83]:
X, user_mapper, movie_mapper, user_inv_mapper, movie_inv_mapper = create_matrix(df)

In [84]:
#check sparsity of the matrix 

sparsity = X.count_nonzero()/(X.shape[0]*X.shape[1])

print(f"Matrix sparsity: {round(sparsity*100,2)}%")

Matrix sparsity: 1.14%


only 1.14% of  cells were populated, but it's more than 0.05% recommended bottomline. 

In [19]:
from scipy.sparse import save_npz

save_npz('user_movie_matrix.npz', X)

In [45]:
# Use KNN to identify similar movies

from sklearn.neighbors import NearestNeighbors

def find_similar_movies(movie_id, X, k, metric='cosine', show_distance=False):
    """
    Finds k-nearest neighbours for a given movie id.
    
    Args:
        movie_id: id of the movie of interest
        X: user-item utility matrix
        k: number of similar movies to retrieve
        metric: distance metric for kNN calculations
    
    Returns:
        list of k similar movie ID's
    """
    neighbour_ids = []
    
    movie_ind = movie_mapper[movie_id]
    movie_vec = X[movie_ind]
    k+=1
    kNN = NearestNeighbors(n_neighbors=k, algorithm="brute", metric=metric)
    kNN.fit(X)
    if isinstance(movie_vec, (np.ndarray)):
        movie_vec = movie_vec.reshape(1,-1)
    neighbour = kNN.kneighbors(movie_vec, return_distance=show_distance)
    for i in range(0,k):
        n = neighbour.item(i)
        neighbour_ids.append(movie_inv_mapper[n])
    neighbour_ids.pop(0)
    return neighbour_ids

In [85]:

movie_titles = dict(zip(df['movie'],df['title']))

movie_id = 1979

similar_ids = find_similar_movies(movie_id, X, k=10)
movie_title = movie_titles[movie_id]

print(f"Because you watched {movie_title}")
for i in similar_ids:
    print(movie_titles[i])


Because you watched Alien: Resurrection: Collector's Edition: Bonus Material
The Great Locomotive Chase
Adored: Diary of a Porn Star
411VM Skateboarding: Issue 61
Showdown in Little Tokyo
Clive Barker's Salome / The Forbidden
Godmother
The Dead Hate the Living
Modern Times
Ben Harper: Live at the Hollywood Bowl
Denise Austin: Ultimate Fat Burner
