In [1]:
#%pylab inline
import numpy as np
from sklearn.neighbors import NearestNeighbors
from evaluator import Evaluator
from dataset_handler import DatasetHandler
import pandas as pd
import time
import pickle

In [2]:
# NOTE(review): absolute, machine-specific path. The relative file names used by the cells below (u.item, u.data, ...) resolve against this directory.
cd '/Users/snehavenkat/Desktop/BDS/ml-100k'

/Users/snehavenkat/Desktop/BDS/ml-100k


In [3]:
# Data preprocessing, step 1: read the pipe-separated movie and genre tables.
item_columns = [
    'movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime',
    'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery',
    'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
# Movie table, with the columns that are not needed downstream dropped right away.
items = pd.read_csv("u.item", sep='|', names=item_columns).drop(
    ['release_date', 'video_release_date', 'IMDb_URL', 'unknown'], axis=1
)
# Genre lookup table (name / id pairs).
genre = pd.read_csv("u.genre", names=['Genre', 'ID'], sep='|')

In [4]:
# Data preprocessing, step 2.
# The frame is created with no data, so every row is all-NaN and dropna()
# leaves just the three column headers; rows are filled in afterwards via .loc.
df = pd.DataFrame(index=items.index.values, columns=['movieId', 'title', 'genres']).dropna()
# Every column of `items` except the id and the title, in their original order.
l = [column for column in items.columns.values if column not in ('movie_id', 'movie_title')]

In [5]:
# Copy each movie into `df` as (movieId, title, 'Genre1|Genre2|...'),
# leaving out the two titles listed below.
skipped_titles = ('unknown', 'Good Morning (1971)')
i = 1
for index, row in items.iterrows():
    if row['movie_title'] in skipped_titles:
        continue
    df.loc[index, 'movieId'] = row['movie_id']
    i += 1
    df.loc[index, 'title'] = row['movie_title']
    # Names of all flag columns set to 1 for this movie, pipe-separated.
    df.loc[index, 'genres'] = '|'.join(x for x in l if row[x] == 1)

In [6]:
# Keep rows that have a genres value, cast everything to str, then remove rows
# that still carry a NaN-like token before exporting with '$' as the separator.
df_1 = df.loc[df['genres'].notnull()].astype(str)
df_2 = df_1.loc[df_1['genres'].notnull()]
df_new = df_2.loc[df_2['genres'].notnull()]
nan_tokens = ['NaN', 'NaT', 'nan']
has_nan_token = df_new.isin(nan_tokens).any(axis=1)
df_new = df_new.loc[~has_nan_token].dropna()
df_new.to_csv('movies.csv', sep='$', header=False, index=False)

In [7]:
# Rewrite movies.csv as movies.dat, turning every '$' separator into '::'.
with open('movies.csv') as csv_in, open('movies.dat', 'w') as dat_out:
    dat_out.writelines(record.replace('$', '::') for record in csv_in)

In [8]:
# Directory passed to DatasetHandler below.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
dataset100k = "/Users/snehavenkat/Desktop/BDS/ml-100k"

In [9]:
# Load the tab-separated ratings, discard every rating of movie ids 267 and 1373,
# and save the remainder pipe-separated without a header row.
# NOTE(review): presumably 267 / 1373 are the two titles skipped while building movies.csv - confirm.
rating_columns = ['userId', 'movieId', 'rating', 'timestamp']
excluded_movie_ids = [267, 1373]
rating = pd.read_csv("u.data", sep='\t', names=rating_columns)
rating = rating[~rating.movieId.isin(excluded_movie_ids)]
rating.to_csv('ratings.csv', sep='|', header=False)

In [10]:
# Rewrite ratings.csv as ratings.dat, turning every '|' separator into '::'.
with open('ratings.csv') as csv_in, open('ratings.dat', 'w') as dat_out:
    dat_out.writelines(record.replace('|', '::') for record in csv_in)

In [11]:
# DatasetHandler comes from the local dataset_handler module imported in the first cell.
# It is pointed at the data directory and asked for the users' ratings;
# user_ratings is indexed by user id further down (e.g. user_ratings[120]).
dataset_handler = DatasetHandler(dataset100k)
user_ratings = dataset_handler.load_users_ratings()

In [12]:
class ContentBasedRecommender(object):
    """Content-based movie recommender using cosine nearest neighbours.

    Movie feature vectors are obtained once from ``dataset_handler.load_movies()``.
    A user profile is a 2-tuple ``(profile_vector, user_ratings)`` where
    ``profile_vector`` is the rating-weighted average of the vectors of the
    movies the user rated and ``user_ratings`` maps movie id -> rating.
    """

    def __init__(self, dataset_handler):
        """Keep a reference to the handler and load the movie vectors."""
        self.dataset_handler = dataset_handler
        self.movies_vectors = self.dataset_handler.load_movies()

    def train(self, train_set):
        """No-op: ``train_set`` is ignored."""
        pass

    def top(self, user_profile, topN):
        """Return the ids of the ``topN`` movies closest to the profile vector."""
        return self._cosineKNN_all_movies(user_profile[0], topN)

    def predict_rating(self, user_profile, movieId):
        """Predict a rating as the plain mean of the user's ratings for the
        (up to) 5 rated movies most similar to ``movieId``."""
        nearest_watched_movies = self._cosineKNN_movies_subset(user_profile[1].keys(), movieId, 5)
        return np.average(np.array([user_profile[1][movie] for movie in nearest_watched_movies]))

    def create_user_profile(self, user_ratings):
        """Build the user profile tuple from a ``{movie_id: rating}`` mapping.

        Returns ``(weighted_average_vector, user_ratings)``; the ratings are
        used as the weights of the average.
        """
        # Materialise the pairs once so vectors and weights share one ordering.
        # Building the weights from an explicit list also works on Python 3,
        # where dict.values() is a view that np.array() cannot turn into a
        # numeric array.
        rated_movies = list(user_ratings.items())
        rated_vectors = np.array([
            self.movies_vectors[self.dataset_handler.id2index(movie)]
            for (movie, _) in rated_movies
        ])
        rating_weights = np.array([rating for (_, rating) in rated_movies])
        return (
            np.average(rated_vectors, weights=rating_weights, axis=0),
            user_ratings
        )

    def present_user_profile(self, user_profile):
        """Print the genre at the profile vector's largest component, followed
        by every rated movie with its genres and rating."""
        # Single-argument print(...) produces the same text on Python 2 and 3.
        favourite_genre = self.dataset_handler.feature_index2genre(np.argmax(user_profile[0]))
        print("User favourite genre: {}".format(favourite_genre))
        print("User ratings:")
        for (movieId, rating) in user_profile[1].items():
            movie_vector = self.movies_vectors[self.dataset_handler.id2index(movieId)]
            print("{} {}: {}".format(
                self.dataset_handler.id_to_title[movieId],
                self.dataset_handler.movie_vector2genres(movie_vector),
                rating
            ))

    def present_recommendations(self, recommendations):
        """Print the title and genres of every recommended movie id."""
        # NOTE(review): this prints the whole movie-vector matrix before the
        # list; it looks like leftover debugging output - confirm it is wanted.
        print(self.movies_vectors)
        print("Recommended movies:")
        for movieId in recommendations:
            movie_vector = self.movies_vectors[self.dataset_handler.id2index(movieId)]
            print("{} {}".format(
                self.dataset_handler.id_to_title[movieId],
                self.dataset_handler.movie_vector2genres(movie_vector)
            ))

    def _cosineKNN_all_movies(self, user_profile, k):
        """Return the ids of the ``k`` movies nearest (cosine) to ``user_profile``.

        Side effect: the fitted model is pickled to ``cosineKNN.dat`` in the
        current working directory on every call.
        """
        nbrs = NearestNeighbors(metric='cosine', algorithm='brute')
        nbrs.fit(self.movies_vectors)
        # Context manager so the file handle is closed even if dump() fails.
        with open('cosineKNN.dat', 'wb') as model_file:
            pickle.dump(nbrs, model_file)
        neighbour_indices = nbrs.kneighbors(
            np.array([user_profile]), n_neighbors=k, return_distance=False
        )[0]
        return self.dataset_handler.indices2ids(neighbour_indices)

    def _cosineKNN_movies_subset(self, movies_subset, movieId, k):
        """Return the ids of the movies in ``movies_subset`` nearest (cosine)
        to ``movieId``.

        At most ``k`` ids are returned; fewer when the subset holds fewer than
        ``k`` movies. ``movies_subset`` must not be empty.
        """
        watched_movies = list(movies_subset)
        # n_neighbors is passed by keyword (newer scikit-learn rejects it
        # positionally) and clamped so small subsets do not raise.
        nbrs = NearestNeighbors(
            n_neighbors=min(k, len(watched_movies)), metric='cosine', algorithm='brute'
        )
        # Column 0 carries the movie id, the remaining columns its feature vector.
        movies_with_ids = np.array([
            np.hstack([[watched_movie], self.movies_vectors[self.dataset_handler.id2index(watched_movie)]])
            for watched_movie in watched_movies
        ])
        nbrs.fit(movies_with_ids[:, 1:])
        query = np.array([self.movies_vectors[self.dataset_handler.id2index(movieId)]])
        return movies_with_ids[nbrs.kneighbors(query, return_distance=False)[0], 0]

In [13]:
# Build the recommender, create and display the profile of user 120,
# and report how long the whole step took.
test_time = time.time()
recommender = ContentBasedRecommender(dataset_handler)
user_profile = recommender.create_user_profile(user_ratings[120])
recommender.present_user_profile(user_profile)
elapsed = time.time() - test_time
print("--- Pre-processing time is %s seconds ---" % elapsed)

User favourite genre: Drama
User ratings:
Men in Black (1997) ['Action', 'Adventure', 'Comedy', 'Sci-Fi']: 2
Contact (1997) ['Drama', 'Sci-Fi']: 5
Boot, Das (1981) ['Action', 'Drama', 'War']: 5
Toy Story (1995) ['Animation', 'Children', 'Comedy']: 4
Dead Man Walking (1995) ['Drama']: 4
Mr. Holland's Opus (1995) ['Drama']: 4
Ghost and the Darkness, The (1996) ['Action', 'Adventure']: 3
Mission: Impossible (1996) ['Action', 'Adventure', 'Mystery']: 4
Birdcage, The (1996) ['Comedy']: 5
Time to Kill, A (1996) ['Drama']: 4
White Squall (1996) ['Adventure', 'Drama']: 4
English Patient, The (1996) ['Drama', 'Romance', 'War']: 5
Broken Arrow (1996) ['Action', 'Thriller']: 2
Star Wars (1977) ['Action', 'Adventure', 'Romance', 'Sci-Fi', 'War']: 4
Daylight (1996) ['Action', 'Adventure', 'Thriller']: 2
Devil's Own, The (1997) ['Action', 'Drama', 'Thriller', 'War']: 3
Ransom (1996) ['Drama', 'Thriller']: 4
Michael Collins (1996) ['Drama', 'War']: 4
People vs. Larry Flynt, The (1996) ['Drama']: 2
Je

In [14]:
# Fetch and display the 40 movies closest to the user's profile, timing the step.
test_time = time.time()
top = recommender.top(user_profile, topN=40)
recommender.present_recommendations(top)
elapsed = time.time() - test_time
print("--- Test time is %s seconds ---" % elapsed)

[[0 0 1 ..., 0 0 0]
 [1 1 0 ..., 1 0 0]
 [0 0 0 ..., 1 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]
Recommended movies:
Empire Strikes Back, The (1980) ['Action', 'Adventure', 'Drama', 'Romance', 'Sci-Fi', 'War']
Ben-Hur (1959) ['Action', 'Adventure', 'Drama']
Devil's Own, The (1997) ['Action', 'Drama', 'Thriller', 'War']
Braveheart (1995) ['Action', 'Drama', 'War']
Glory (1989) ['Action', 'Drama', 'War']
G.I. Jane (1997) ['Action', 'Drama', 'War']
Full Metal Jacket (1987) ['Action', 'Drama', 'War']
Boot, Das (1981) ['Action', 'Drama', 'War']
Heaven & Earth (1993) ['Action', 'Drama', 'War']
First Knight (1995) ['Action', 'Adventure', 'Drama', 'Romance']
Crying Game, The (1992) ['Action', 'Drama', 'Romance', 'War']
Men of Means (1998) ['Action', 'Drama']
Marked for Death (1990) ['Action', 'Drama']
Romper Stomper (1992) ['Action', 'Drama']
Perfect World, A (1993) ['Action', 'Drama']
Program, The (1993) ['Action', 'Drama']
Target (1995) ['Action', 'Drama']
Toky