In [1]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Load the raw tables. Both files use the two-character separator "::", which
# the C parser cannot handle, hence engine="python".
# The encoding is given explicitly because the previews in this notebook show
# mojibake in accented titles (e.g. "LumiÃ¨re"), the usual symptom of UTF-8
# bytes being decoded with a platform-default codec.
# NOTE(review): assumes the .dat files are UTF-8 encoded -- confirm against the data source.
movies = pd.read_csv(
    "Data/movies.dat",
    delimiter="::",
    header=None,
    names=["Movie ID", "Movie Title", "Genre"],
    engine="python",
    encoding="utf-8",
)
reviews = pd.read_csv(
    "Data/ratings.dat.000",
    delimiter="::",
    header=None,
    names=["User ID", "Movie ID", "Rating", "Rating Timestamp"],
    engine="python",
    encoding="utf-8",
)

In [2]:
# Show the first rows and the column summary of each raw table, movies first.
# DataFrame.info() prints its report itself and returns None, so the outer
# print() also emits a trailing "None" line.
for frame in (movies, reviews):
    print(frame.head())
    print(frame.info())

   Movie ID                                        Movie Title  \
0         8      Edison Kinetoscopic Record of a Sneeze (1894)   
1        10               La sortie des usines LumiÃ¨re (1895)   
2        12                      The Arrival of a Train (1896)   
3        25  The Oxford and Cambridge University Boat Race ...   
4        91                         Le manoir du diable (1896)   

               Genre  
0  Documentary|Short  
1  Documentary|Short  
2  Documentary|Short  
3                NaN  
4       Short|Horror  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34437 entries, 0 to 34436
Data columns (total 3 columns):
Movie ID       34437 non-null int64
Movie Title    34437 non-null object
Genre          34159 non-null object
dtypes: int64(1), object(2)
memory usage: 807.2+ KB
None
   User ID  Movie ID  Rating  Rating Timestamp
0        1    111161    10.0      1.373234e+09
1        1    117060     7.0      1.373415e+09
2        1    120755     6.0      1.373424e+09
3 

In [3]:
# Row counts of the two tables (len() of a DataFrame is its number of rows).
print("number of movies {}, number of reviews {}".format(len(movies), len(reviews)))

number of movies 34437, number of reviews 28335


In [4]:
def NumberOfGenres():
    """Collect the distinct genre labels used in the module-level ``movies`` frame.

    Each entry of ``movies["Genre"]`` is split on "|". Entries that have no
    ``split`` method (e.g. a float NaN for a missing genre) are skipped.

    Returns:
        tuple: ``(count, labels)`` where ``labels`` is the set of distinct
        genre strings and ``count`` is its size.
    """
    distinct = set()
    for entry in movies["Genre"]:
        try:
            parts = entry.split("|")
        except AttributeError:
            # Non-string entry (missing genre): nothing to add.
            continue
        distinct.update(parts)
    return len(distinct), distinct

In [5]:
# Gather the genre vocabulary: `unique` is the number of distinct labels and
# `genres` is the set of labels itself. `genres` is iterated again further
# down to create one indicator column per genre.
unique, genres = NumberOfGenres()
print(unique,genres)

28 {'Comedy', 'Crime', 'Documentary', 'Action', 'Film-Noir', 'History', 'News', 'Family', 'Sport', 'Music', 'Mystery', 'Fantasy', 'Adult', 'Talk-Show', 'Sci-Fi', 'Drama', 'Thriller', 'Game-Show', 'Short', 'Reality-TV', 'Horror', 'Adventure', 'Musical', 'Biography', 'Animation', 'Romance', 'Western', 'War'}


In [6]:
# Quick sanity statistics for the reviews table: distinct users, missing
# ratings, and the mean and maximum of the rating column.
print("number of unique user in the dataset :{}".format(np.unique(reviews["User ID"]).size))
print("number of missing rating :{}".format(reviews["Rating"].isna().sum()))
print("mean of ratings :{}".format(reviews["Rating"].mean()))
print("max of ratings :{}".format(reviews["Rating"].max()))

number of unique user in the dataset :2600
number of missing rating :1
mean of ratings :7.445224818239571
max of ratings :10.0


In [7]:
def extract_date(x):
    """Return the four characters before a trailing ")" of a title, else NaN.

    For a title such as "Some Film (1999)" this yields the string "1999".
    The characters are not checked to be digits; a title that does not end
    with ")" yields ``np.nan``.
    """
    if x[-1] == ")":
        return x[-5:-1]
    return np.nan


# Release year as a string, taken from the end of each title.
movies["date"] = movies["Movie Title"].apply(extract_date)

In [8]:
# One 0/1 indicator column per century ("1800's", "1900's", "2000's"), set when
# the extracted year string starts with the two-digit century prefix.
# The vectorized .str accessor with na=False treats a missing date as "no
# match" (0); slicing a float NaN with x[:2] would raise TypeError instead.
# The explicit int64 keeps the dtype the same on every platform.
years = ['18', '19', '20']
for year in years:
    movies[year + "00's"] = (
        movies["date"].str.startswith(year, na=False).astype("int64")
    )

In [9]:
def Dummy_var_genre(val, genre=None):
    """Return 1 if ``val`` lists ``genre`` among its "|"-separated genres, else 0.

    The comparison is made against whole genre tokens, so "Music" does not
    match an entry such as "Musical|Drama" (a plain substring search would).

    Args:
        val: A genre string like "Comedy|Drama". Values without a ``split``
            method (e.g. a float NaN for a missing genre) yield 0.
        genre: The genre label to look for. When omitted, the module-level
            name ``g`` is used, which keeps one-argument calls such as
            ``Series.apply(Dummy_var_genre)`` inside a ``for g in ...`` loop
            working.

    Returns:
        int: 1 when the genre is present, 0 otherwise.
    """
    if genre is None:
        # Backward-compatible fallback: read the current loop variable.
        genre = g
    try:
        return 1 if genre in val.split("|") else 0
    except AttributeError:
        # Missing genre (NaN is a float and has no .split).
        return 0
    
# Add one indicator column per genre label, named after the label.
# The loop variable must stay named `g`: Dummy_var_genre is called with a
# single argument here and looks up the module-level name `g` to know which
# genre it is testing for.
for g in genres:
    movies[g] = movies["Genre"].apply(Dummy_var_genre)

In [10]:
# Convert the epoch-second timestamps to timezone-aware (UTC) datetimes in a
# single vectorized call instead of invoking pd.to_datetime once per row.
# errors="coerce" turns values that cannot be parsed into NaT.
reviews["date_time"] = pd.to_datetime(
    reviews["Rating Timestamp"], errors="coerce", utc=True, unit="s"
)

In [11]:
# Preview both tables again now that the derived columns have been added.
for frame in (movies, reviews):
    print(frame.head())

   Movie ID                                        Movie Title  \
0         8      Edison Kinetoscopic Record of a Sneeze (1894)   
1        10               La sortie des usines LumiÃ¨re (1895)   
2        12                      The Arrival of a Train (1896)   
3        25  The Oxford and Cambridge University Boat Race ...   
4        91                         Le manoir du diable (1896)   

               Genre  date  1800's  1900's  2000's  Comedy  Crime  \
0  Documentary|Short  1894       1       0       0       0      0   
1  Documentary|Short  1895       1       0       0       0      0   
2  Documentary|Short  1896       1       0       0       0      0   
3                NaN  1895       1       0       0       0      0   
4       Short|Horror  1896       1       0       0       0      0   

   Documentary  ...  Short  Reality-TV  Horror  Adventure  Musical  Biography  \
0            1  ...      1           0       0          0        0          0   
1            1  ...      1

## Rank-based recommendation

In [12]:
def create_ranked_df(movies, reviews, min_ratings=5):
    """Rank movies by average rating, number of ratings and recency of rating.

    Args:
        movies: DataFrame with a "Movie ID" column plus descriptive columns.
        reviews: DataFrame with "Movie ID", "Rating" and "date_time" columns.
        min_ratings: Minimum number of ratings a movie needs in order to be
            kept. The default of 5 matches the former hard-coded
            ``num_ratings > 4`` filter.

    Returns:
        DataFrame indexed by "Movie ID": the movie columns joined with
        "avg_ratings", "num_ratings" and "last_rating", sorted descending on
        those three columns in that order and restricted to movies with at
        least ``min_ratings`` ratings. Movies without any review are dropped
        by that filter because their count is NaN after the join.
    """
    per_movie = reviews.groupby("Movie ID")
    # Select the column before aggregating so only the needed column is
    # reduced, rather than taking the max of every column and discarding
    # all but one.
    ratings_count_df = pd.DataFrame({
        "avg_ratings": per_movie["Rating"].mean(),
        "num_ratings": per_movie["Rating"].count(),
        "last_rating": per_movie["date_time"].max(),
    })

    movies_rec = movies.set_index("Movie ID").join(ratings_count_df)
    ranked_movies = movies_rec.sort_values(
        ["avg_ratings", "num_ratings", "last_rating"], ascending=False
    )
    return ranked_movies[ranked_movies["num_ratings"] >= min_ratings]

def popular_recommendation(user_id, n_top, ranked_movies):
    """Return the titles of the first ``n_top`` rows of ``ranked_movies``.

    Args:
        user_id: Not used by this function; the result is the same for every user.
        n_top: Number of titles to return.
        ranked_movies: DataFrame with a "Movie Title" column, already in
            ranking order.

    Returns:
        list: Movie titles in the order they appear in ``ranked_movies``.
    """
    titles = ranked_movies["Movie Title"]
    return titles.iloc[:n_top].tolist()

In [13]:
def popular_recs_filtered(user_id, n_top, ranked_movies,genres,years):
    """Return the top titles restricted to the given years and genres.

    Args:
        user_id: Not used by this function.
        n_top: Maximum number of titles to return.
        ranked_movies: DataFrame in ranking order with "Movie Title", "date"
            and one indicator column per genre label.
        genres: List of genre column names; a movie qualifies when the sum of
            these columns is greater than 0.
        years: Collection of "date" values to keep (matched with ``isin``).

    Returns:
        list: Titles of the first ``n_top`` qualifying movies, in ranking order.
    """
    in_years = ranked_movies["date"].isin(years)
    candidates = ranked_movies[in_years]
    has_genre = candidates[genres].sum(axis=1) > 0
    return candidates.loc[has_genre, "Movie Title"].iloc[:n_top].tolist()

In [18]:
# Build the ranking once, then ask for the 20 best-ranked 2019 titles that are
# tagged Sci-Fi and/or Action.
ranked_movies = create_ranked_df(movies, reviews)
recommendation = popular_recs_filtered(
    user_id="1",
    n_top=20,
    ranked_movies=ranked_movies,
    genres=["Sci-Fi", "Action"],
    years=["2019"],
)
print(recommendation)

['Spider-Man: Far from Home (2019)', 'Avengers: Endgame (2019)', 'El Camino: A Breaking Bad Movie (2019)', 'Shazam! (2019)', 'Rambo: Last Blood (2019)', 'John Wick: Chapter 3 - Parabellum (2019)', 'Zombieland: Double Tap (2019)', 'Alita: Battle Angel (2019)', 'Captain Marvel (2019)', 'Triple Frontier (2019)', 'Ad Astra (2019)', 'Angel Has Fallen (2019)', 'Fast &amp; Furious Presents: Hobbs &amp; Shaw (2019)', 'Glass (2019)', 'PokÃ©mon Detective Pikachu (2019)', 'Crawl (2019)', 'Anna (2019)', 'Murder Mystery (2019)', 'Dark Phoenix (2019)', 'Men in Black: International (2019)']


## Collaborative filtering: user-based

In [19]:
# Keep only the three columns needed to build the user-by-movie rating matrix.
user_items = reviews.loc[:, ["User ID", "Movie ID", "Rating"]]
print(user_items.head())

   User ID  Movie ID  Rating
0        1    111161    10.0
1        1    117060     7.0
2        1    120755     6.0
3        1    317919     6.0
4        1    454876    10.0


In [20]:
# Pivot to a wide matrix: one row per user, one column per movie, cell = the
# user's rating (the maximum if a user/movie pair occurs more than once).
# Pairs with no rating are NaN.
user_by_movie = (
    user_items
    .groupby(["User ID", "Movie ID"])["Rating"]
    .max()
    .unstack()
)
print(user_by_movie.head())

Movie ID  125       417       6414      12349     12844     13427     \
User ID                                                                
1              NaN       NaN       NaN       NaN       NaN       NaN   
2              NaN       NaN       NaN       NaN       NaN       NaN   
3              NaN       NaN       NaN       NaN       NaN       NaN   
4              NaN       NaN       NaN       NaN       NaN       NaN   
5              NaN       NaN       NaN       NaN       NaN       NaN   

Movie ID  13442     15002     15163     15324     ...  9495224   9541602   \
User ID                                           ...                       
1              NaN       NaN       NaN       NaN  ...       NaN       NaN   
2              NaN       NaN       NaN       NaN  ...       NaN       NaN   
3              NaN       NaN       NaN       NaN  ...       NaN       NaN   
4              NaN       NaN       NaN       NaN  ...       NaN       NaN   
5              NaN       NaN     

In [21]:
def movies_watched(user_id):
    """Return the ids of the movies that ``user_id`` has rated.

    Looks up the user's row in the module-level ``user_by_movie`` matrix and
    keeps the column labels whose value is not null.

    Returns:
        numpy.ndarray: Column labels (movie ids) with a non-null rating.
    """
    user_ratings = user_by_movie.loc[user_id]
    rated = user_ratings.notnull()
    return user_ratings[rated].index.values


def create_user_movie_dict():
    """Map every user in ``user_by_movie`` to the movies that user has rated.

    Iterates over the actual row labels of the module-level ``user_by_movie``
    matrix, so it does not assume that user ids are exactly the consecutive
    integers 1..n. A gap in the ids would raise KeyError under that
    assumption, and ids above n would be skipped silently.

    Returns:
        dict: ``{user_id: array of rated movie ids}`` as produced by
        ``movies_watched``, in the row order of ``user_by_movie``.
    """
    return {user: movies_watched(user) for user in user_by_movie.index}

def create_movies_to_analyze(movie_seen, bound):
    """Keep only the users who have rated more than ``bound`` movies.

    Args:
        movie_seen: Mapping of user id to a sized collection of movie ids.
        bound: Exclusive lower limit on the number of movies.

    Returns:
        dict: The entries of ``movie_seen`` whose collection is longer than
        ``bound``, in their original order.
    """
    return {
        user: watched
        for user, watched in movie_seen.items()
        if len(watched) > bound
    }

In [None]:
def euclidien_distance():
    """Placeholder for a distance computation that has not been written yet.

    The definition had no body, which is a syntax error that stops the cell
    from running. Until the function is implemented, calling it fails loudly.

    NOTE(review): presumably meant to compute a Euclidean distance between
    users' ratings for the user-based collaborative filtering section --
    confirm the intended inputs and output before implementing.

    Raises:
        NotImplementedError: Always.
    """
    raise NotImplementedError("euclidien_distance is not implemented yet")

In [22]:
# Rated movies per user, then only the users with more than 2 rated movies.
movie_seen = create_user_movie_dict()
movie_to_analyze = create_movies_to_analyze(movie_seen=movie_seen, bound=2)