In [2]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt

# Both files are "::"-delimited and have no header row, so column names are supplied here.
# A multi-character separator is handled by the Python parsing engine.
# NOTE(review): the printed titles show mojibake (e.g. "LumiÃ¨re") — confirm the file encoding.
movies = pd.read_csv(
    "Data/movies.dat",
    sep="::",
    header=None,
    names=["Movie ID", "Movie Title", "Genre"],
    engine="python",
)
reviews = pd.read_csv(
    "Data/ratings.dat",
    sep="::",
    header=None,
    names=["User ID", "Movie ID", "Rating", "Rating Timestamp"],
    engine="python",
)

In [3]:
# Quick look at both tables: first rows plus column dtypes / non-null counts.
# DataFrame.info() writes its report itself and returns None, so it is called
# directly; wrapping it in print() only adds a stray "None" line to the output.
print(movies.head())
movies.info()
print(reviews.head())
reviews.info()

   Movie ID                                        Movie Title  \
0         8      Edison Kinetoscopic Record of a Sneeze (1894)   
1        10               La sortie des usines LumiÃ¨re (1895)   
2        12                      The Arrival of a Train (1896)   
3        25  The Oxford and Cambridge University Boat Race ...   
4        91                         Le manoir du diable (1896)   

               Genre  
0  Documentary|Short  
1  Documentary|Short  
2  Documentary|Short  
3                NaN  
4       Short|Horror  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34437 entries, 0 to 34436
Data columns (total 3 columns):
Movie ID       34437 non-null int64
Movie Title    34437 non-null object
Genre          34159 non-null object
dtypes: int64(1), object(2)
memory usage: 807.2+ KB
None
   User ID  Movie ID  Rating  Rating Timestamp
0        1    111161      10        1373234211
1        1    117060       7        1373415231
2        1    120755       6        1373424360
3 

In [15]:
# Row counts of the two tables (one row per movie / per review).
print("number of movies {}, number of reviews {}".format(len(movies), len(reviews)))

number of movies 34437, number of reviews 814505


In [4]:
def NumberOfGenres():
    """Collect the distinct genre names found in the module-level ``movies`` table.

    Each entry of ``movies["Genre"]`` is expected to be a "|"-separated string;
    entries without a ``split`` method (e.g. NaN for a missing genre) are skipped.

    Returns
    -------
    tuple
        ``(count, names)`` where ``names`` is the set of distinct genre names
        and ``count`` is its size.
    """
    distinct_genres = {
        name
        for entry in movies["Genre"]
        if hasattr(entry, "split")  # missing genres are not strings
        for name in entry.split("|")
    }
    return len(distinct_genres), distinct_genres

In [5]:
# Unpack the (count, set-of-names) pair; both names are kept at module level.
genre_summary = NumberOfGenres()
unique, genres = genre_summary
print(*genre_summary)

28 {'Crime', 'Family', 'Drama', 'History', 'Mystery', 'Horror', 'Film-Noir', 'Action', 'Fantasy', 'Adult', 'Documentary', 'Musical', 'Animation', 'Sport', 'Western', 'Comedy', 'Romance', 'Reality-TV', 'War', 'Thriller', 'Sci-Fi', 'Adventure', 'Short', 'Biography', 'News', 'Game-Show', 'Talk-Show', 'Music'}


In [6]:
# Headline statistics of the reviews table: distinct users, missing ratings,
# and the mean / maximum rating.
rating_col = reviews["Rating"]
review_stats = [
    ("number of unique user in the dataset :{}", np.unique(reviews["User ID"]).size),
    ("number of missing rating :{}", rating_col.isnull().sum()),
    ("mean of ratings :{}", rating_col.mean()),
    ("max of ratings :{}", rating_col.max()),
]
for template, value in review_stats:
    print(template.format(value))

number of unique user in the dataset :60283
number of missing rating :0
mean of ratings :7.304460991645232
max of ratings :10


In [7]:
def extract_date(x):
    """Return the four characters inside a trailing "(....)" of a movie title.

    For a title such as "Toy Story (1995)" this yields the string "1995".

    Parameters
    ----------
    x : object
        Movie title. Anything that is not a string ending in ")" — including
        an empty string or a missing value — has no extractable year.

    Returns
    -------
    str or float
        The four characters preceding the closing parenthesis, or ``np.nan``
        when the title does not end with ")".
    """
    # endswith() is safe on "" (indexing x[-1] would raise IndexError), and the
    # isinstance check keeps NaN / non-string titles from raising TypeError.
    if isinstance(x, str) and x.endswith(")"):
        return x[-5:-1]
    return np.nan
# Store the value extracted from each title in a new "date" column.
movies["date"] = movies["Movie Title"].map(extract_date)

In [45]:
# Add one 0/1 column per century ("1800's", "1900's", "2000's") based on the
# first two characters of the "date" column.
# extract_date yields NaN for titles without a trailing ")", and slicing NaN
# (x[:2]) raises TypeError; converting to str first makes such rows compare
# unequal to every prefix, so they are flagged 0 instead of crashing.
years = ['18', '19', '20']
for year in years:
    movies[year + "00's"] = movies["date"].astype(str).str[:2].eq(year).astype(int)

In [9]:
def Dummy_var_genre(val, genre=None):
    """Return 1 if a "|"-separated genre string contains ``genre``, else 0.

    Parameters
    ----------
    val : object
        Genre field of one movie, e.g. "Documentary|Short". Non-string values
        (such as NaN for a missing genre) yield 0.
    genre : str, optional
        Genre name to look for. When omitted, the module-level variable ``g``
        is used, which keeps ``Series.apply(Dummy_var_genre)`` inside a
        ``for g in genres`` loop working as before.

    Returns
    -------
    int
        1 when ``genre`` is one of the listed genres, otherwise 0.
    """
    if genre is None:
        genre = g  # loop variable of the calling cell
    if not isinstance(val, str):
        return 0
    # Compare whole tokens: a substring search would report "Music" for every
    # "Musical" movie.
    return 1 if genre in val.split("|") else 0
    
# One 0/1 indicator column per genre. The loop variable must stay named `g`:
# Dummy_var_genre reads it from module scope.
genre_field = movies["Genre"]
for g in genres:
    movies[g] = genre_field.apply(Dummy_var_genre)

In [None]:
def convert_timestamp(x):
    """Format a Unix timestamp (seconds since the epoch) as "YYYY-MM-DD HH:MM:SS" in UTC.

    The conversion is pinned to UTC so the resulting strings do not depend on
    the timezone of the machine running the notebook (``fromtimestamp`` without
    ``tz`` uses local time).

    Parameters
    ----------
    x : int or float
        Seconds since 1970-01-01 00:00:00 UTC.

    Returns
    -------
    str
        The UTC date and time formatted with "%Y-%m-%d %H:%M:%S".
    """
    moment = datetime.datetime.fromtimestamp(x, tz=datetime.timezone.utc)
    return moment.strftime("%Y-%m-%d %H:%M:%S")
# Human-readable rating time, stored as a formatted string per review.
reviews["date_time"] = reviews["Rating Timestamp"].map(convert_timestamp)

In [12]:
# Preview both tables after the feature columns have been added.
for frame in (movies, reviews):
    print(frame.head())

   Movie ID                                        Movie Title  \
0         8      Edison Kinetoscopic Record of a Sneeze (1894)   
1        10               La sortie des usines LumiÃ¨re (1895)   
2        12                      The Arrival of a Train (1896)   
3        25  The Oxford and Cambridge University Boat Race ...   
4        91                         Le manoir du diable (1896)   

               Genre  date  Crime  Family  Drama  History  Mystery  Horror  \
0  Documentary|Short  1894      0       0      0        0        0       0   
1  Documentary|Short  1895      0       0      0        0        0       0   
2  Documentary|Short  1896      0       0      0        0        0       0   
3                NaN  1895      0       0      0        0        0       0   
4       Short|Horror  1896      0       0      0        0        0       1   

   ...  War  Thriller  Sci-Fi  Adventure  Short  Biography  News  Game-Show  \
0  ...    0         0       0          0      1        

## Rank-based recommendation

In [42]:
def create_ranked_df(movies, reviews, min_ratings=5):
    """Rank movies by average rating, number of ratings and most recent rating.

    Parameters
    ----------
    movies : pandas.DataFrame
        Movie table with a "Movie ID" column; every other column is carried
        through to the result.
    reviews : pandas.DataFrame
        Review table with "Movie ID", "Rating" and "date_time" columns.
    min_ratings : int, default 5
        Minimum number of ratings a movie needs to be kept. The default
        reproduces the previously hard-coded ``num_ratings > 4`` filter.

    Returns
    -------
    pandas.DataFrame
        ``movies`` indexed by "Movie ID" with added "avg_ratings",
        "num_ratings" and "last_rating" columns, sorted in descending order by
        those three columns and restricted to movies with at least
        ``min_ratings`` ratings.
    """
    by_movie = reviews.groupby("Movie ID")
    # Select the column before aggregating so max() is computed for
    # "date_time" only, not for every column of the reviews table.
    ratings_count_df = pd.DataFrame({
        "avg_ratings": by_movie["Rating"].mean(),
        "num_ratings": by_movie["Rating"].count(),
        "last_rating": by_movie["date_time"].max(),
    })

    movies_rec = movies.set_index("Movie ID").join(ratings_count_df)
    ranked_movies = movies_rec.sort_values(
        ["avg_ratings", "num_ratings", "last_rating"], ascending=False
    )
    # Movies without any review have NaN counts after the join; the comparison
    # is False for them, so they are dropped as well.
    return ranked_movies[ranked_movies["num_ratings"] >= min_ratings]

def popular_recommendation(user_id, n_top, ranked_movies):
    """Return the titles of the first ``n_top`` rows of ``ranked_movies``.

    Parameters
    ----------
    user_id : object
        Accepted for interface symmetry; not used by this implementation.
    n_top : int
        Number of titles to return.
    ranked_movies : pandas.DataFrame
        Already-ranked movie table with a "Movie Title" column.

    Returns
    -------
    list
        The leading ``n_top`` movie titles, in ranked order.
    """
    return ranked_movies["Movie Title"].iloc[:n_top].tolist()

In [55]:
def popular_recs_filtered(user_id, n_top, ranked_movies, genres, years):
    """Return top-ranked titles restricted to the given release years and genres.

    Parameters
    ----------
    user_id : object
        Accepted for interface symmetry; not used by this implementation.
    n_top : int
        Maximum number of titles to return.
    ranked_movies : pandas.DataFrame
        Already-ranked movie table with "Movie Title", "date" and one numeric
        indicator column per genre.
    genres : list of str
        Genre column names; a movie qualifies when the sum of these columns
        is greater than zero.
    years : list of str
        Accepted values of the "date" column.

    Returns
    -------
    list
        Up to ``n_top`` qualifying movie titles, in ranked order.
    """
    released_in_years = ranked_movies["date"].isin(years)
    candidates = ranked_movies[released_in_years]
    has_wanted_genre = candidates[genres].sum(axis=1).gt(0)
    return candidates.loc[has_wanted_genre, "Movie Title"].iloc[:n_top].tolist()

In [58]:
# Build the ranked table before using it: no cell visible in this notebook
# assigns `ranked_movies`, so the call below would raise NameError on a fresh
# kernel (Restart & Run All).
ranked_movies = create_ranked_df(movies, reviews)
recommendation = popular_recs_filtered("1",20,ranked_movies,["History","Drama"],["2015"])
print(recommendation)

['MSG 2 the Messenger (2015)', 'Make Like a Dog (2015)', 'I Believe in Miracles (2015)', 'Bajrangi Bhaijaan (2015)', 'War Room (2015)', 'Visaaranai (2015)', 'Drishyam (2015)', 'The True Cost (2015)', 'Be Here Now (2015)', 'Lu bian ye can (2015)', 'Birigyaru (2015)', 'The Resurrection of Jake The Snake Roberts (2015)', 'Inside Out (2015)', 'Room (2015)', 'Sado (2015)', 'World of Tomorrow (2015)', 'Salam Neighbor (2015)', 'Racing Extinction (2015)', 'Talvar (2015)', 'A Girl Like Her (2015)']
