In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
ratings_file = '/content/drive/My Drive/IEOR/ratings.csv'
movie_file = '/content/drive/My Drive/IEOR/movies.csv'
train_file = '/content/drive/My Drive/IEOR/train.csv'
test_file = '/content/drive/My Drive/IEOR/test.csv'

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
ratings = pd.read_csv(ratings_file)
movies = pd.read_csv(movie_file)
training = pd.read_csv(train_file)

In [6]:
test = pd.read_csv(test_file)

## Another Baseline

This was another implementation of a baseline we decided not to do because it may not always return a movie thats also in the test set. This implementation works with the most watched genres an individual watched, and recommends the top rated movies of that genre.

In [10]:
training['genres'] = training['genres'].str.split('|')

In [32]:
avg_movie_rating_df = training.groupby('movieId').mean().reset_index()[['movieId', 'rating']]
avg_movie_rating_df[['title','genres']] = movies[['title','genres']]
avg_movie_rating_df['genres'] = avg_movie_rating_df['genres'].str.split('|')
avg_movie_rating_df = avg_movie_rating_df.sort_values(by='rating', ascending=False).reset_index()
avg_movie_rating_df.head()

Unnamed: 0,index,movieId,rating,title,genres
0,15090,94352,5.0,Chloe (2009),"[Drama, Thriller]"
1,15969,101670,5.0,Tumbledown (1988),[War]
2,15798,100085,5.0,Fate Is the Hunter (1964),[Drama]
3,22186,164260,5.0,"Enemy Within, The (O ehthros mou) (2013)",[Drama]
4,15820,100210,5.0,"President's Man, The (2000)","[Action, Adventure, Thriller]"


In [33]:
#dataframe for user 5
df_user = training[training['userId'] == 5]
df_user.head()

Unnamed: 0,userId,movieId,genres,rating
301218,5,3173,[Drama],3.5
324437,5,6322,"[Crime, Thriller]",3.5
346060,5,5528,"[Drama, Thriller]",3.5
346886,5,3178,[Drama],4.0
347717,5,55765,"[Crime, Drama, Thriller]",4.0


In [34]:
from collections import Counter
def get_most_watched_genre(df_user):
  #from the top 10 rated movies the user gave, what is the most common genre?
  top10 = df_user.sort_values(by='rating', ascending=False)[:10]
  top10 = top10.reset_index()
  all = top10['genres']
  l = [item for sublist in all for item in sublist]
  count = Counter(l)
  most_watched_genre = sorted(count, key=count.get)[-1]
  return most_watched_genre

In [35]:
most_watched = get_most_watched_genre(df_user)

In [38]:
def make_recommendation(userId): 
  #recommend 100 movies with that genre
  recommend = []
  most_watched_genre = most_watched
  movies_watched = df_user['movieId']
  for i in range(len(avg_movie_rating_df)):
    if most_watched_genre in avg_movie_rating_df['genres'][i]:
      if len(recommend) == 100:
        return recommend
      movie = avg_movie_rating_df['movieId'][i]
      if movie not in movies_watched:
        recommend.append(movie)
  return recommend

Recommend 100 movies with the highest ratings that have one of these genres

In [39]:
rec5 = make_recommendation(5)

In [40]:
def pred_rating_df(userId, df_user):
  #return df_user with pred_rating column
  recommendation = make_recommendation(userId)
  rec_movies = avg_movie_rating_df.loc[avg_movie_rating_df['movieId'].isin(recommendation)]
  rec_movies['pred_rating'] = (rec_movies['rating'] + df_user['rating'].mean()) / 2
  return rec_movies

In [41]:
rec_movies = avg_movie_rating_df.loc[avg_movie_rating_df['movieId'].isin(rec5)]
rec_movies['pred_rating'] = (rec_movies['rating'] + df_user['rating'].mean()) / 2
rec_movies

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,index,movieId,rating,title,genres,pred_rating
0,15090,94352,5.0,Chloe (2009),"[Drama, Thriller]",4.62069
2,15798,100085,5.0,Fate Is the Hunter (1964),[Drama],4.62069
3,22186,164260,5.0,"Enemy Within, The (O ehthros mou) (2013)",[Drama],4.62069
5,22162,164061,5.0,Back in the Saddle (Back in the Saddle Again) ...,"[Action, Drama, Western]",4.62069
7,22149,163987,5.0,Superclásico (2011),"[Comedy, Drama]",4.62069
...,...,...,...,...,...,...
193,14654,90730,5.0,Hard Luck (2006),"[Crime, Drama, Thriller]",4.62069
194,14687,91037,5.0,Under the Bridges (Unter den Brücken) (1945),"[Comedy, Drama, Romance]",4.62069
196,14691,91054,5.0,"H.M. Pulham, Esq. (1941)","[Drama, Romance]",4.62069
197,14719,91286,5.0,"Sun Also Rises, The (1957)",[Drama],4.62069


## Another Context Based Model

Another consideration of a context based model for new users is shown below. Please note this is not complete and we decided to go with our other content based model because it was more applicable to testing and incorporation to our hybrid model. This model has 2 interesting functions. 1) recommending movies similar to other movies. 2) recommending movies to users based on their past movie watching experience. They both utilize TFIDF for finding relative importance of the genres.



In [42]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58098 entries, 0 to 58097
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  58098 non-null  int64 
 1   title    58098 non-null  object
 2   genres   58098 non-null  object
dtypes: int64(1), object(2)
memory usage: 1.3+ MB


In [43]:
movies['genres'] = movies['genres'].str.split('|')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),[Comedy]


In [44]:
import itertools
set(itertools.chain.from_iterable(movies.genres))

{'(no genres listed)',
 'Action',
 'Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Fantasy',
 'Film-Noir',
 'Horror',
 'IMAX',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Thriller',
 'War',
 'Western'}

In [45]:
#Shows the userID of the new users we are working with
grouped = ratings.groupby('userId').count()
new_users = grouped[grouped['rating'] < 18].reset_index()['userId']
new_users.head()

0    1
1    2
2    3
3    7
4    9
Name: userId, dtype: int64

Train Test Split. Not going to work with any users with less than 6 ratings because there's too little to do proper testing. But will be included in the final model.

In [46]:
grouped = ratings.groupby('userId').count()
work_with_df = grouped[(grouped['rating'] < 18) & (grouped['rating'] > 5)]

In [47]:
work_with_df.head()

Unnamed: 0_level_0,movieId,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,16,16,16
2,15,15,15
3,11,11,11
7,15,15,15
21,15,15,15


In [48]:
new_users_df = ratings[ratings['userId'].isin(new_users)]
new_ppl_movies = new_users_df['movieId']

In [49]:
new_movies_df = movies[movies['movieId'].isin(new_ppl_movies)]

In [50]:
#We are only working with movies that are within our new user data for time constraints and training purposes.
new_movies_df['genres'] = new_movies_df['genres'].fillna("").astype('str')
new_movies_df = new_movies_df.reset_index().drop(columns='index')
new_movies_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"['Adventure', 'Animation', 'Children', 'Comedy..."
1,2,Jumanji (1995),"['Adventure', 'Children', 'Fantasy']"
2,3,Grumpier Old Men (1995),"['Comedy', 'Romance']"
3,4,Waiting to Exhale (1995),"['Comedy', 'Drama', 'Romance']"
4,5,Father of the Bride Part II (1995),['Comedy']


We are using TFIDF to determine the relative importance of the genre.

In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 3),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(new_movies_df['genres'])
tfidf_matrix.shape

(13319, 607)

In [53]:
tfidf_data = pd.DataFrame(tfidf_matrix.todense(), columns = tf.get_feature_names())
tfidf_data = tfidf_data.set_index(new_movies_df['movieId'])
tfidf_data.head()

Unnamed: 0_level_0,action,action adventure,action adventure animation,action adventure children,action adventure comedy,action adventure crime,action adventure documentary,action adventure drama,action adventure fantasy,action adventure horror,action adventure imax,action adventure mystery,action adventure romance,action adventure sci,action adventure thriller,action adventure war,action adventure western,action animation,action animation children,action animation comedy,action animation crime,action animation drama,action animation fantasy,action animation horror,action animation mystery,action animation romance,action animation sci,action children,action children comedy,action children drama,action children fantasy,action children romance,action children sci,action comedy,action comedy crime,action comedy documentary,action comedy drama,action comedy fantasy,action comedy horror,action comedy imax,...,mystery western,noir,noir horror,noir horror mystery,noir horror thriller,noir mystery,noir mystery romance,noir mystery thriller,noir romance,noir romance thriller,noir sci,noir sci fi,noir thriller,romance,romance imax,romance sci,romance sci fi,romance thriller,romance thriller imax,romance thriller war,romance thriller western,romance war,romance war western,romance western,sci,sci fi,sci fi imax,sci fi thriller,sci fi war,sci fi western,thriller,thriller imax,thriller war,thriller western,thriller western imax,war,war imax,war western,western,western imax
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.510362,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.362843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Now we want to calculate cosine similarity by using linear kernel which is equivalent in this situation and faster

In [54]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
cosine_sim

array([[1.        , 0.20440242, 0.05105159, ..., 0.        , 0.        ,
        0.        ],
       [0.20440242, 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.05105159, 0.        , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.56622704],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.56622704, 0.        ,
        1.        ]])

### Recommend Movies based on other movies

In [55]:
def get_recommendation_genre(movie_df, movie_title, cosine_sim_matrix = cosine_sim):
    #find index of movie and extract the row
    index = pd.Series(movie_df.index, index=movie_df['title'])[movie_title]
    similarity_scores = list(enumerate(cosine_sim_matrix[index]))
    #sort by highest similarity
    similarity_scores = sorted(similarity_scores, key=lambda x: x[1], reverse=True)
    similarity_scores = similarity_scores[1:11]
    movie_indices = [i[0] for i in similarity_scores]
    return movie_df['title'].iloc[movie_indices]

In [56]:
get_recommendation_genre(new_movies_df, "Toy Story (1995)", cosine_sim)

2199                                           Antz (1998)
3008                                    Toy Story 2 (1999)
3634        Adventures of Rocky and Bullwinkle, The (2000)
3891                      Emperor's New Groove, The (2000)
4720                                 Monsters, Inc. (2001)
7724                                      Wild, The (2006)
8104                                Shrek the Third (2007)
8723                        Tale of Despereaux, The (2008)
9910     Asterix and the Vikings (Astérix et les Viking...
10418                                         Turbo (2013)
Name: title, dtype: object

Here we can see our recommendation system give other movie recommendations for Toy Story. This is difficult for us to test so we did not include it in our final model but gives interesting insight. From a customer standpoint, these recommendations seem quite accurate.

### Recommend Movies For User

In [57]:
def get_recommendation_genre_for_user (userId):
    df_user_ratings = new_users_df[new_users_df.userId == userId]
    df_user_data_with_genre = new_movies_df.reset_index().merge(df_user_ratings, on='movieId')
    df_user_data_with_genre['weight'] = df_user_data_with_genre['rating']/5
    user_profile = np.dot(tfidf_matrix[df_user_data_with_genre['index'].values].toarray().T, df_user_data_with_genre['weight'].values)
    c_sim = cosine_similarity(np.atleast_2d(user_profile), tfidf_matrix)
    rec = np.flip(np.argsort(c_sim)[0])
    recommendations = []
    for i in rec:
      if i not in df_user_data_with_genre['index']:
        recommendations.append(i)
    return new_movies_df['title'][recommendations].head(10)

In [58]:
get_recommendation_genre_for_user(6)

13318                                       Leal (2018)
4993                                Mother's Day (1980)
4445                                Experts, The (1989)
4444                             Erik the Viking (1989)
4443                      Eddie and the Cruisers (1983)
4442     Eddie and the Cruisers II: Eddie Lives! (1989)
4441                        Earth Girls Are Easy (1988)
4440                         Dry White Season, A (1989)
4439                             Dream Team, The (1989)
4438                        Dream a Little Dream (1989)
Name: title, dtype: object

Here shows an example of giving movie recommendations for a particular user. The method is simimlar to the one above for finding similar movies. We construct a user profile based on their past movies and find movies similar to that user profile as a vector.