In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import ast 
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [2]:
data = pd.read_csv('data.csv')

In [7]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1518 entries, 0 to 1517
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  1518 non-null   bool   
 1   backdrop_path          1518 non-null   object 
 2   belongs_to_collection  1518 non-null   object 
 3   budget                 1518 non-null   int64  
 4   genres                 1518 non-null   object 
 5   homepage               803 non-null    object 
 6   id                     1518 non-null   int64  
 7   imdb_id                1518 non-null   object 
 8   origin_country         1518 non-null   object 
 9   original_language      1518 non-null   object 
 10  original_title         1518 non-null   object 
 11  overview               1518 non-null   object 
 12  popularity             1518 non-null   float64
 13  poster_path            1518 non-null   object 
 14  production_companies   1518 non-null   object 
 15  prod

###  Content based recommendation system : Using movie description and taglines

In [3]:
data['tagline'] = data['tagline'].fillna('')
data['description'] = data['overview'] + data['tagline']
data['description'] = data['description'].fillna('')

In [4]:
data.to_csv('data.csv',index = False)

In [24]:
print(data['description'].loc[0])

Several generations in the future following Caesar's reign, apes are now the dominant species and live harmoniously while humans have been reduced to living in the shadows. As a new tyrannical ape leader builds his empire, one young ape undertakes a harrowing journey that will cause him to question all that he has known about the past and to make choices that will define a future for apes and humans alike.No one can stop the reign.


In [11]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.01, stop_words='english')
tfidf_matrix = tf.fit_transform(data['description'])

In [15]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [38]:
titles = data['original_title']
indices = pd.Series(data.index, index=data['original_title'])

### Content based RS : Using movie description, taglines, keywords, cast, director and genres

#### create shorten ratings csv

In [25]:
ratings = pd.read_csv('ratings.csv')

In [32]:
ratings = ratings[0:len(ratings)//2]

In [24]:
ratings.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [33]:
len(ratings)

390626

In [34]:
ratings[['userId','movieId','rating']].to_csv('shortened_ratings.csv',index = False)

#### content based RS

In [40]:
ratings = pd.read_csv('shortened_ratings.csv')

In [43]:
def get_recommendations(title, length):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:length+1]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [44]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [33]:
# this is V
vote_counts = data[data['vote_count'].notnull()]['vote_count'].astype('int')

# this is R
vote_averages = data[data['vote_average'].notnull()]['vote_average'].astype('int')

# this is C
C = vote_averages.mean()

In [51]:
data['year'] = pd.to_datetime(data['release_date'])
data['year']= data['year'].apply(
    lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

In [49]:
data['release_date']

Unnamed: 0,release_date,release_date.1
0,2024-05-08,2024-05-08
1,2024-03-20,2024-03-20
2,2024-03-27,2024-03-27
3,2024-02-27,2024-02-27
4,2014-07-08,2014-07-08
...,...,...
1513,2016-06-18,2016-06-18
1514,2011-06-28,2011-06-28
1515,1989-04-28,1989-04-28
1516,1985-08-01,1985-08-01


In [53]:
m = vote_counts.quantile(0.95)

In [55]:
qualified = data[(data['vote_count'] >= m) & 
               (data['vote_count'].notnull()) & 
               (data['vote_average'].notnull())][['original_title', 
                                                'release_date', 
                                                'vote_count', 
                                                'vote_average', 
                                                'popularity', 
                                                'genres']]

qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')

In [56]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [57]:
qualified = qualified.sort_values('wr', ascending=False).head(250)

In [58]:
qualified.head(15)

Unnamed: 0,original_title,release_date,vote_count,vote_average,popularity,genres,wr
198,The Dark Knight,2008-07-16,31920,8,114.484,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",7.36947
34,Avengers: Infinity War,2018-04-25,28863,8,362.473,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",7.326155
15,The Matrix,1999-03-31,24858,8,440.334,"[{'id': 28, 'name': 'Action'}, {'id': 878, 'na...",7.25951
64,Avengers: Endgame,2019-04-24,24855,8,267.76,"[{'id': 12, 'name': 'Adventure'}, {'id': 878, ...",7.259455
18,Joker,2019-10-01,24558,8,425.856,"[{'id': 80, 'name': 'Crime'}, {'id': 53, 'name...",7.253983
166,The Lord of the Rings: The Fellowship of the Ring,2001-12-18,24323,8,164.708,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.249596
205,The Lord of the Rings: The Return of the King,2003-12-01,23379,8,123.611,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.23144
272,The Lord of the Rings: The Two Towers,2002-12-18,21139,8,115.753,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.184627
178,Harry Potter and the Prisoner of Azkaban,2004-05-31,20892,8,135.962,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",7.179114
20,Star Wars,1977-05-25,19981,8,541.884,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",7.158118


In [59]:
def improved_recommendations(title, length):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:length]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = data.iloc[movie_indices][['original_title', 'vote_count', 'vote_average', 'release_date']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & 
                       (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [60]:
print(improved_recommendations('The Dark Knight Rises',100))

                 original_title  vote_count  vote_average release_date  \
34       Avengers: Infinity War       28863             8   2018-04-25   
200  Terminator 2: Judgment Day       12361             8   1991-07-03   
184                      Avatar       30828             7   2009-12-15   
47       Spider-Man: Homecoming       21203             7   2017-07-05   
117               Batman Begins       20360             7   2005-06-10   
393                     Ant-Man       19261             7   2015-07-14   
348                 I Am Legend       15263             7   2007-12-12   
211                Men in Black       13341             7   1997-07-02   
408  How to Train Your Dragon 2        9269             7   2014-06-05   
712                    Sin City        7855             7   2005-04-01   

           wr  
34   7.326155  
200  6.928988  
184  6.691182  
47   6.610498  
117  6.601376  
393  6.588823  
348  6.535622  
211  6.504821  
408  6.423862  
712  6.389183  


In [64]:
ratings.columns

Index(['userId', 'movieId', 'rating'], dtype='object')

In [66]:
data.columns

Index(['belongs_to_collection', 'genres', 'origin_country',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'spoken_languages',
       'vote_average', 'id', 'tagline', 'vote_count', 'release_date', 'year'],
      dtype='object')

In [65]:
ratings[ratings['userId']==1]

Unnamed: 0,userId,movieId,rating
0,1,296,5.0
1,1,306,3.5
2,1,307,5.0
3,1,665,5.0
4,1,899,3.5
...,...,...,...
65,1,27193,3.0
66,1,27266,4.5
67,1,27721,3.0
68,1,31956,3.5


In [69]:
def create_user_profile(userId, cosine_sim, user_ratings_matrix):
    user_ratings = user_ratings_matrix.loc[userId].dropna()
    profile_vector = np.zeros_like(cosine_sim[0])
    for movieId, rating in user_ratings.items():
        # Check if the movie exists in the data DataFrame
        if movieId in data['id'].values:
            # Get the index of the movie in the cosine similarity matrix
            idx = data[data['id'] == movieId].index[0]
            # Update the profile vector by adding the weighted similarity scores
            profile_vector += cosine_sim[idx] * rating
    return profile_vector

# Step 5: Normalize the weighted sum
def normalize_vector(vector):
    norm = np.linalg.norm(vector)
    if norm == 0:
        return vector
    return vector / norm

# Example usage
userId = 1  # Example user ID
user_profile = create_user_profile(userId, cosine_sim, ratings)
normalized_profile = normalize_vector(user_profile)

In [70]:
print(normalized_profile)

[0. 0. 0. ... 0. 0. 0.]


### Stuff to work on if have time

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, save_npz, load_npz
from sklearn.metrics.pairwise import linear_kernel
import pickle

# Load the saved vectorizer and cosine similarity matrix
with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)

cosine_sim_sparse = load_npz('cosine_sim.npz')
cosine_sim = cosine_sim_sparse.toarray()

# Function to add a new movie and update cosine similarity matrix
def update_cosine_sim(new_movie_id, new_movie_description, movies_df, cosine_sim, tfidf):
    # Compute the TF-IDF vector for the new movie using the existing vectorizer
    new_tfidf_vector = tfidf.transform([new_movie_description])
    
    # Compute cosine similarity between the new movie and all existing movies
    new_cosine_sim = linear_kernel(new_tfidf_vector, tfidf.transform(movies_df['description'])).flatten()
    
    # Expand the cosine similarity matrix to include the new movie
    num_movies = cosine_sim.shape[0]
    new_cosine_sim_matrix = np.zeros((num_movies + 1, num_movies + 1))
    new_cosine_sim_matrix[:num_movies, :num_movies] = cosine_sim
    new_cosine_sim_matrix[num_movies, :num_movies] = new_cosine_sim
    new_cosine_sim_matrix[:num_movies, num_movies] = new_cosine_sim
    new_cosine_sim_matrix[num_movies, num_movies] = 1  # Similarity with itself is 1
    
    # Add the new movie to the movies DataFrame
    new_movie = pd.DataFrame({'movieId': [new_movie_id], 'description': [new_movie_description]})
    updated_movies_df = pd.concat([movies_df, new_movie], ignore_index=True)
    
    return updated_movies_df, new_cosine_sim_matrix

# Example usage
new_movie_id = 123456  # New movie ID
new_movie_description = "An epic adventure of a lifetime."
updated_movies_df, updated_cosine_sim = update_cosine_sim(new_movie_id, new_movie_description, movies_df, cosine_sim, tfidf)

# Save the updated cosine similarity matrix
updated_cosine_sim_sparse = csr_matrix(updated_cosine_sim)
save_npz('updated_cosine_sim.npz', updated_cosine_sim_sparse)


In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix, save_npz, load_npz
import pickle

# Assume we have the following data frames already loaded
movies_df = pd.read_csv('movies.csv')  # Columns: ['movieId', 'title', 'description']
ratings_df = pd.read_csv('ratings.csv')  # Columns: ['userId', 'movieId', 'rating']

# Load the saved vectorizer and cosine similarity matrix
with open('tfidf_vectorizer.pkl', 'rb') as f:
    tfidf = pickle.load(f)

cosine_sim_sparse = load_npz('cosine_sim.npz')
cosine_sim = cosine_sim_sparse.toarray()

# Assume tfidf_matrix is already computed and available
tfidf_matrix = tfidf.transform(movies_df['description'])

# Function to create a popular movies profile
def create_popular_movies_profile(ratings_df, movies_df, tfidf_matrix, top_n=10):
    # Calculate average ratings for each movie
    avg_ratings = ratings_df.groupby('movieId')['rating'].mean()
    
    # Get the top N movies by average rating
    top_movies = avg_ratings.nlargest(top_n).index
    
    # Initialize the profile vector
    profile_vector = np.zeros(tfidf_matrix.shape[1])
    
    # Sum the TF-IDF vectors of the top movies
    for movieId in top_movies:
        movie_idx = movies_df[movies_df['movieId'] == movieId].index[0]
        profile_vector += tfidf_matrix[movie_idx]
    
    # Normalize the profile vector
    profile_vector = normalize(profile_vector.reshape(1, -1))
    
    return profile_vector

# Create a generic user profile based on popular movies
generic_user_profile = create_popular_movies_profile(ratings_df, movies_df, tfidf_matrix)

print(generic_user_profile)
