<a href="https://colab.research.google.com/github/mehdihemmatyar/RecommendationSystem/blob/main/MovieRecommendation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import zipfile

**Unzip data**

In [None]:

# Location of the dataset archive
zip_file_path = "/content/drive/MyDrive/dataset-ml-25m.zip"

# Unpack every member of the archive into the target folder;
# the `with` block closes the archive once extraction is finished
with zipfile.ZipFile(zip_file_path, mode="r") as zip_ref:
    zip_ref.extractall(path="/content/drive/MyDrive")

**Import datasets**

In [2]:
# Genome scores: one row per (movieId, tagId) pair with a `relevance` value
genom_scores = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ml-25m/genome-scores.csv"
)

In [5]:
# Genome tags: lookup table from `tagId` to the tag's text
genom_tags = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ml-25m/genome-tags.csv"
)

In [7]:
# Movie catalogue: `movieId`, `title` and pipe-separated `genres`
movies = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ml-25m/movies.csv"
)

In [8]:
# Free-text tags; the `movieId` and `tag` columns are used further down
tags = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ml-25m/tags.csv"
)

In [9]:
# User ratings. Not needed for content-based recommendation,
# because this table only contains user ratings.
ratings = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/ml-25m/ratings.csv"
)

**Data Preparation**

In [10]:
# Drop exact duplicate rows from every table (in place)
for frame in (movies, ratings, tags, genom_scores, genom_tags):
    frame.drop_duplicates(inplace=True)

# Replace missing values in place: '' for movies/tags/genom_tags,
# 0 for ratings/genom_scores
for frame, fill_value in (
    (movies, ''),
    (ratings, 0),
    (tags, ''),
    (genom_scores, 0),
    (genom_tags, ''),
):
    frame.fillna(fill_value, inplace=True)


**Filter relevant tags**


In [11]:
# Keep only the (movie, tag) pairs whose relevance is strictly greater than 0.8
filtered_genom_scores = genom_scores.loc[genom_scores['relevance'].gt(0.8)]

Show some samples

In [3]:
# Preview the first five rows of the (unfiltered) genome scores
genom_scores.head(n=5)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375
2,1,3,0.0625
3,1,4,0.07575
4,1,5,0.14075


In [13]:
# Preview the first five rows of the tagId -> tag lookup table
genom_tags.head(n=5)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


**Convert tagId to tag name**

In [14]:
# Attach the tag text to each high-relevance score row via the shared `tagId`
merged_genom_tag = pd.merge(filtered_genom_scores, genom_tags, on='tagId')


In [15]:
# Preview the first five rows of the merged score/tag table
merged_genom_tag.head(n=5)

Unnamed: 0,movieId,tagId,relevance,tag
0,1,29,0.89375,adventure
1,2,29,0.976,adventure
2,10,29,0.80175,adventure
3,15,29,0.97475,adventure
4,146,29,0.836,adventure


**Group by movieId**


In [16]:
# One space-separated string of genome tags per movie (Series indexed by movieId)
tags_by_movie = merged_genom_tag.groupby('movieId')['tag']
gp_genom_tag = tags_by_movie.apply(' '.join)

**Combine genres and tags into a consolidated dataset**

In [18]:

# Genres with '|' replaced by spaces. `regex=False` makes the pipe a literal
# character on every pandas version (the default for `regex` changed between
# releases, and '|' is a regex metacharacter), which also avoids the warning.
movie_genres = movies['genres'].str.replace('|', " ", regex=False)

# One space-separated string of user-supplied tags per movie
movie_tags = tags.groupby('movieId')['tag'].apply(lambda x: ' '.join(x)).reset_index()

# Inner-join movies with their user tags, then with their genome tags.
# Both tag columns are called 'tag', so pandas suffixes them:
# tag_x = user tags, tag_y = genome tags.
consolidated_data = pd.merge(movies, movie_tags, on='movieId')
consolidated_data = pd.merge(consolidated_data, gp_genom_tag, on='movieId')


  movie_genres = movies['genres'].str.replace('|', " ")


In [19]:
# Display the merged frame: tag_x holds the user tags, tag_y the genome tags
consolidated_data

Unnamed: 0,movieId,title,genres,tag_x,tag_y
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Owned imdb top 250 Pixar Pixar time travel chi...,adventure animated animation cartoon cgi child...
1,2,Jumanji (1995),Adventure|Children|Fantasy,Robin Williams time travel fantasy based on ch...,adventure childhood children family kids anima...
2,3,Grumpier Old Men (1995),Comedy|Romance,funny best friend duringcreditsstinger fishing...,comedy good sequel sequel sequels
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,based on novel or book chick flick divorce int...,chick flick divorce women
4,5,Father of the Bride Part II (1995),Comedy,aging baby confidence contraception daughter g...,family comedy good sequel sequel sequels fathe...
...,...,...,...,...,...
13332,205072,Zombieland: Double Tap (2019),Action|Comedy|Horror,All-Star Cast Jesse Eisenberg Logo Joke The St...,friendship original dumb but funny
13333,205076,Downton Abbey (2019),Drama,period drama period drama theater,girlie movie
13334,205383,El Camino: A Breaking Bad Movie (2019),Crime|Drama|Thriller,breaking bad breaking bad cinematography Vince...,original
13335,205425,Dave Chappelle: Sticks & Stones (2019),Comedy,stand-up comedy stand-up comedy comedy Politic...,original comedy stand-up comedy


**Subset of data due to limited RAM**

In [27]:
# Cap the working set at the first 20000 rows to bound memory use;
# if the frame has fewer rows, all of them are kept
subset_movies = consolidated_data.iloc[:20000]

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer


**Create a TF-IDF vectorizer**

In [29]:
# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
)


In [30]:
# Build one text document per movie from genres, user tags and genome tags.
# The fields are joined with a space: plain `+` would glue the last word of one
# field to the first word of the next (e.g. "FantasyOwned"), creating bogus tokens.
subset_genres_matrix = tfidf_vectorizer.fit_transform(
    subset_movies['genres'] + ' ' + subset_movies['tag_x'] + ' ' + subset_movies['tag_y']
)


In [31]:
# Pairwise cosine similarity between all movie vectors
# (with a single argument, the matrix is compared against itself)
cosine_similarities = cosine_similarity(subset_genres_matrix)


**Function to get movie recommendations for a given title, based on genre and tag similarity**

In [32]:


def get_movie_recommendations(title, cosine_similarities, consolidated_data, top_n=5):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact title to look up in ``consolidated_data['title']``. If several
        rows share the title, the first one is used.
    cosine_similarities : 2-D array-like
        Pairwise similarity matrix; row/column ``i`` must correspond to the
        ``i``-th row (by position) of ``consolidated_data``.
    consolidated_data : pandas.DataFrame
        Frame with a ``'title'`` column, in the same row order as the matrix.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    list
        Up to ``top_n`` titles, most similar first. The queried row itself is
        never part of the result.

    Raises
    ------
    IndexError
        If ``title`` does not occur in ``consolidated_data['title']``.
    """
    # Work with row *positions*, not index labels: the similarity matrix is
    # positional, and the frame's index is not guaranteed to be 0..n-1.
    title_matches = (consolidated_data['title'] == title).to_numpy().nonzero()[0]
    if len(title_matches) == 0:
        raise IndexError(f"Movie title {title!r} not found in consolidated_data")
    position = int(title_matches[0])

    # Exclude the queried movie explicitly instead of assuming it sorts first;
    # another movie with identical features would tie with it at the top.
    candidates = [
        (other, score)
        for other, score in enumerate(cosine_similarities[position])
        if other != position
    ]
    # Stable sort: equally similar movies keep their original row order
    candidates.sort(key=lambda pair: pair[1], reverse=True)

    top_positions = [other for other, _ in candidates[:max(top_n, 0)]]
    return consolidated_data.iloc[top_positions]['title'].tolist()


Example

In [33]:

# Look up recommendations in `subset_movies`, the frame the similarity matrix was
# built from, so matrix rows and frame rows are guaranteed to line up even if the
# subset is ever smaller than the full consolidated frame.
input_movie = 'Ice Age (2002)'
recommended_movies = get_movie_recommendations(input_movie, cosine_similarities, subset_movies)
print(f"Recommended movies for '{input_movie}':")
for movie in recommended_movies:
    print(movie)

Recommended movies for 'Ice Age (2002)':
Bug's Life, A (1998)
Finding Nemo (2003)
Toy Story (1995)
Monsters, Inc. (2001)
Toy Story 2 (1999)
