In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
warnings.filterwarnings('ignore')

In [2]:
# Loading the entire datasets

test = pd.read_csv('data/test.csv')
genome_scores = pd.read_csv('data/genome_scores.csv')
genome_tags = pd.read_csv('data/genome_tags.csv')
imdb_data = pd.read_csv('data/imdb_data.csv')
links = pd.read_csv('data/links.csv')
movies = pd.read_csv('data/movies.csv')
sample_submission = pd.read_csv('data/sample_submission.csv')
tags = pd.read_csv('data/tags.csv')
train = pd.read_csv('data/train.csv')

In [3]:
test.head(2)

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144


In [4]:
test.shape

(5000019, 2)

In [5]:
genome_scores.head(2)

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.02875
1,1,2,0.02375


In [6]:
genome_scores.shape

(15584448, 3)

In [7]:
genome_tags.head(2)

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)


In [8]:
genome_tags.shape

(1128, 2)

In [9]:
imdb_data.head(2)

Unnamed: 0,movieId,title_cast,director,runtime,budget,plot_keywords
0,1,Tom Hanks|Tim Allen|Don Rickles|Jim Varney|Wal...,John Lasseter,81.0,"$30,000,000",toy|rivalry|cowboy|cgi animation
1,2,Robin Williams|Jonathan Hyde|Kirsten Dunst|Bra...,Jonathan Hensleigh,104.0,"$65,000,000",board game|adventurer|fight|game


In [10]:
imdb_data.shape

(27278, 6)

In [11]:
links.head(2)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0


In [12]:
links.shape

(62423, 3)

In [120]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"


In [14]:
movies.shape

(62423, 3)

In [15]:
sample_submission.head(2)

Unnamed: 0,Id,rating
0,1_2011,1.0
1,1_4144,1.0


In [16]:
sample_submission.shape

(5000019, 2)

In [17]:
tags.head(2)

Unnamed: 0,userId,movieId,tag,timestamp
0,3,260,classic,1439472355
1,3,260,sci-fi,1439472256


In [18]:
tags.shape

(1093360, 4)

In [19]:
train.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,5163,57669,4.0,1518349992
1,106343,5,4.5,1206238739


In [20]:
train.shape

(10000038, 4)

In [42]:
# Print number of movies in dataset
print(f'There are {movies.shape[0]} movies in the dataset.')

There are 62423 movies in the dataset.


In [44]:
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [46]:
# To check for duplicate values
movies.duplicated().sum()

0

In [48]:
# To finetune the genre description to a list and make it readable
movies.iloc[0]['genres']

'Adventure|Animation|Children|Comedy|Fantasy'

In [50]:
# Define the function to format genres
def format_genres(genre_string):
    """
    Convert a genre string with '|' separators into a readable list format.
    
    Args:
        genre_string (str): String containing genres separated by '|'
    
    Returns:
        str: Formatted string with genres separated by commas
    """
    if not genre_string:  # Handle empty strings
        return ""
    
    # Split the string by '|' and create a list, then join with commas
    genres_list = genre_string.split('|')
    return ', '.join(genres_list)

# Apply the function to the 'genres' column
movies['genres'] = movies['genres'].apply(format_genres)

In [52]:
movies.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance"


In [54]:
# Removing unnecessary columns
new_df = movies[['movieId', 'title', 'genres']]
new_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"Adventure, Animation, Children, Comedy, Fantasy"
1,2,Jumanji (1995),"Adventure, Children, Fantasy"
2,3,Grumpier Old Men (1995),"Comedy, Romance"


In [56]:
# Making all letters in genre to lower case
new_df['genres'] = new_df['genres'].str.lower()
new_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),"adventure, animation, children, comedy, fantasy"
1,2,Jumanji (1995),"adventure, children, fantasy"
2,3,Grumpier Old Men (1995),"comedy, romance"


In [58]:
import nltk
from nltk.stem import PorterStemmer

In [60]:
ps = PorterStemmer()

In [62]:
# Preprocess the 'genres' column
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer # Import CountVectorizer
new_df['genres'] = new_df['genres'].fillna('').astype(str)
new_df['genres'] = new_df['genres'].str.strip()
new_df = new_df[new_df['genres'].str.len() > 0]  # Drop empty strings

# Create and configure CountVectorizer
cv = CountVectorizer(
    stop_words=None,  # Keep all words
    lowercase=True,   # Convert to lowercase
    token_pattern=r'(?u)\b\w+\b',  # Match any word
    min_df=1,         # Include terms in at least 1 document
)

# Fit and transform the 'tags' column
vector = cv.fit_transform(new_df['genres']).toarray()

# Print results for verification
print("Vocabulary:", cv.vocabulary_)
print("Vector shape:", vector.shape)
print("Sample vectors:", vector[:5])  # Show first 5 rows of the vector matrix

Vocabulary: {'adventure': 1, 'animation': 2, 'children': 3, 'comedy': 4, 'fantasy': 8, 'romance': 19, 'drama': 7, 'action': 0, 'crime': 5, 'thriller': 21, 'horror': 12, 'mystery': 16, 'sci': 20, 'fi': 9, 'imax': 13, 'documentary': 6, 'war': 22, 'musical': 15, 'western': 23, 'film': 10, 'noir': 18, 'no': 17, 'genres': 11, 'listed': 14}
Vector shape: (62423, 24)
Sample vectors: [[0 1 1 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0]
 [0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [64]:
vector

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]], dtype=int64)

In [66]:
from sklearn.metrics.pairwise import cosine_similarity

In [68]:
similary = cosine_similarity(vector)

In [69]:
similary

array([[1.        , 0.77459667, 0.31622777, ..., 0.31622777, 0.        ,
        0.25819889],
       [0.77459667, 1.        , 0.        , ..., 0.        , 0.        ,
        0.33333333],
       [0.31622777, 0.        , 1.        , ..., 0.5       , 0.        ,
        0.        ],
       ...,
       [0.31622777, 0.        , 0.5       , ..., 1.        , 0.        ,
        0.40824829],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.25819889, 0.33333333, 0.        , ..., 0.40824829, 0.        ,
        1.        ]])

In [70]:
similary.shape

(62423, 62423)

In [108]:
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similary[index])), reverse=True, key = lambda x: x[1])
    for i in distances[1:8]:
        print(new_df.iloc[i[0]].title)

In [110]:
recommend('A Girl Thing (2001)')

Glitterbug (1994)
Age of the Earth, The (A Idade da Terra) (1980)
Trails (Veredas) (1978)
Milky Way (Tejút) (2007)
Dancing Hawk, The (Tanczacy jastrzab) (1978)
Warsaw Bridge (Pont de Varsòvia) (1990)
Ella Lola, a la Trilby (1898)


In [112]:
recommend('Jane B. by Agnès V. (1988)')

Darkon (2006)
Video Letter (1983)
The Endless Film (2018)
Jane B. by Agnès V. (1988)
Nico Icon (1995)
Heidi Fleiss: Hollywood Madam (1995)
Catwalk (1996)


In [116]:
recommend('Avengers: Infinity War - Part I (2018)')

Star Wars: Episode IV - A New Hope (1977)
Stargate (1994)
Demolition Man (1993)
Star Wars: Episode V - The Empire Strikes Back (1980)
Star Wars: Episode VI - Return of the Jedi (1983)
Star Trek III: The Search for Spock (1984)
Time Tracers (1995)
