In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [19]:
# Read the movie metadata CSV from its remote location into a DataFrame
url = "https://raw.githubusercontent.com/rashida048/Some-NLP-Projects/master/movie_dataset.csv"
movies = pd.read_csv(url)

# Quick overview: first rows, dimensions, column labels, per-column NaN counts
print(movies.head())
print(f"\nDataset shape: {movies.shape}")
print(f"\nColumns: {movies.columns}")
print(f"\nMissing values:\n {movies.isna().sum()}")

   index     budget                                    genres  \
0      0  237000000  Action Adventure Fantasy Science Fiction   
1      1  300000000                  Adventure Fantasy Action   
2      2  245000000                    Action Adventure Crime   
3      3  250000000               Action Crime Drama Thriller   
4      4  260000000          Action Adventure Science Fiction   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   
3            http://www.thedarkknightrises.com/   49026   
4          http://movies.disney.com/john-carter   49529   

                                            keywords original_language  \
0  culture clash future space war space colony so...                en   
1  ocean drug abuse exotic island east india trad...                en   
2         spy based on novel sec

In [20]:
# Swap NaN for an empty string in each free-text column used below
for _col in ('keywords', 'tagline', 'overview'):
    movies[_col] = movies[_col].fillna('')


In [21]:

# --- Step 1: one text blob per movie ---------------------------------------
# Columns we would like to use; only the ones present in the frame are kept,
# in the order listed here.
text_columns = ['genres', 'keywords', 'overview', 'tagline']
available_columns = [name for name in text_columns if name in movies.columns]

# Row-wise space-join of the selected columns (NaN -> '', everything cast to str)
_joined = movies[available_columns].fillna('').astype(str).agg(' '.join, axis=1)
# Collapse runs of whitespace to a single space and trim both ends
movies['combined_features'] = _joined.str.replace(r'\s+', ' ', regex=True).str.strip()

# --- Step 2: sanity checks --------------------------------------------------
_is_empty = movies['combined_features'] == ''
print("Null values after creation:", movies['combined_features'].isna().sum())
print("Empty strings:", _is_empty.sum())

# --- Step 3: rows with no text at all receive a placeholder token -----------
movies.loc[_is_empty, 'combined_features'] = 'no_description'

# --- Step 4: TF-IDF ---------------------------------------------------------
# English stop words are removed, terms seen in fewer than 2 documents are
# ignored, and the vocabulary is capped at 10000 terms.
tfidf = TfidfVectorizer(stop_words='english', min_df=2, max_features=10000)
tfidf_matrix = tfidf.fit_transform(movies['combined_features'])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)

Null values after creation: 0
Empty strings: 0
TF-IDF Matrix Shape: (4803, 10000)


In [22]:
# Pairwise similarity of every movie with every other movie.
# linear_kernel is a plain dot product; it equals cosine similarity as long as
# the TF-IDF rows are L2-normalised (TfidfVectorizer's default `norm`).
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Title -> row label lookup.
# Series.drop_duplicates() compares the *values* (the row labels), which are
# already unique, so it never removes a repeated title. A repeated title would
# make `indices[title]` return a Series instead of a single label. De-duplicate
# on the index (the titles) instead, keeping the first occurrence of each.
indices = pd.Series(movies.index, index=movies['title'])
indices = indices[~indices.index.duplicated(keep='first')]

In [23]:
def get_recommendations(title, cosine_sim=cosine_sim, movies=movies, indices=indices, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Movie title to look up in ``indices``.
    cosine_sim : 2-D array-like
        Pairwise similarity scores; row ``i`` holds the scores of movie ``i``
        against every movie.
    movies : pandas.DataFrame
        Frame with a ``'title'`` column, positionally aligned with
        ``cosine_sim``.
    indices : pandas.Series
        Maps title -> row position. If a title occurs more than once, the
        first match is used.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` titles, most similar first, never including the
        queried movie itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    ValueError
        If ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError(f"top_n must be non-negative, got {top_n}")

    try:
        idx = indices[title]
    except KeyError:
        raise KeyError(f"Title not found in dataset: {title!r}") from None

    # A title that appears several times yields a Series of positions; indexing
    # cosine_sim with it would produce a 2-D block and break the ranking.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of the chosen movie against every movie
    scores = np.asarray(cosine_sim[idx], dtype=float).ravel()

    # Descending order; the stable sort keeps equal scores in positional order
    ranked = np.argsort(-scores, kind='stable')

    # Drop the movie itself by position rather than assuming it sorts first
    # (ties, or an all-zero vector, can put another movie ahead of it).
    ranked = ranked[ranked != idx][:top_n]

    return movies['title'].iloc[ranked]

In [24]:
# Demo: show the closest matches for one title
sample_movie = "The Dark Knight Rises"
recommendations = get_recommendations(sample_movie)
print(f"Recommendations for '{sample_movie}':", recommendations, sep="\n")

Recommendations for 'The Dark Knight Rises':
299                              Batman Forever
65                              The Dark Knight
1359                                     Batman
428                              Batman Returns
119                               Batman Begins
3854    Batman: The Dark Knight Returns, Part 2
210                              Batman & Robin
2507                                  Slow Burn
9            Batman v Superman: Dawn of Justice
2274                                   Survivor
Name: title, dtype: object


In [25]:
# Demo: show the closest matches for a second title
sample_movie = "The Mask"
recommendations = get_recommendations(sample_movie)
print(f"Recommendations for '{sample_movie}':", recommendations, sep="\n")

Recommendations for 'The Mask':
309                        Son of the Mask
2349                     Small Time Crooks
4253                    Closer to the Moon
918                             Inside Man
2767                         Birthday Girl
3142                        American Heist
4011    Halloween III: Season of the Witch
534                                Bandits
3106              High Heels and Low Lifes
782                             The Spirit
Name: title, dtype: object
