In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

        
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Libraries for recommendation system algorithms
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Libraries for working with JSON data
import json


import sys

In [None]:
# Read the credits table and the movie metadata table from the Kaggle input directory.
credits_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')
movies_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')

# Keep a five-row preview of each frame and show both as the cell output.
# NOTE(review): the name `credits` shadows the interpreter's built-in `credits` helper.
credits, movies = credits_df.head(), movies_df.head()

credits, movies

In [None]:
# Give the credits table fixed column names so it can be joined to the movies on 'id'.
# This assignment requires the credits file to have exactly four columns.
credits_df.columns = ['id', 'title', 'cast', 'crew']

# Join movies and credits on 'id'. Later cells read `title_x`, i.e. they rely on the
# default `_x`/`_y` suffixes that pandas adds when both frames share a column name.
merged_df = pd.merge(movies_df, credits_df, on='id')

# Preview the merged frame as the cell output.
merged_df_head = merged_df.head()
merged_df_head

In [None]:
# Number of missing values in each column of the merged frame.
merged_df.isna().sum()

**Visualisation**


In [None]:
from collections import Counter

# Parse the JSON-encoded genre lists. Cells that are no longer strings pass through
# untouched, so re-running this cell does not fail on already-parsed data.
merged_df['genres'] = merged_df['genres'].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)
all_genres_merged = merged_df['genres'].apply(
    lambda entries: [entry['name'] for entry in entries] if entries else []
)
flat_genres_merged = [name for names in all_genres_merged for name in names]

# Tally how often each genre occurs and split the tally into bar labels and bar heights.
genre_counts_merged = Counter(flat_genres_merged)
genres_merged, counts_merged = zip(*genre_counts_merged.items())

# One colour per genre, taken from the tab20 palette.
colors = plt.cm.tab20.colors

# Bar chart of genre frequencies.
fig, ax = plt.subplots(figsize=(12, 8))
ax.bar(genres_merged, counts_merged, color=colors[:len(genres_merged)])
ax.set_xlabel('Genres')
ax.set_ylabel('Count')
ax.set_title('Distribution of Movie Genres')
ax.tick_params(axis='x', labelrotation=90)
plt.show()

In [None]:
# Average popularity of each genre per release year.

# Parse any genre cells that are still JSON text; values that are not strings pass
# through unchanged, so this is a no-op once the column has been parsed.
merged_df['genres'] = merged_df['genres'].apply(lambda x: json.loads(x) if isinstance(x, str) else x)

# One row per (movie, genre) pair. Rows whose exploded genre entry is not a dict
# get genre_names = None.
expanded_genres = merged_df.explode('genres')
expanded_genres['genre_names'] = expanded_genres['genres'].apply(lambda x: x['name'] if isinstance(x, dict) else None)

# Release year; unparseable dates are coerced to NaT and therefore give a missing year.
expanded_genres['release_year'] = pd.to_datetime(expanded_genres['release_date'], errors='coerce').dt.year

# Mean popularity per (year, genre), reshaped to years as rows and genres as columns.
genre_popularity_over_time = expanded_genres.groupby(['release_year', 'genre_names'])['popularity'].mean().unstack()

# Draw on an explicitly created Axes. Calling plt.figure() and then DataFrame.plot()
# without `ax=` makes pandas open a second figure, leaving an empty one in the output.
fig, ax = plt.subplots(figsize=(16, 10))
genre_popularity_over_time.plot(kind='line', linewidth=2, colormap='tab20', ax=ax)
ax.set_xlabel('Year')
ax.set_ylabel('Average Popularity')
ax.set_title('Average Popularity of Each Genre Over Time')
ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')
ax.grid(True)
plt.show()

In [None]:
# Revenue against budget, both on logarithmic axes.
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(
    merged_df['budget'],
    merged_df['revenue'],
    alpha=0.5,
    color='red',
    edgecolors='w',
    linewidth=0.5,
)
ax.set_xlabel('Budget')
ax.set_ylabel('Revenue')
ax.set_title('Revenue vs. Budget')
ax.set_xscale('log')
ax.set_yscale('log')
plt.show()

In [None]:
# One row per (movie, genre) pair so multi-genre movies count once for each genre.
expanded_genres = merged_df.explode('genres')

# Pull the genre name out of each exploded entry; non-dict entries become None.
expanded_genres['genre_names'] = expanded_genres['genres'].apply(
    lambda entry: entry['name'] if isinstance(entry, dict) else None
)

# Rows without a genre name cannot be placed on the categorical axis.
expanded_genres = expanded_genres.dropna(subset=['genre_names'])

# Two stacked violin plots, budget on top and revenue below, sharing all styling.
fig, axes = plt.subplots(2, 1, figsize=(16, 10))
for ax, (column, label) in zip(axes, [('budget', 'Budget'), ('revenue', 'Revenue')]):
    sns.violinplot(
        x='genre_names',
        y=column,
        data=expanded_genres,
        scale='width',
        inner='quartile',
        palette='muted',
        ax=ax,
    )
    ax.set_xlabel('Genres')
    ax.set_ylabel(label)
    ax.set_title(f'{label} Distribution by Genre')
    ax.tick_params(axis='x', labelrotation=90)

plt.tight_layout()
plt.show()


In [None]:
# The ten rows with the largest revenue, reduced to title and revenue.
top_10_grossing = merged_df.nlargest(10, 'revenue')[['title_x', 'revenue']]

# Horizontal bar chart of those ten movies.
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_10_grossing['title_x'], top_10_grossing['revenue'], color='skyblue')
ax.set_xlabel('Revenue')
ax.set_ylabel('Movie Title')
ax.set_title('Top 10 Highest-Grossing Movies')
ax.invert_yaxis()  # put the highest-grossing movie at the top
plt.show()

In [None]:
# Share of revenue among the ten highest-grossing movies (uses top_10_grossing from the cell above).
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(
    top_10_grossing['revenue'],
    labels=top_10_grossing['title_x'],
    autopct='%.1f%%',
    colors=plt.cm.tab20.colors,
)
ax.set_title('Top 10 Highest-Grossing Movies')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
# The ten rows with the largest popularity score, reduced to title and score.
top_10_popularity = merged_df.nlargest(10, 'popularity')[['title_x', 'popularity']]

# Horizontal bar chart with one Paired-palette colour per bar.
colors = plt.cm.Paired.colors[:len(top_10_popularity)]
fig, ax = plt.subplots(figsize=(12, 8))
ax.barh(top_10_popularity['title_x'], top_10_popularity['popularity'], color=colors)
ax.set_xlabel('Popularity')
ax.set_ylabel('Movie Title')
ax.set_title('Top 10 Movies by Popularity')
ax.invert_yaxis()  # put the most popular movie at the top
plt.show()

In [None]:
# Release year per movie; dates that cannot be parsed are coerced to missing.
release_dates = pd.to_datetime(merged_df['release_date'], errors='coerce')
merged_df['release_year'] = release_dates.dt.year

# Number of movies per year, ordered chronologically.
movies_per_year = merged_df['release_year'].value_counts().sort_index()

# Line chart of yearly release counts.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(movies_per_year.index, movies_per_year.values, marker='o', linestyle='-', color='skyblue')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Movies')
ax.set_title('Number of Movies Released Each Year')
ax.grid(True)
plt.show()

In [None]:
import ast

# Extract gender codes from one cell of the cast column
def extract_genders(cast_data):
    """Return the gender codes (1 or 2) of the cast members in one cast cell.

    Parameters
    ----------
    cast_data : str or list
        A JSON array of cast-member objects, the same data written as a Python
        literal, or an already-parsed list of dicts.

    Returns
    -------
    list
        The ``gender`` value of every entry whose gender is 1 or 2, in input
        order. Entries with another code or without a ``gender`` key are
        skipped. Input that cannot be parsed into a list yields ``[]``.
    """
    if isinstance(cast_data, str):
        try:
            # The column holds JSON, so parse it as JSON: ast.literal_eval rejects
            # null/true/false and would silently discard the whole row.
            cast_list = json.loads(cast_data)
        except ValueError:
            # Stay compatible with text written as a Python literal (single quotes).
            try:
                cast_list = ast.literal_eval(cast_data)
            except (ValueError, SyntaxError):
                return []
    else:
        cast_list = cast_data

    if not isinstance(cast_list, list):
        return []

    return [
        person['gender']
        for person in cast_list
        if isinstance(person, dict) and person.get('gender') in (1, 2)
    ]

# Per-movie list of cast gender codes.
merged_df['cast_genders'] = merged_df['cast'].apply(extract_genders)

# Pool the codes of every movie and count each one.
all_genders = []
for movie_genders in merged_df['cast_genders']:
    all_genders.extend(movie_genders)
gender_counts = Counter(all_genders)

# Translate the numeric codes into readable labels.
gender_labels = {1: 'Female', 2: 'Male'}
gender_counts_mapped = {
    gender_labels[code]: total for code, total in gender_counts.items()
}

# Pie chart of the pooled distribution.
fig, ax = plt.subplots(figsize=(12, 8))
ax.pie(
    gender_counts_mapped.values(),
    labels=gender_counts_mapped.keys(),
    autopct='%.1f%%',
    colors=plt.cm.Paired.colors,
)
ax.set_title('Gender Distribution of Actors')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
plt.show()

In [None]:
from wordcloud import WordCloud

# Concatenate every non-missing overview into one text and build the word cloud from it.
overview_text = ' '.join(merged_df['overview'].dropna())
wc = WordCloud(
    max_words=1000,
    min_font_size=10,
    height=800,
    width=1600,
    background_color="white",
).generate(overview_text)

# Show the cloud as an image with a title and no axes.
plt.figure(figsize=(12, 10))
plt.title('The Most Common Words in Movie Overviews\n', fontsize=20, weight='bold', color='#333d29')
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fill NaN values in the 'overview' column with an empty string so the vectorizer
# receives text for every row.
merged_df['overview'] = merged_df['overview'].fillna('')

# Count word occurrences per overview (English stop words removed).
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(merged_df['overview'])

# Total occurrences of each word across all overviews. The column sums are taken on
# the sparse matrix directly; converting it to a dense documents-by-vocabulary array
# first would allocate one cell per (movie, word) pair just to add the columns up.
word_totals = np.asarray(X.sum(axis=0)).ravel()
word_counts_sum = pd.Series(
    word_totals, index=vectorizer.get_feature_names_out()
).sort_values(ascending=False)

# Select the top 20 most common words
top_words = word_counts_sum.head(20)

# Plot the most common words
plt.figure(figsize=(12, 8))
sns.barplot(x=top_words.values, y=top_words.index, palette='viridis')
plt.title('The Most Common Words in Movie Overviews', fontsize=20, weight='bold')
plt.xlabel('Frequency')
plt.ylabel('Word')
plt.show()

In [None]:
import plotly.express as px
import json
from collections import Counter

# Parse the JSON-encoded production-country lists; values that are not strings pass through.
merged_df['production_countries'] = merged_df['production_countries'].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)
all_countries = merged_df['production_countries'].apply(
    lambda entries: [entry['name'] for entry in entries] if entries else []
)
flat_countries = [name for names in all_countries for name in names]

# Ten most frequent countries, split into parallel tuples of names and counts.
country_counts = Counter(flat_countries)
top_countries = country_counts.most_common(10)
countries, counts = zip(*top_countries)

# Single-level treemap: every country sits directly under the (empty) root.
fig = px.treemap(
    names=countries,
    parents=[""] * len(countries),
    values=counts,
    title='Top Production Countries with Highest Number of Movies',
)

# Dark background with white text.
dark_background = 'rgb(17, 17, 17)'
fig.update_layout(
    plot_bgcolor=dark_background,
    paper_bgcolor=dark_background,
    font_color='white',
)

fig.show()

In [None]:
# Replace every missing value with an empty string so the text columns can be joined later.
merged_df = merged_df.fillna('')

# Function to extract names from a list of dictionaries
def extract_names(json_str, key='name'):
    """Join the ``key`` field of every record in a JSON array into one string.

    Parameters
    ----------
    json_str : str or list
        Either JSON text encoding a list of objects, or that list already
        parsed. Earlier cells replace some columns (e.g. ``genres``) with parsed
        lists; passing those to ``json.loads`` raised TypeError, which the old
        bare ``except`` turned into an empty string for every row.
    key : str, default 'name'
        Field to read from each record.

    Returns
    -------
    str
        The field values separated by single spaces. Returns ``''`` when the
        input is empty, is not valid JSON, is not a list of mappings, or a
        record lacks ``key``.
    """
    try:
        records = json.loads(json_str) if isinstance(json_str, str) else json_str
        return ' '.join([record[key] for record in records])
    except (TypeError, ValueError, KeyError):
        # ValueError covers json.JSONDecodeError; TypeError covers None/NaN input,
        # non-mapping records and non-string field values.
        return ''

In [None]:
# Collapse each JSON-style column into a plain space-separated string of names.
for column in ('genres', 'keywords', 'production_companies', 'production_countries'):
    merged_df[column] = merged_df[column].apply(extract_names)


In [None]:
# Function to extract cast and crew names
def extract_cast_crew_names(json_str, role='cast'):
    """Join the names found in one cast or crew cell into a single string.

    Parameters
    ----------
    json_str : str or list
        JSON text encoding a list of person objects, or that list already parsed.
    role : str, default 'cast'
        ``'cast'`` keeps every entry. Any other value keeps only entries whose
        ``job`` is 'Director', 'Producer' or 'Writer'.

    Returns
    -------
    str
        The selected names separated by single spaces. Returns ``''`` when the
        input is empty, is not valid JSON, or is not a list of mappings with a
        ``name`` field.
    """
    try:
        people = json.loads(json_str) if isinstance(json_str, str) else json_str
        if role == 'cast':
            names = [person['name'] for person in people]
        else:
            # .get() lets an entry without a 'job' field be skipped instead of
            # blanking the whole row.
            names = [
                person['name']
                for person in people
                if person.get('job') in ('Director', 'Producer', 'Writer')
            ]
        return ' '.join(names)
    except (TypeError, ValueError, KeyError, AttributeError):
        # Only data-shape problems are swallowed; a bare except would also hide
        # KeyboardInterrupt and genuine programming errors.
        return ''

In [None]:
# Flatten cast and crew into name strings; the crew is restricted to directors, producers and writers.
merged_df['cast'] = merged_df['cast'].apply(extract_cast_crew_names, role='cast')
merged_df['crew'] = merged_df['crew'].apply(extract_cast_crew_names, role='crew')

In [None]:
# Build the text document that represents one movie
def combine_features(row):
    """Concatenate a movie's descriptive text fields into one space-separated string.

    ``row`` is anything indexable by column name (e.g. a DataFrame row). The
    fields are read in a fixed order and every value must already be a string.
    """
    field_order = (
        'genres',
        'keywords',
        'original_language',
        'production_companies',
        'production_countries',
        'tagline',
        'cast',
        'crew',
    )
    return ' '.join([row[field] for field in field_order])

In [None]:
# Apply combine_features to every row (axis=1) and store the resulting text
# in a new 'combined_features' column.
merged_df['combined_features'] = merged_df.apply(combine_features, axis=1)

In [None]:
# Vectorize the combined features using TF-IDF
# NOTE: this rebinds the module-level name `vectorizer`, which an earlier cell bound to
# a CountVectorizer; recommend_movies() below uses whichever object the name holds when
# it is called.
vectorizer = TfidfVectorizer(stop_words='english')
# One TF-IDF row per movie, in the same row order as merged_df.
feature_vectors = vectorizer.fit_transform(merged_df['combined_features'])

In [None]:
# Train the Nearest Neighbors model
# Brute-force neighbour search using cosine distance between TF-IDF rows.
model = NearestNeighbors(metric='cosine', algorithm='brute')
# Fitting indexes feature_vectors; the fitted estimator is also this cell's output.
model.fit(feature_vectors)

In [None]:
# Define the recommendation function with various search filters
def recommend_movies(search_term, n_recommendations=5):
    """Return the titles of the movies whose combined features best match a text query.

    Parameters
    ----------
    search_term : str
        Free text (genre, title word, keyword, company, person, ...). It is
        projected into the fitted TF-IDF space of the module-level `vectorizer`.
    n_recommendations : int, default 5
        Maximum number of titles to return.

    Returns
    -------
    pandas.Series
        ``original_title`` values of the nearest movies, closest first. The
        Series is empty when ``n_recommendations`` is not positive or when the
        query shares no term with the fitted vocabulary.
    """
    titles = merged_df['original_title']

    search_vector = vectorizer.transform([search_term])
    # Never ask for more neighbours than there are indexed movies.
    n_neighbors = min(n_recommendations, len(merged_df))

    # An all-zero query vector is equally distant from every movie, so any
    # "neighbours" would be arbitrary rows; report no match instead.
    if n_neighbors <= 0 or search_vector.nnz == 0:
        return titles.iloc[[]]

    # The query is not one of the indexed movies, so the closest neighbour is a real
    # result and must be kept. Dropping it is only right when querying with an
    # indexed item that would otherwise match itself.
    distances, indices = model.kneighbors(search_vector, n_neighbors=n_neighbors)
    return titles.iloc[indices[0]]

In [None]:
# Example usage: one query each by genre, title, keyword and production company.
example_queries = [
    ("Movies related to 'Action' genre:", 'Action'),
    ("\nMovies related to 'Avatar' title:", 'Avatar'),
    ("\nMovies related to 'alien' keyword:", 'alien'),
    ("\nMovies related to 'Walt Disney' production company:", 'Walt Disney'),
]

for heading, query in example_queries:
    print(heading)
    print(recommend_movies(query, 5))