## Install and Import Required Libraries

In [12]:
# Install required libraries
!pip install pandas scikit-learn requests streamlit -q

In [13]:
import pandas as pd
import numpy as np
import pickle
import requests
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Confirmation message (emoji restored from its mis-encoded form)
print("✅ All libraries imported successfully!")

✅ All libraries imported successfully!


## Load and Explore the Dataset

In [14]:
# Read the raw movie metadata from movies.csv in the working directory
movies_df = pd.read_csv('movies.csv')

# Overview of the frame: dimensions, column names, dtypes and per-column null counts
summary_lines = (
    f"Dataset Shape: {movies_df.shape}",
    f"\nColumn Names: {list(movies_df.columns)}",
    f"\nData Types:\n{movies_df.dtypes}",
    f"\nMissing Values:\n{movies_df.isnull().sum()}",
)
print(*summary_lines, sep="\n")
movies_df.head()

Dataset Shape: (10000, 9)

Column Names: ['id', 'title', 'genre', 'original_language', 'overview', 'popularity', 'release_date', 'vote_average', 'vote_count']

Data Types:
id                     int64
title                 object
genre                 object
original_language     object
overview              object
popularity           float64
release_date          object
vote_average         float64
vote_count             int64
dtype: object

Missing Values:
id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64


Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


## Data Preprocessing — Feature Engineering

In [15]:
# Keep only the columns the recommender needs (the dataset names the column 'genre',
# not 'genres') and discard rows where the overview or the genre is missing.
movies = movies_df[['id', 'title', 'overview', 'genre']].dropna(subset=['overview', 'genre'])
print(f"Shape after dropping nulls: {movies.shape}")

# Build the lowercase 'tags' text in one chain:
#   * genres arrive comma-separated (e.g. "Drama,Crime"); commas become spaces so
#     every genre is its own token
#   * tags = overview + ' ' + genres, lowercased
#   * the source columns are then dropped and the index renumbered from 0
movies = (
    movies
    .assign(
        tags=lambda frame: (
            frame['overview'] + ' ' + frame['genre'].str.replace(',', ' ', regex=False)
        ).str.lower()
    )
    .drop(columns=['overview', 'genre'])
    .reset_index(drop=True)
)

print(f"Final Shape: {movies.shape}")
movies.head()

Shape after dropping nulls: (9985, 4)
Final Shape: (9985, 3)


Unnamed: 0,id,title,tags
0,278,The Shawshank Redemption,framed in the 1940s for the double murder of h...
1,19404,Dilwale Dulhania Le Jayenge,"raj is a rich, carefree, happy-go-lucky second..."
2,238,The Godfather,"spanning the years 1945 to 1955, a chronicle o..."
3,424,Schindler's List,the true story of how businessman oskar schind...
4,240,The Godfather: Part II,in the continuing saga of the corleone crime f...


## Text Vectorization with CountVectorizer

In [16]:
# Bag-of-words model capped at 10,000 vocabulary terms, with English stop words removed
cv = CountVectorizer(max_features=10000, stop_words='english')

# Learn the vocabulary from the tags column and encode every movie as a row of term counts
vectors = cv.fit_transform(movies['tags'])

# Report the matrix dimensions (arrow and multiplication sign restored from mis-encoded text)
n_movies, n_features = vectors.shape
print(f"Vectorized Matrix Shape: {vectors.shape}")
print(f"  → {n_movies} movies × {n_features} features")

Vectorized Matrix Shape: (9985, 10000)
  → 9985 movies × 10000 features


## Compute Cosine Similarity Matrix

In [17]:
# Pairwise cosine similarity between every pair of movie vectors.
# The result is a dense (n_movies x n_movies) array.
similarity = cosine_similarity(vectors)

print(f"Similarity Matrix Shape: {similarity.shape}")
# Multiplication sign restored from its mis-encoded form; no placeholders, so no f-prefix needed
print("\nSample (first 5×5 slice):")
print(np.round(similarity[:5, :5], 3))

Similarity Matrix Shape: (9985, 9985)

Sample (first 5×5 slice):
[[1.    0.056 0.129 0.037 0.114]
 [0.056 1.    0.077 0.037 0.114]
 [0.129 0.077 1.    0.035 0.476]
 [0.037 0.037 0.035 1.    0.038]
 [0.114 0.114 0.476 0.038 1.   ]]


## Build the Recommendation Function

In [18]:
def recommend(movie_title, top_n=5, catalog=None, similarity_matrix=None):
    """
    Recommend the movies most similar to a given title.

    Args:
        movie_title (str): Title of the selected movie (exact match).
        top_n (int): Number of recommendations to return. Defaults to 5.
        catalog (pd.DataFrame | None): Frame with 'title' and 'id' columns whose
            row order matches `similarity_matrix`. Defaults to the notebook-level
            `movies` frame.
        similarity_matrix (array-like | None): Square matrix of pairwise
            similarity scores. Defaults to the notebook-level `similarity`.

    Returns:
        list: Recommended movie titles, most similar first
        list: Corresponding movie IDs

    Raises:
        IndexError: If `movie_title` is not present in the catalog.
    """
    catalog = movies if catalog is None else catalog
    similarity_matrix = similarity if similarity_matrix is None else similarity_matrix

    # Locate the movie by row *position*: the similarity matrix and .iloc are both
    # positional, so this stays correct even if the frame's index labels are not 0..n-1.
    matches = np.flatnonzero((catalog['title'] == movie_title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    movie_index = int(matches[0])

    scores = np.asarray(similarity_matrix[movie_index])

    # Stable descending sort: ties keep their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Remove the query movie explicitly rather than assuming it is ranked first;
    # another row with identical tags also scores 1.0 and may sort ahead of it.
    ranked = ranked[ranked != movie_index][:max(top_n, 0)]

    picks = catalog.iloc[ranked]
    return picks['title'].tolist(), picks['id'].tolist()

# Smoke-test the recommender on a well-known title (emoji restored from mis-encoded text)
print("🎬 Movies similar to 'The Godfather':\n")
titles, ids = recommend('The Godfather')
for i, (title, movie_id) in enumerate(zip(titles, ids), 1):
    print(f"  {i}. {title} (ID: {movie_id})")

🎬 Movies similar to 'The Godfather':

  1. The Godfather: Part II (ID: 240)
  2. Blood Ties (ID: 190955)
  3. Joker (ID: 475557)
  4. Bomb City (ID: 396774)
  5. Gotti (ID: 339103)


## Fetch Movie Posters from TMDB API

In [19]:
TMDB_API_KEY = "Input your TMDB API key here"
PLACEHOLDER_POSTER = "https://via.placeholder.com/500x750.png?text=No+Poster+Available"

def fetch_poster(movie_id):
    """
    Look up a movie's poster on TMDB, retrying on transient network errors.

    Up to three requests are made. Timeouts and connection errors trigger a
    retry; any other outcome (non-200 status, missing poster, unexpected
    exception) ends the attempts immediately.

    Args:
        movie_id (int): TMDB movie ID

    Returns:
        str: Full URL of the poster image, or PLACEHOLDER_POSTER when no
            poster could be obtained
    """
    endpoint = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={TMDB_API_KEY}&language=en-US"
    transient_errors = (requests.exceptions.Timeout, requests.exceptions.ConnectionError)

    for _ in range(3):
        try:
            response = requests.get(endpoint, timeout=10)
            if response.status_code != 200:
                break
            poster_path = response.json().get('poster_path')
            if not poster_path:
                break
            # poster_path already begins with '/', so it is appended directly
            return f"https://image.tmdb.org/t/p/w500{poster_path}"
        except transient_errors:
            # Transient failure: go round again (the loop ends by itself after 3 tries)
            continue
        except Exception as e:
            print(f"Error fetching poster: {e}")
            break

    return PLACEHOLDER_POSTER

# Smoke test using The Godfather's TMDB id (238)
test_poster_url = fetch_poster(238)
print(f"Poster URL for The Godfather:\n{test_poster_url}")

Poster URL for The Godfather:
https://via.placeholder.com/500x750.png?text=No+Poster+Available


## Save Model and Data with Pickle

In [20]:
# Save the processed movie dataframe. The `with` block closes (and flushes) the
# file before it is read back further down.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(movies, movies_file)
print("✅ Saved movies.pkl")

# Save the similarity matrix
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)
print("✅ Saved similarity.pkl")

# Verify by loading them back. These files were written just above; pickle
# should never be used to load files from an untrusted source.
with open('movies.pkl', 'rb') as movies_file:
    movies_loaded = pickle.load(movies_file)
with open('similarity.pkl', 'rb') as similarity_file:
    similarity_loaded = pickle.load(similarity_file)

# Compare contents rather than only shapes, so the integrity line reflects a
# real round-trip check.
round_trip_ok = movies_loaded.equals(movies) and np.array_equal(similarity_loaded, similarity)

print("\n📦 Verification:")
print(f"  movies.pkl shape: {movies_loaded.shape}")
print(f"  similarity.pkl shape: {similarity_loaded.shape}")
print(f"  Data integrity: {'✅ Match!' if round_trip_ok else '❌ Mismatch!'}")

✅ Saved movies.pkl
✅ Saved similarity.pkl

📦 Verification:
  movies.pkl shape: (9985, 3)
  similarity.pkl shape: (9985, 9985)
  Data integrity: ✅ Match!


## Build the Streamlit Web Application

In [21]:
%%writefile app.py
import streamlit as st
import pickle
import pandas as pd
import requests

# ─── Page Configuration ───
# Kept as the first Streamlit call in the script. The title and icon use the
# clapper-board emoji (restored from its mis-encoded form).
st.set_page_config(
    page_title="🎬 Netflix Movie Recommender",
    page_icon="🎬",
    layout="wide"
)

# ─── Custom CSS for Netflix-style dark theme & animated carousel ───
# The stylesheet below is passed to st.markdown with unsafe_allow_html=True.
# It defines rules for the app background (.stApp), the <h1> title, .subtitle,
# movie cards, a slide-in animation, Streamlit buttons and the selectbox label.
# NOTE(review): .movie-card and .animate-card are defined here, but the markup
# emitted later in this script only uses the .subtitle and .movie-title classes.
st.markdown("""
<style>
    /* Dark background */
    .stApp {
        background-color: #141414;
        color: #ffffff;
    }
    
    /* Title styling */
    h1 {
        color: #E50914 !important;
        text-align: center;
        font-family: 'Helvetica Neue', sans-serif;
        font-weight: 700;
        text-shadow: 2px 2px 4px rgba(0,0,0,0.5);
    }
    
    /* Subtitle */
    .subtitle {
        text-align: center;
        color: #b3b3b3;
        font-size: 1.1rem;
        margin-bottom: 2rem;
    }
    
    /* Movie card styling */
    .movie-card {
        background: #1a1a2e;
        border-radius: 12px;
        padding: 10px;
        text-align: center;
        transition: transform 0.3s ease, box-shadow 0.3s ease;
        border: 1px solid #2a2a4a;
    }
    .movie-card:hover {
        transform: scale(1.05);
        box-shadow: 0 8px 25px rgba(229, 9, 20, 0.3);
    }
    .movie-card img {
        border-radius: 8px;
        width: 100%;
    }
    .movie-title {
        color: #ffffff;
        font-size: 0.95rem;
        font-weight: 600;
        margin-top: 8px;
        min-height: 45px;
    }
    
    /* Carousel animation */
    @keyframes slideIn {
        from { opacity: 0; transform: translateY(30px); }
        to { opacity: 1; transform: translateY(0); }
    }
    .animate-card {
        animation: slideIn 0.6s ease forwards;
    }
    .animate-card:nth-child(2) { animation-delay: 0.1s; }
    .animate-card:nth-child(3) { animation-delay: 0.2s; }
    .animate-card:nth-child(4) { animation-delay: 0.3s; }
    .animate-card:nth-child(5) { animation-delay: 0.4s; }
    
    /* Button styling */
    .stButton > button {
        background-color: #E50914 !important;
        color: white !important;
        border: none !important;
        border-radius: 8px !important;
        padding: 0.6rem 2rem !important;
        font-size: 1.1rem !important;
        font-weight: 600 !important;
        transition: all 0.3s ease !important;
    }
    .stButton > button:hover {
        background-color: #f40612 !important;
        box-shadow: 0 4px 15px rgba(229, 9, 20, 0.4) !important;
        transform: translateY(-2px) !important;
    }
    
    /* Selectbox styling */
    .stSelectbox label {
        color: #b3b3b3 !important;
        font-size: 1rem !important;
    }
</style>
""", unsafe_allow_html=True)

# ─── Load Data ───
@st.cache_resource
def load_data():
    """Load the pickled movie table and similarity matrix.

    Cached with `st.cache_resource` so reruns and sessions share one in-memory
    object. `st.cache_data` would return a fresh copy of the large similarity
    matrix on every rerun. The returned objects must therefore be treated as
    read-only.

    Returns:
        tuple: (movies, similarity) as unpickled from 'movies.pkl' and
            'similarity.pkl' in the working directory.
    """
    # NOTE: unpickling can execute arbitrary code. Only load .pkl files that
    # you generated yourself.
    with open('movies.pkl', 'rb') as movies_file:
        movies = pickle.load(movies_file)
    with open('similarity.pkl', 'rb') as similarity_file:
        similarity = pickle.load(similarity_file)
    return movies, similarity

movies, similarity = load_data()

# ─── TMDB API Configuration ───
import os  # imported here so this section brings its own dependency

# The API key is read from the environment instead of being committed to the
# source file, e.g.  TMDB_API_KEY=<your key> streamlit run app.py
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "")
PLACEHOLDER_POSTER = "https://via.placeholder.com/500x750.png?text=No+Poster+Available"

if not TMDB_API_KEY:
    st.warning("TMDB_API_KEY is not set; posters will be replaced by a placeholder image.")

def fetch_poster(movie_id):
    """Fetch a movie poster URL from the TMDB API, retrying transient failures.

    Up to three requests are made. Timeouts and connection errors trigger a
    retry; any other outcome (non-200 status, missing poster, unexpected
    exception) ends the attempts immediately.

    Args:
        movie_id (int): TMDB movie ID.

    Returns:
        str: Full poster image URL, or PLACEHOLDER_POSTER when the key is
            missing or no poster could be obtained.
    """
    if not TMDB_API_KEY:
        # Without a key every request would be rejected; skip the network call.
        return PLACEHOLDER_POSTER

    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    # Let requests build and encode the query string
    params = {"api_key": TMDB_API_KEY, "language": "en-US"}

    for _ in range(3):
        try:
            response = requests.get(url, params=params, timeout=10)
            if response.status_code != 200:
                break
            poster_path = response.json().get('poster_path')
            if poster_path:
                # poster_path already begins with '/', so it is appended directly
                return f"https://image.tmdb.org/t/p/w500{poster_path}"
            break
        except (requests.exceptions.Timeout, requests.exceptions.ConnectionError):
            # Transient failure: retry (the loop ends by itself after 3 tries)
            continue
        except Exception:
            # Best effort: any other failure falls back to the placeholder
            break
    return PLACEHOLDER_POSTER

def recommend(movie_title):
    """Get the top 5 similar movies and their poster URLs.

    Args:
        movie_title (str): Title of the selected movie (exact match).

    Returns:
        list: Recommended movie titles, most similar first.
        list: Poster URL for each recommended movie, in the same order.

    Raises:
        IndexError: If `movie_title` is not present in `movies`.
    """
    movie_index = movies[movies['title'] == movie_title].index[0]
    distances = similarity[movie_index]

    # Rank every movie by similarity, highest first (ties keep row order)
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1], reverse=True)

    # Remove the selected movie explicitly rather than assuming it is ranked first;
    # another row with identical tags also scores 1.0 and may sort ahead of it.
    neighbour_indices = [idx for idx, _ in ranked if idx != movie_index][:5]

    recommended_titles = []
    recommended_posters = []

    for idx in neighbour_indices:
        row = movies.iloc[idx]
        recommended_titles.append(row['title'])
        recommended_posters.append(fetch_poster(row['id']))

    return recommended_titles, recommended_posters

# ─── App Header ───
# Emoji and the em dash below are restored from their mis-encoded forms.
st.markdown("<h1>🎬 Netflix-Style Movie Recommender</h1>", unsafe_allow_html=True)
st.markdown('<p class="subtitle">Discover movies you\'ll love — powered by Machine Learning</p>', unsafe_allow_html=True)

# ─── Divider ───
st.markdown("---")

# ─── Movie Selection ───
# Wide column for the selectbox, narrow column for the button
col_select, col_btn = st.columns([3, 1])

with col_select:
    selected_movie = st.selectbox(
        "🔍 Choose a movie you like:",
        movies['title'].values,
        index=0
    )

with col_btn:
    # Spacer line break so the button sits lower in its column
    st.markdown("<br>", unsafe_allow_html=True)
    show_btn = st.button("🎯 Show Recommendations")

# ─── Display Recommendations ───
import html  # stdlib; escapes titles before they are injected into raw HTML

if show_btn:
    with st.spinner("🔄 Finding similar movies..."):
        titles, posters = recommend(selected_movie)

    st.markdown("---")
    st.markdown(f"### 🍿 Movies similar to **{selected_movie}**:")
    st.markdown("")

    # Always lay out 5 columns, but fill only as many as there are results.
    # zip() stops at the shortest sequence, so a short result list cannot
    # raise IndexError.
    cols = st.columns(5)

    for col, title, poster in zip(cols, titles, posters):
        with col:
            st.image(poster, use_container_width=True)
            # The title comes from the dataset and is rendered with
            # unsafe_allow_html, so escape it to keep '<', '>' and '&' literal.
            st.markdown(f"""
            <div class="movie-title" style="text-align:center; color:#ffffff;
                font-size:0.95rem; font-weight:600; margin-top:4px;">
                {html.escape(str(title))}
            </div>
            """, unsafe_allow_html=True)

# ─── Footer ───
# Heart emoji restored from its mis-encoded form.
st.markdown("---")
st.markdown(
    '<p style="text-align:center; color:#666; font-size:0.85rem;">'
    'Built with ❤️ using Streamlit & TMDB API | Content-Based Recommendation Engine'
    '</p>',
    unsafe_allow_html=True
)

Overwriting app.py


## Run the Streamlit App

In [22]:
# Launch the Streamlit app (run this in your terminal)
# !streamlit run app.py

# Or run in background:
# !nohup streamlit run app.py &

# Symbols restored from their mis-encoded forms
print("▶ To launch the app, run in your terminal:")
print("  streamlit run app.py")
print("\n🌐 The app will be available at: http://localhost:8501")

▶ To launch the app, run in your terminal:
  streamlit run app.py

🌐 The app will be available at: http://localhost:8501
