In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
df=pd.read_csv('/content/drive/MyDrive/netflix_titles.csv')

In [6]:
df.columns = df.columns.str.lower().str.strip()

In [7]:
df['listed_in'] = df['listed_in'].fillna('')
df['cast'] = df['cast'].fillna('')
df['director'] = df['director'].fillna('')
df['description'] = df['description'].fillna('')

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['listed_in'])

tfidf_matrix.shape

(8807, 44)

In [11]:
tfidf_description = TfidfVectorizer(stop_words='english')
tfidf_matrix_description = tfidf_description.fit_transform(df['description'])

tfidf_matrix_description.shape

(8807, 18895)

In [13]:
tfidf_cast = TfidfVectorizer(stop_words='english')
tfidf_matrix_cast = tfidf_cast.fit_transform(df['cast'])
tfidf_director = TfidfVectorizer(stop_words='english')
tfidf_matrix_director = tfidf_director.fit_transform(df['director'])
tfidf_type = TfidfVectorizer(stop_words='english')
tfidf_matrix_type = tfidf_type.fit_transform(df['type'])

In [14]:
from sklearn.preprocessing import MultiLabelBinarizer
# Split the comma-separated strings into lists of genres, directors, and cast members
df['listed_in'] = df['listed_in'].apply(lambda x: [i.strip() for i in x.split(',')])
df['cast'] = df['cast'].apply(lambda x: [i.strip() for i in x.split(',')])
df['director'] = df['director'].apply(lambda x: [i.strip() for i in x.split(',')])
# Initialize MultiLabelBinarizer
mlb_genres = MultiLabelBinarizer()
mlb_directors = MultiLabelBinarizer()
mlb_cast = MultiLabelBinarizer()
# Fit and transform the data
genres_encoded = mlb_genres.fit_transform(df['listed_in'])
directors_encoded = mlb_directors.fit_transform(df['director'])
cast_encoded = mlb_cast.fit_transform(df['cast'])
# Convert to DataFrames for easier handling
genres_df = pd.DataFrame(genres_encoded, columns=mlb_genres.classes_)
directors_df = pd.DataFrame(directors_encoded, columns=mlb_directors.classes_)
cast_df = pd.DataFrame(cast_encoded, columns=mlb_cast.classes_)
# Display the shapes of the new DataFrames
print("Genres shape:", genres_df.shape)
print("Directors shape:", directors_df.shape)
print("Cast shape:", cast_df.shape)

Genres shape: (8807, 42)
Directors shape: (8807, 4994)
Cast shape: (8807, 36440)


In [16]:
import scipy.sparse

# Combine the sparse matrices
feature_matrix = scipy.sparse.hstack([
    tfidf_matrix,
    tfidf_matrix_description,
    tfidf_matrix_cast,
    tfidf_matrix_director,
    tfidf_matrix_type,
    genres_encoded,
    directors_encoded,
    cast_encoded
])

print("Combined feature matrix shape:", feature_matrix.shape)

Combined feature matrix shape: (8807, 117493)


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(feature_matrix, feature_matrix)

print("Cosine similarity matrix shape:", cosine_sim.shape)

# Function to get recommendations
def get_recommendations(title, cosine_sim=cosine_sim, df=df, indices=None):
  if indices is None:
    # Create a reverse map of indices and show titles
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()

  # Get the index of the show that matches the title
  idx = indices[title]

  # Get the pairwise similarity scores with that show
  sim_scores = list(enumerate(cosine_sim[idx]))

  # Sort the shows based on the similarity scores
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  # Get the scores of the 10 most similar shows (excluding the show itself)
  sim_scores = sim_scores[1:11]

  # Get the show indices
  show_indices = [i[0] for i in sim_scores]

  # Return the top 10 most similar shows
  return df['title'].iloc[show_indices]

# Example usage:
# print(get_recommendations('The Crown'))
# print(get_recommendations('Stranger Things'))

Cosine similarity matrix shape: (8807, 8807)


In [None]:
!pip install streamlit
!npm install localtunnel

import streamlit as st

# Create a reverse map of indices and show titles (needed inside the function)
indices = pd.Series(df.index, index=df['title']).drop_duplicates()

st.title('Netflix Show Recommender')

# Dropdown to select a show
selected_show = st.selectbox('Select a show:', df['title'].tolist())

if st.button('Get Recommendations'):
  if selected_show:
    recommendations = get_recommendations(selected_show, cosine_sim, df, indices)
    st.subheader('Recommended Shows:')
    for rec in recommendations:
      st.write(rec)
  else:
    st.warning('Please select a show.')

# Save the streamlit app to a file
streamlit_code = """
import streamlit as st
import pandas as pd
import numpy as np
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# Load your data and pre-trained model components
# This part assumes you have saved your processed data and models.
# For this example, we'll reuse the objects created in your notebook.

# Assuming df, cosine_sim, and indices are available from the notebook execution
# In a real deployment, you would load these from saved files (e.g., pickle)

# Function to get recommendations (copy-pasted from your notebook)
def get_recommendations(title, cosine_sim, df, indices):
  if indices is None:
    indices = pd.Series(df.index, index=df['title']).drop_duplicates()

  if title not in indices:
    return ["Show not found in the dataset."]

  idx = indices[title]
  sim_scores = list(enumerate(cosine_sim[idx]))
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
  sim_scores = sim_scores[1:11]
  show_indices = [i[0] for i in sim_scores]
  return df['title'].iloc[show_indices].tolist() # Convert to list for Streamlit

st.title('Netflix Show Recommender')

# Ensure df and indices are available.
# If you were deploying this separately, you'd load them here.
# For this example in Colab, they are in memory.

# Dropdown to select a show
selected_show = st.selectbox('Select a show:', df['title'].tolist())

if st.button('Get Recommendations'):
  if selected_show:
    recommendations = get_recommendations(selected_show, cosine_sim, df, indices)
    st.subheader('Recommended Shows:')
    for rec in recommendations:
      st.write(rec)
  else:
    st.warning('Please select a show.')
"""

with open('app.py', 'w') as f:
  f.write(streamlit_code)

# Run streamlit in the background
!nohup streamlit run app.py &

# Expose the port with localtunnel
!npx localtunnel --port 8501


Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m72.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m97.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl (79 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hI

2025-07-07 08:14:10.822 
  command:

    streamlit run /usr/local/lib/python3.11/dist-packages/colab_kernel_launcher.py [ARGUMENTS]
2025-07-07 08:14:10.843 Session state does not function when running a script without `streamlit run`


nohup: appending output to 'nohup.out'
[1G[0K⠙[1G[0Kyour url is: https://all-suns-double.loca.lt
/content/node_modules/localtunnel/bin/lt.js:81
    throw err;
    ^

Error: connection refused: localtunnel.me:5417 (check your firewall settings)
    at Socket.<anonymous> [90m(/content/[39mnode_modules/[4mlocaltunnel[24m/lib/TunnelCluster.js:52:11[90m)[39m
[90m    at Socket.emit (node:events:524:28)[39m
[90m    at emitErrorNT (node:internal/streams/destroy:169:8)[39m
[90m    at emitErrorCloseNT (node:internal/streams/destroy:128:3)[39m
[90m    at process.processTicksAndRejections (node:internal/process/task_queues:82:21)[39m

Node.js v20.19.0
[1G[0K⠙[1G[0K