In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the movie metadata CSV with pandas' Python parser, treating a
# backslash as the escape character. on_bad_lines='skip' drops rows that
# cannot be parsed without reporting them, so df may hold fewer rows than
# the file has lines.
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/movies/movies_metadata.csv',
    sep=',',
    escapechar='\\',
    engine='python',
    on_bad_lines='skip',
)

In [None]:
# (rows, columns) of the frame as loaded from the CSV.
df.shape

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# List the available column names before choosing which ones to keep.
df.columns

In [None]:
# Remove fully duplicated rows and renumber the index from 0.
df = df.drop_duplicates(ignore_index=True)

In [None]:
# Keep only the columns used to describe and rank a movie.
df = df.loc[
    :,
    ["title", "genres", "tagline", "vote_average", "popularity", "overview"],
]

In [None]:
# Display the frame after narrowing it to the selected columns.
df

In [None]:
# Count missing values per column to decide what to drop or fill.
df.isnull().sum()

In [None]:
# A movie without a title cannot be looked up or recommended, so keep
# only the rows whose title is present.
df = df[df['title'].notna()]

In [None]:
# Treat a missing overview as empty text so it can be concatenated later.
df = df.assign(overview=df['overview'].fillna(''))

In [None]:
# Look at the genres value of the first row to see its format before parsing.
df.iloc[0]['genres']

In [None]:
import ast

In [None]:
def _genre_names(raw):
  """Return the genre names contained in ``raw`` joined by single spaces.

  ``raw`` is expected to be the string form of a list of dicts that each
  carry a ``'name'`` key. Other inputs degrade gracefully instead of
  aborting the whole ``apply``:

  * non-string values (e.g. NaN) yield ``''``;
  * strings that are not a Python literal are returned unchanged, which
    also makes re-running this cell on already-converted values a no-op;
  * literals that are not a list or tuple yield ``''``.
  """
  if not isinstance(raw, str):
    return ''
  try:
    parsed = ast.literal_eval(raw)
  except (ValueError, SyntaxError):
    return raw
  if not isinstance(parsed, (list, tuple)):
    return ''
  return " ".join(
    str(entry['name'])
    for entry in parsed
    if isinstance(entry, dict) and 'name' in entry
  )


# Replace the serialized list of genre dicts with a plain "Name Name ..." string.
df['genres'] = df['genres'].apply(_genre_names)

In [None]:
# Treat a missing tagline as empty text so it can be concatenated later.
df = df.assign(tagline=df['tagline'].fillna(''))

In [None]:
# Re-check missing values per column after the cleaning steps above.
df.isnull().sum()

In [None]:
# Combine the three text fields into one space-separated document per movie.
overview_and_genres = df['overview'] + ' ' + df['genres']
df['tags'] = overview_and_genres + ' ' + df['tagline']

In [None]:
# Preview the frame with the new tags column.
df.head()

In [None]:
# Inspect one combined tags string. With an integer index this looks up
# the row whose index label is 1, not the row at position 1.
df['tags'][1]

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

In [None]:
# Fetch the NLTK resources used below: the stopword lists and the WordNet
# data that WordNetLemmatizer relies on.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# English stopwords held in a set; preprocess_text tests every word against it.
stop_words = set(stopwords.words('english'))

In [None]:
# Single lemmatizer instance reused by preprocess_text for every word.
lemmatizer = WordNetLemmatizer()

In [None]:
def preprocess_text(text):
  """Normalize ``text`` into a space-separated string of lemmatized words.

  The value is converted to ``str`` and lower-cased, every character that
  is neither an ASCII letter nor whitespace is deleted, the result is split
  on whitespace, words found in ``stop_words`` are discarded, and each
  remaining word is passed through ``lemmatizer.lemmatize``.

  NOTE(review): punctuation is deleted rather than replaced by a space, so
  words joined only by punctuation (e.g. "sci-fi") merge into one token.
  """
  lowered = str(text).lower()
  letters_only = re.sub(r'[^a-zA-Z\s]', '', lowered)
  kept = [token for token in letters_only.split() if token not in stop_words]
  return " ".join(lemmatizer.lemmatize(token) for token in kept)

In [None]:
# Clean every tags document with preprocess_text, one row at a time.
df['tags'] = df['tags'].map(preprocess_text)

In [None]:
# Renumber the rows 0..len(df)-1 so that an index label equals the row's
# position; later cells use these numbers to address rows positionally.
df = df.reset_index(drop = True)

In [None]:
# Lookup table: movie title -> row number in df.
indices = pd.Series(df.index, index=df['title'])
# Series.drop_duplicates() compares the *values* (the row numbers, which are
# already unique), so it would leave repeated titles in place and a lookup
# for such a title would return several rows. Deduplicate on the index
# instead, keeping the first occurrence of each title.
indices = indices[~indices.index.duplicated(keep='first')]
indices

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 50,000
# terms and scikit-learn's built-in English stop-word list applied.
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    max_features=50000,
)

In [None]:
# Fit the vectorizer on the cleaned tags and transform them in one step;
# the result has one row per entry of df['tags'], in the same order.
tfidf_matrix = tfidf.fit_transform(df['tags'])

In [None]:
# Show the matrix summary produced by its repr.
tfidf_matrix

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
def recommend(title, n = 10):
  """Return the titles of the ``n`` movies most similar to ``title``.

  Similarity is the cosine similarity between the TF-IDF row of ``title``
  and every row of ``tfidf_matrix``.

  Args:
    title: Movie title to look up in ``indices``.
    n: Maximum number of recommendations; values below 0 are treated as 0.

  Returns:
    ``['Movie not found']`` when ``title`` is not in ``indices``; otherwise
    a Series of titles ordered from most to least similar that never
    contains the queried row itself.
  """
  if title not in indices:
    return ['Movie not found']

  idx = indices[title]
  # A title that occurs more than once makes the lookup return a Series of
  # row numbers; use the first so exactly one TF-IDF row is compared.
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]
  idx = int(idx)

  sim_score = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
  ranked = sim_score.argsort()[::-1]
  # Remove the query row explicitly instead of assuming it is ranked first:
  # ties (identical or empty tags) can place another row ahead of it.
  similar_idx = ranked[ranked != idx][:max(n, 0)]
  return df['title'].iloc[similar_idx]


In [None]:
# Try the recommender: ask for the five closest matches to 'Avatar 2'.
recommend('Avatar 2', n=5)

In [None]:
import pickle

# Persist the fitted artifacts. Each file is opened in a `with` block so the
# handle is flushed and closed as soon as the dump finishes.
with open('tfidf_matrix.pkl', 'wb') as matrix_file:
    pickle.dump(tfidf_matrix, matrix_file)

with open('indices.pkl', 'wb') as indices_file:
    pickle.dump(indices, indices_file)

df.to_pickle('df.pkl')

with open('tfidf.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)