In [2]:
import pandas as pd

movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

movies = movies.merge(credits, left_on='id', right_on='movie_id')
movies = movies[['id', 'title_x', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies.rename(columns={'title_x': 'title'}, inplace=True)

movies['overview'] = movies['overview'].fillna('')


In [3]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [4]:
credits.head(1)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [5]:
movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [6]:
movies.dropna(inplace=True)

In [7]:
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [8]:
#[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]
#list of dict
#['Action', 'Adventure', 'SciFi']

In [9]:
import ast

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

def convert_cast(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
        if len(L) == 3:
            break
    return L

def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(fetch_director)


In [10]:
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

movies['tags'] = movies['overview'] + " " + \
                 movies['genres'].apply(lambda x: " ".join(x)) + " " + \
                 movies['keywords'].apply(lambda x: " ".join(x)) + " " + \
                 movies['cast'].apply(lambda x: " ".join(x)) + " " + \
                 movies['crew'].apply(lambda x: " ".join(x))


In [51]:
new_df=movies[['id','title', 'tags']]

In [52]:
new_df.head()

Unnamed: 0,id,title,tags
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...
4,49529,John Carter,"John Carter is a war-weary, former military ca..."


In [13]:
new_df['tags']=new_df['tags'].apply(lambda x:"".join(x))
new_df.head()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:"".join(x))


Unnamed: 0,title,tags
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,Following the death of District Attorney Harve...
4,John Carter,"John Carter is a war-weary, former military ca..."


In [14]:
import nltk

In [15]:
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [16]:
def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [17]:
new_df.head()

Unnamed: 0,title,tags
0,Avatar,"In the 22nd century, a paraplegic Marine is di..."
1,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha..."
2,Spectre,A cryptic message from Bond’s past sends him o...
3,The Dark Knight Rises,Following the death of District Attorney Harve...
4,John Carter,"John Carter is a war-weary, former military ca..."


In [18]:
new_df['tags']=new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(lambda x:x.lower())


In [19]:
new_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [20]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,stop_words='english')

In [21]:
new_df['tags']=new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags']=new_df['tags'].apply(stem)


In [53]:
vector=cv.fit_transform(new_df['tags']).toarray()

In [54]:
vector.shape


(4803, 5000)

In [55]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [56]:
ps.stem('loved')

'love'

In [57]:
from sklearn.metrics.pairwise import cosine_similarity

In [73]:
similarity=cosine_similarity(vector)

In [74]:
similarity

array([[1.        , 0.08980265, 0.05986843, ..., 0.0248452 , 0.02777778,
        0.        ],
       [0.08980265, 1.        , 0.06451613, ..., 0.02677398, 0.        ,
        0.        ],
       [0.05986843, 0.06451613, 1.        , ..., 0.02677398, 0.        ,
        0.        ],
       ...,
       [0.0248452 , 0.02677398, 0.02677398, ..., 1.        , 0.0745356 ,
        0.04774099],
       [0.02777778, 0.        , 0.        , ..., 0.0745356 , 1.        ,
        0.05337605],
       [0.        , 0.        , 0.        , ..., 0.04774099, 0.05337605,
        1.        ]])

In [75]:
def recommend(movie):
    movies_index=new_df[new_df['title']==movie].index[0]
    distances=similarity[movies_index]
    movies_list=sorted(list(enumerate(distances)), reverse=True, key=lambda x:x[1])[1:6]
    for i in movies_list:
        print (new_df.iloc[i[0]].title)

In [76]:
recommend('Avatar')

Titan A.E.
Small Soldiers
Independence Day
Aliens vs Predator: Requiem
Ender's Game


In [77]:
import pandas as pd

# Load MovieLens ratings and titles
ratings = pd.read_csv('ratings.csv')     # From MovieLens
ml_movies = pd.read_csv('movies.csv')    # From MovieLens

# Load your TMDB data
tmdb_movies = pd.read_csv('tmdb_5000_movies.csv')


In [78]:
# Clean titles for easier matching
ml_movies['title_clean'] = ml_movies['title'].apply(lambda x: x.lower().split(" (")[0])
tmdb_movies['title_clean'] = tmdb_movies['title'].str.lower()

# Merge on cleaned title
merged = pd.merge(ml_movies, tmdb_movies, on='title_clean')

# Check match success
merged[['movieId', 'id', 'title_x', 'title_y']].head()


Unnamed: 0,movieId,id,title_x,title_y
0,1,862,Toy Story (1995),Toy Story
1,10,710,GoldenEye (1995),GoldenEye
2,14,10858,Nixon (1995),Nixon
3,15,1408,Cutthroat Island (1995),Cutthroat Island
4,16,524,Casino (1995),Casino


In [79]:
# Merge ratings with the matched movieId → tmdb_id mapping
ratings = ratings.merge(merged[['movieId', 'id']], on='movieId')

# Rename 'id' to 'tmdb_id'
ratings.rename(columns={'id': 'tmdb_id'}, inplace=True)

# Final ratings file
ratings = ratings[['userId', 'tmdb_id', 'rating']]
ratings.columns = ['userId', 'movieId', 'rating']  # For Surprise library
ratings.head()


Unnamed: 0,userId,movieId,rating
0,1,862,4.0
1,1,755,3.0
2,1,13685,5.0
3,1,197,4.0
4,1,11780,5.0


In [80]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [81]:
reader = Reader(rating_scale=(0.5, 5.0))

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)


trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

svd = SVD()
svd.fit(trainset)


predictions = svd.test(testset)
print("RMSE:", accuracy.rmse(predictions))


RMSE: 0.8680
RMSE: 0.8680121253865671


In [82]:
def hybrid_recommend(user_id, movie_title, top_n=5):
    try:
        movie_index = movies[movies['title'] == movie_title].index[0]
    except IndexError:
        return ["Movie not found in database."]

    content_scores = list(enumerate(similarity[movie_index]))
    content_scores = sorted(content_scores, key=lambda x: x[1], reverse=True)[1:21]

    hybrid_scores = []
    for i in content_scores:
        tmdb_id = movies.iloc[i[0]].id
        try:
            rating_pred = svd.predict(user_id, tmdb_id).est
        except:
            rating_pred = 0
        hybrid_scores.append((movies.iloc[i[0]].title, rating_pred))

    hybrid_scores = sorted(hybrid_scores, key=lambda x: x[1], reverse=True)
    return [i[0] for i in hybrid_scores[:top_n]]


In [95]:
hybrid_recommend(user_id=1, movie_title='Iron Man', top_n=5)


['X-Men',
 'Avengers: Age of Ultron',
 'Captain America: The First Avenger',
 'X-Men: Days of Future Past',
 'Guardians of the Galaxy']

In [96]:
import pickle

In [97]:
pickle.dump(new_df,open('movies.pkl', 'wb'))

In [98]:
pickle.dump(new_df.to_dict(),open('movies_dict.pkl', 'wb'))

In [99]:
pickle.dump(similarity,open('similarity.pkl', 'wb'))

In [100]:
import pickle
pickle.dump(svd, open('svd_model.pkl', 'wb'))