In [1]:
import pandas as pd

movies_ml = pd.read_csv("movies.csv")
tmdb = pd.read_csv("tmdb_5000_movies.csv")

print("MovieLens shape:", movies_ml.shape)
print("TMDB shape:", tmdb.shape)

MovieLens shape: (9742, 3)
TMDB shape: (4803, 20)


In [2]:
movies_ml = movies_ml[["movieId", "title"]]

tmdb = tmdb[["title", "overview", "genres", "vote_average", "popularity"]]

print(movies_ml.head())
print(tmdb.head())

   movieId                               title
0        1                    Toy Story (1995)
1        2                      Jumanji (1995)
2        3             Grumpier Old Men (1995)
3        4            Waiting to Exhale (1995)
4        5  Father of the Bride Part II (1995)
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                            overview  \
0  In the 22nd century, a paraplegic Marine is di...   
1  Captain Barbossa, long believed to be dead, ha...   
2  A cryptic message from Bond’s past sends him o...   
3  Following the death of District Attorney Harve...   
4  John Carter is a war-weary, former military ca...   

                                              genres  vote_average  popularity  
0  [{"id": 28, "n

In [3]:
print(movies_ml.columns)
print(tmdb.columns)

Index(['movieId', 'title'], dtype='str')
Index(['title', 'overview', 'genres', 'vote_average', 'popularity'], dtype='str')


In [4]:
import re

def clean_title(title):
    title = title.lower()
    title = re.sub(r"\(\d{4}\)", "", title)  # year sil
    title = re.sub(r"[^a-z0-9 ]", "", title)
    return title.strip()

movies_ml["clean_title"] = movies_ml["title"].apply(clean_title)
tmdb["clean_title"] = tmdb["title"].apply(clean_title)

print(movies_ml.columns)
print(tmdb.columns)

Index(['movieId', 'title', 'clean_title'], dtype='str')
Index(['title', 'overview', 'genres', 'vote_average', 'popularity',
       'clean_title'],
      dtype='str')


In [5]:
merged = pd.merge(
    movies_ml,
    tmdb,
    on="clean_title",
    how="inner"
)

print("Merged shape:", merged.shape)

Merged shape: (2838, 8)


In [6]:
merged = merged.dropna(subset=["overview"])
print("After dropping empty overview:", merged.shape)

After dropping empty overview: (2838, 8)


In [7]:
import ast

def extract_genres(genre_str):
    genres = ast.literal_eval(genre_str)
    return [g["name"] for g in genres]

merged["genre_list"] = merged["genres"].apply(extract_genres)

merged[["title_x", "genre_list"]].head()

Unnamed: 0,title_x,genre_list
0,Toy Story (1995),"[Animation, Comedy, Family]"
1,GoldenEye (1995),"[Adventure, Action, Thriller]"
2,Nixon (1995),"[History, Drama]"
3,Cutthroat Island (1995),"[Action, Adventure]"
4,Casino (1995),"[Drama, Crime]"


In [8]:
all_genres = set()

for genres in merged["genre_list"]:
    for g in genres:
        all_genres.add(g)

all_genres = sorted(list(all_genres))

print("Total unique genres:", len(all_genres))
print(all_genres)

Total unique genres: 20
['Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Foreign', 'History', 'Horror', 'Music', 'Mystery', 'Romance', 'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western']


In [9]:
import numpy as np

# Her film için genre vector üret
def create_genre_vector(genres, all_genres):
    return [1 if g in genres else 0 for g in all_genres]

merged["genre_vector"] = merged["genre_list"].apply(
    lambda x: create_genre_vector(x, all_genres)
)

# Kontrol
print("Vector length:", len(merged["genre_vector"].iloc[0]))
merged[["title_x", "genre_vector"]].head()

Vector length: 20


Unnamed: 0,title_x,genre_vector
0,Toy Story (1995),"[0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,GoldenEye (1995),"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Nixon (1995),"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, ..."
3,Cutthroat Island (1995),"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Casino (1995),"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")