In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
movies = pd.read_csv("movies.csv")

In [8]:
movies.head(2)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski


In [9]:
movies.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [10]:
movies.shape

(4803, 24)

In [11]:
movies = movies[["id","genres","keywords","original_language","title","overview","tagline","cast","director"]]
movies.head(2)

Unnamed: 0,id,genres,keywords,original_language,title,overview,tagline,cast,director
0,19995,Action Adventure Fantasy Science Fiction,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",Enter the World of Pandora.,Sam Worthington Zoe Saldana Sigourney Weaver S...,James Cameron
1,285,Adventure Fantasy Action,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","At the end of the world, the adventure begins.",Johnny Depp Orlando Bloom Keira Knightley Stel...,Gore Verbinski


In [12]:
movies.shape

(4803, 9)

In [13]:
movies.columns

Index(['id', 'genres', 'keywords', 'original_language', 'title', 'overview',
       'tagline', 'cast', 'director'],
      dtype='object')

In [14]:
movies["tags"] = movies["keywords"] + movies["overview"] + movies["tagline"] + movies["cast"] + movies["director"]

In [15]:
movies["original_language"].describe()

count     4803
unique      37
top         en
freq      4505
Name: original_language, dtype: object

In [16]:
movies = movies.drop(columns=["cast","director","keywords","overview","tagline","original_language"])

In [17]:
movies["tags"]

0       culture clash future space war space colony so...
1       ocean drug abuse exotic island east india trad...
2       spy based on novel secret agent sequel mi6A cr...
3       dc comics crime fighter terrorist secret ident...
4       based on novel mars medallion space travel pri...
                              ...                        
4798    united states\u2013mexico barrier legs arms pa...
4799                                                  NaN
4800                                                  NaN
4801                                                  NaN
4802                                                  NaN
Name: tags, Length: 4803, dtype: object

**Missing values**

In [18]:
movies.isnull().sum()

id           0
genres      28
title        0
tags      1046
dtype: int64

In [19]:
movies["genres"] = movies["genres"].fillna("NO genre")

In [20]:
movies["tags"] = movies["tags"].fillna("NO information")

In [21]:
movies.isnull().sum()

id        0
genres    0
title     0
tags      0
dtype: int64

**Duplicate Values**

In [22]:
movies.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
4798    False
4799    False
4800    False
4801    False
4802    False
Length: 4803, dtype: bool

In [23]:
movies

Unnamed: 0,id,genres,title,tags
0,19995,Action Adventure Fantasy Science Fiction,Avatar,culture clash future space war space colony so...
1,285,Adventure Fantasy Action,Pirates of the Caribbean: At World's End,ocean drug abuse exotic island east india trad...
2,206647,Action Adventure Crime,Spectre,spy based on novel secret agent sequel mi6A cr...
3,49026,Action Crime Drama Thriller,The Dark Knight Rises,dc comics crime fighter terrorist secret ident...
4,49529,Action Adventure Science Fiction,John Carter,based on novel mars medallion space travel pri...
...,...,...,...,...
4798,9367,Action Crime Thriller,El Mariachi,united states\u2013mexico barrier legs arms pa...
4799,72766,Comedy Romance,Newlyweds,NO information
4800,231617,Comedy Drama Romance TV Movie,"Signed, Sealed, Delivered",NO information
4801,126186,NO genre,Shanghai Calling,NO information


In [24]:
# preprocesss
movies["title"] = movies["title"].str.lower()
movies["title"]

0                                         avatar
1       pirates of the caribbean: at world's end
2                                        spectre
3                          the dark knight rises
4                                    john carter
                          ...                   
4798                                 el mariachi
4799                                   newlyweds
4800                   signed, sealed, delivered
4801                            shanghai calling
4802                           my date with drew
Name: title, Length: 4803, dtype: object

In [25]:
movies.iloc[0].genres  
# Multi-label categorical data stored as text tokens

'Action Adventure Fantasy Science Fiction'

In [26]:
movies.tags[0]

'culture clash future space war space colony societyIn the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization.Enter the World of Pandora.Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle RodriguezJames Cameron'

In [27]:
movies.id[0]

19995

In [28]:
movies.genres[0]

'Action Adventure Fantasy Science Fiction'

In [29]:
movies.title[0]

'avatar'

In [30]:
movies["content"] = movies["genres"] + movies["tags"]
movies['content']

0       Action Adventure Fantasy Science Fictioncultur...
1       Adventure Fantasy Actionocean drug abuse exoti...
2       Action Adventure Crimespy based on novel secre...
3       Action Crime Drama Thrillerdc comics crime fig...
4       Action Adventure Science Fictionbased on novel...
                              ...                        
4798    Action Crime Thrillerunited states\u2013mexico...
4799                         Comedy RomanceNO information
4800          Comedy Drama Romance TV MovieNO information
4801                               NO genreNO information
4802                            DocumentaryNO information
Name: content, Length: 4803, dtype: object

In [31]:
movies

Unnamed: 0,id,genres,title,tags,content
0,19995,Action Adventure Fantasy Science Fiction,avatar,culture clash future space war space colony so...,Action Adventure Fantasy Science Fictioncultur...
1,285,Adventure Fantasy Action,pirates of the caribbean: at world's end,ocean drug abuse exotic island east india trad...,Adventure Fantasy Actionocean drug abuse exoti...
2,206647,Action Adventure Crime,spectre,spy based on novel secret agent sequel mi6A cr...,Action Adventure Crimespy based on novel secre...
3,49026,Action Crime Drama Thriller,the dark knight rises,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thrillerdc comics crime fig...
4,49529,Action Adventure Science Fiction,john carter,based on novel mars medallion space travel pri...,Action Adventure Science Fictionbased on novel...
...,...,...,...,...,...
4798,9367,Action Crime Thriller,el mariachi,united states\u2013mexico barrier legs arms pa...,Action Crime Thrillerunited states\u2013mexico...
4799,72766,Comedy Romance,newlyweds,NO information,Comedy RomanceNO information
4800,231617,Comedy Drama Romance TV Movie,"signed, sealed, delivered",NO information,Comedy Drama Romance TV MovieNO information
4801,126186,NO genre,shanghai calling,NO information,NO genreNO information


In [32]:
tfidf = TfidfVectorizer(stop_words="english")
tfidf

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,'english'
,token_pattern,'(?u)\\b\\w\\w+\\b'


In [33]:
tfidf_matrix = tfidf.fit_transform(movies["content"])
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 178615 stored elements and shape (4803, 34748)>

In [34]:
similarity = cosine_similarity(tfidf_matrix)
similarity

array([[1.        , 0.02384228, 0.00791086, ..., 0.        , 0.        ,
        0.        ],
       [0.02384228, 1.        , 0.00938354, ..., 0.        , 0.        ,
        0.        ],
       [0.00791086, 0.00938354, 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.08415495,
        0.09132274],
       [0.        , 0.        , 0.        , ..., 0.08415495, 1.        ,
        0.1566181 ],
       [0.        , 0.        , 0.        , ..., 0.09132274, 0.1566181 ,
        1.        ]])

In [35]:
movies

Unnamed: 0,id,genres,title,tags,content
0,19995,Action Adventure Fantasy Science Fiction,avatar,culture clash future space war space colony so...,Action Adventure Fantasy Science Fictioncultur...
1,285,Adventure Fantasy Action,pirates of the caribbean: at world's end,ocean drug abuse exotic island east india trad...,Adventure Fantasy Actionocean drug abuse exoti...
2,206647,Action Adventure Crime,spectre,spy based on novel secret agent sequel mi6A cr...,Action Adventure Crimespy based on novel secre...
3,49026,Action Crime Drama Thriller,the dark knight rises,dc comics crime fighter terrorist secret ident...,Action Crime Drama Thrillerdc comics crime fig...
4,49529,Action Adventure Science Fiction,john carter,based on novel mars medallion space travel pri...,Action Adventure Science Fictionbased on novel...
...,...,...,...,...,...
4798,9367,Action Crime Thriller,el mariachi,united states\u2013mexico barrier legs arms pa...,Action Crime Thrillerunited states\u2013mexico...
4799,72766,Comedy Romance,newlyweds,NO information,Comedy RomanceNO information
4800,231617,Comedy Drama Romance TV Movie,"signed, sealed, delivered",NO information,Comedy Drama Romance TV MovieNO information
4801,126186,NO genre,shanghai calling,NO information,NO genreNO information


**Recommendor function**

In [36]:
def recommend(movie_title, df, similarity_matrix, top_n=5):
    """
    movie_title: user input
    df: your movies dataframe with 'title' and 'id'
    similarity_matrix: precomputed similarity (cosine) between movies
    top_n: number of recommendations
    """
    
    # 1. Normalize user input
    movie_title = movie_title.lower()
    
    # 2. Find the movie index
    movie_row = df[df['title'].str.lower() == movie_title]
    
    if movie_row.empty:
        return "Movie not found"
    
    movie_index = movie_row.index[0]  # get the row index
    
    # 3. Get similarity scores for this movie with all others
    sim_scores = list(enumerate(similarity_matrix[movie_index]))
    
    # 4. Sort movies by similarity (highest first)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # 5. Pick top_n movies (skip the first one, it's the same movie)
    top_movies = sim_scores[1:top_n+1]
    
    # 6. Get movie titles for these indices
    recommended_titles = [df.iloc[i[0]]['title'] for i in top_movies]
    
    return recommended_titles

In [40]:
recommend("Titanic", movies, similarity, top_n=2)

['the black hole', 'ghost ship']

In [None]:
movie_title = movies["title"].values

In [None]:
q = input("Enter the Movie name").lower().strip()


In [None]:
movie_row= movies[movies["title"]== q]

In [None]:
movie_index = movie_row.index[0]
movie_index

0

In [None]:
for q in movies:
    if q == movie_title.any(): # filter condition
        print()
    else:
        print("no")



no
no
no
no
no
