In [35]:
import ast
import numpy as np
import pandas as pd
import tabulate as tb

In [36]:
credits = pd.read_csv("tmdb_5000_credits.csv")
movies = pd.read_csv("tmdb_5000_movies.csv")

In [37]:
# print(tb.tabulate(movies.head(1), headers='keys', tablefmt='grid'))
# print(tb.tabulate(credits.head(1), headers='keys', tablefmt='grid'))

In [38]:
column_names = movies.columns.tolist()
print(len(column_names), column_names)
column_names = credits.columns.tolist()
print(len(column_names), column_names)

22 ['id', 'budget', 'genres', 'homepage', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'cast', 'crew']
4 ['movie_id', 'title', 'cast', 'crew']


### Columns which are needed for date preprocessing

id, genres, original_title, overview, keywords, cast, crew

In [39]:
movies = movies.rename(columns={'original_title': 'title'})
credits = credits.rename(columns={'movie_id': 'id'})

movies = movies.loc[:, ~movies.columns.duplicated()]
credits = credits.loc[:, ~credits.columns.duplicated()]


movies = movies[['id', 'title', 'genres', 'overview', 'keywords']]
credits = credits[['id', 'title', 'cast', 'crew']]

movies = movies.merge(credits, on=['id', 'title'])

movies = movies[['id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']]
print(movies.columns.tolist())



['id', 'title', 'genres', 'overview', 'keywords', 'cast', 'crew']


In [40]:
print(movies.isnull().sum())

id          0
title       0
genres      0
overview    3
keywords    0
cast        0
crew        0
dtype: int64


In [41]:
movies.dropna(inplace=True)

In [42]:
print(movies.isnull().sum())


id          0
title       0
genres      0
overview    0
keywords    0
cast        0
crew        0
dtype: int64


### For checking Duplicate data

In [43]:
print(movies.duplicated().sum())

0


In [44]:
print(movies['genres'][0])
print(movies.iloc[0].genres)

[{"id": 80, "name": "Crime"}, {"id": 35, "name": "Comedy"}]
[{"id": 80, "name": "Crime"}, {"id": 35, "name": "Comedy"}]


### Function to convert for genres, keywords,...
```bash
'[{"id": 80, "name": "Crime"}, {"id": 35, "name": "Comedy"}]' --> [Crime, Comedy]

```

In [45]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj): 
        L.append(i['name']) 
    return L

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# print(movies['genres'])
print(movies['keywords'])

0       [hotel, new year's eve, witch, bet, hotel room...
1       [android, galaxy, hermit, death star, lightsab...
2       [father son relationship, harbor, underwater, ...
3       [vietnam veteran, hippie, mentally disabled, r...
4       [male nudity, female nudity, adultery, midlife...
                              ...                        
4536                       [adventure, fairy-tale figure]
4537                                                   []
4538                                                   []
4539                    [christian film, sex trafficking]
4540                                                   []
Name: keywords, Length: 4539, dtype: object


#### Only taking top 3 actors from cast

In [46]:
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj): 
        if counter != 3:
            L.append(i['name'])
            counter += 1
        else:
            break 
    return L

movies['cast'] = movies['cast'].apply(convert3)
print(movies['cast'])

0            [Tim Roth, Antonio Banderas, Jennifer Beals]
1             [Mark Hamill, Harrison Ford, Carrie Fisher]
2       [Albert Brooks, Ellen DeGeneres, Alexander Gould]
3                  [Tom Hanks, Robin Wright, Gary Sinise]
4             [Kevin Spacey, Annette Bening, Thora Birch]
                              ...                        
4536    [Antonio Banderas, Salma Hayek, Zach Galifiana...
4537    [Lisa Hart Carroll, Michael Des Barres, Paul D...
4538         [Roni Akurati, Brighton Sharbino, Jason Lee]
4539        [Nicole Smolen, Kim Baldwin, Ariana Stephens]
4540                                                   []
Name: cast, Length: 4539, dtype: object


#### Taking director from crew

In [47]:
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break 
    return L

movies['crew'] = movies['crew'].apply(fetch_director)
# print(movies['crew'])



#### To remove spaces from genres, keywords, cast and crew

In [48]:
# movies.head()

In [49]:
movies['genres'] = movies['genres'].apply(lambda x :[i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x :[i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x :[i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x :[i.replace(" ", "") for i in x])
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [50]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [52]:
movies = movies[['id', 'title', 'tags']]
# print(movies.head())

In [53]:
movies['tags'] = movies['tags'].apply(lambda x: " ".join(x))
# movies['tags'] = movies['tags'].apply(lambda x: x.replace("   ", " ").replace(" ", ""))

In [54]:
print(movies['tags'][0])


It's Ted the Bellhop's first night on the job...and the hotel's very unusual guests are about to place him in some outrageous predicaments. It seems that this evening's room service is serving up one unbelievable happening after another. Crime Comedy hotel newyear'seve witch bet hotelroom sperm losangeles hoodlum womandirector episodefilm TimRoth AntonioBanderas JenniferBeals AllisonAnders


#### Remving unnessary words/ numbers by steming
```bash
like giving ['loving', 'loved', 'loves', 'love'] to get root term ['love', 'love', 'love', 'love', ]

like giving ['dancing', 'danced', 'dances', 'dance'] to get root term ['danc', 'danc', 'danc', 'danc', ]
```

In [55]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y = []

    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

In [56]:
movies['tags'] = movies['tags'].apply(stem)


In [57]:
movies['tags'][0]

"it' ted the bellhop' first night on the job...and the hotel' veri unusu guest are about to place him in some outrag predicaments. it seem that thi evening' room servic is serv up one unbeliev happen after another. crime comedi hotel newyear'sev witch bet hotelroom sperm losangel hoodlum womandirector episodefilm timroth antoniobandera jenniferb allisonand"

#### Converting the text data into vectors to find the similarity between different rows(movies)and choise the closest vectors(most similar movies)

In [58]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features=5000, stop_words='english') 

In [59]:
vector = cv.fit_transform(movies['tags']).toarray()

In [60]:
feature_names = cv.get_feature_names_out()

# print(feature_names[:500])

##### To find similarity of one movies with all the other movies, we need to find the cosine angle of that movies with other movies vector and smaller the angle greater the similarity between the movies


Cosine similarity is used to measure how similar two items (e.g., movies) are by comparing the **angle between their vector representations**.

---

## üîç How It Works

Each movie is converted into a **vector** in high-dimensional space (based on keywords, genres, tags, etc.).  
We then compute the **cosine of the angle** between two vectors.

### **Angle Meaning**
- **Small angle ‚Üí similar movies**  
  Example: Both are *Action* movies with similar keywords.
- **Large angle ‚Üí dissimilar movies**  
  Example: *Horror* vs *Romance*.

---

## üìä Cosine Similarity Score (0 to 1)

| Score | Meaning |
|-------|---------|
| **1** | Movies are highly similar |
| **0** | Movies are completely unrelated |

- **Closer to 1 ‚Üí high similarity**  
- **Closer to 0 ‚Üí low similarity**

---


In [61]:
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vector)

##### The similarity of movie to itself (1,1) is 1


In [62]:
similarity[0]   # All diagonals elements are 1

array([1.        , 0.        , 0.        , ..., 0.02961744, 0.        ,
       0.        ], shape=(4539,))

In [63]:
movies['title'].head()

0         Four Rooms
1          Star Wars
2       Finding Nemo
3       Forrest Gump
4    American Beauty
Name: title, dtype: object

In [64]:
def recommed(movie):
    movies_index = movies[movies['title'] == movie].index[0]
    distances = similarity[movies_index]
    movies_list = sorted(list(enumerate(distances)), reverse= True, key= lambda x:x[1])[1:6]
    for i in movies_list:
        print(movies.iloc[i[0]].title)


recommed("Star Wars")

The Empire Strikes Back
Return of the Jedi
Shanghai Noon
Star Wars: Episode III - Revenge of the Sith
Star Wars: Episode I - The Phantom Menace


#### Creating a folder to store similarity to all the movies with each other, and movies data to fetch movies id and its title to display for the website


In [65]:
import pickle
movies.to_pickle('movies_data.pkl')
pickle.dump(similarity, open('similarity.pkl', 'wb'))