**Setup environment and dependencies**

Note: This notebook was exported from a personal Kaggle account.

    - pip install uv
    - pyenv to manage python version
    - using 3.11
    - uv sync
    - unzip data


In [None]:
# Adds the notebook's dependencies with uv (numpy, pandas, nltk, gensim, scikit-learn).
# NOTE(review): versions are unpinned, and this cell is presumably redundant once
# `uv sync` from the setup notes has been run — confirm before relying on it.
!uv add numpy pandas 
!uv add nltk gensim
!uv add scikit-learn


In [None]:
# Stemmer backend switch, read by the stemmer-selection cell further down:
#   True  -> PorterStemmer from gensim
#   False -> PorterStemmer from nltk
USE_GENSIM=True
# USE_GENSIM=False

# Author's timing note: gensim took about 0.6 s and nltk about 2.5 s (each tried twice).


In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

## Steps
**Short description**
- Load datasets into DF using pd
    - Explore datasets
    - shape
    - columns info()
    - datatypes
    - head and head(count)
- combine dataframes into one using merge
    - dropped title from credits and used only movie_id, inplace=True
    - merge(on=Column name)
    - merge with left_on / right_on, because the key columns have different names (id and movie_id); how is inner (the default)
- feature selection
  - Null checks
  - duplicate checks
  - drop null
- data preprocessing
- Vectorization
- Main function to return movies
- Saving Model
---

### Step 1

- Load datasets into DF using pd
    - Explore datasets
    - shape
    - columns info()
    - datatypes
    - head and head(count)

In [None]:
# Load the two TMDB 5000 CSV files; the paths are relative to the notebook's working directory.
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')


In [None]:
print("Movies DF shape:", movies.shape, "Credits DF shape:", credits.shape)

In [None]:
movies.info()

In [None]:
credits.info()

In [None]:
movies.head(2)

In [None]:
credits.head(2)

### Step 2
- combine dataframes into one using merge
    - dropped title from credits and used only movie_id, inplace=True
    - merge(on=Column name)
    - merge with left_on / right_on, because the key columns have different names (id and movie_id); how is inner (the default)

In [None]:
# (Removed the leftover `movies.merge?` help lookup: it is IPython-only syntax and not part of the analysis.
#  See the pandas DataFrame.merge documentation for the parameters used by the merge cell below.)

In [None]:
# Drop the 'title' column from credits before merging (the merged frame takes its title from `movies`).
# Reassigning with errors='ignore' instead of inplace=True keeps this cell safe to re-run:
# a second run no longer raises KeyError because the column is already gone.
credits = credits.drop(columns='title', errors='ignore')

In [None]:
# Inner-join the movie metadata with its credits: movies.id is matched against credits.movie_id.
movies = pd.merge(movies, credits, how='inner', left_on='id', right_on='movie_id')

In [None]:
movies.head(1)

---
### Step 3
- feature selection
  - Null checks
  - duplicate checks
  - drop null

**Selected Feature**
* genre
* keywords
* title -> english
* overview
* cast -> name -> Actors real name -> top 3
* crew -> Director

In [None]:
# Keep only the columns used to build the recommendation tags, plus id/title for lookups.
_movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
_movies.head(3)

In [None]:
_movies.isna().sum()

In [None]:
# Drop rows with any missing value, then renumber the rows 0..n-1.
# The reset matters: later cells use a row's index label to pick a row of the
# similarity matrix (which is positional), so labels must equal row positions.
movies = _movies.dropna().reset_index(drop=True)
movies.isna().sum()

In [None]:
movies.duplicated().sum()

---
### Step 4

- data preprocessing
  - Cleanup for columns like genres, keywords, cast, crew
  - Spaces cleanup
  - Tags combined column

In [None]:
movies['genres'][0]

# At this point the cell value is still a JSON-encoded string (it is decoded with
# json.loads in the next cell), so indexing it with [0] yields its first character.
movies['genres'][0][0]

In [None]:
# Scratch check: decode one raw 'genres' value to inspect its structure.
# NOTE: the parser functions defined in later cells call json.loads and rely on this import.
import json
temp= json.loads(movies['genres'][0])

print("data at 0", temp[0], "\nname at 0", temp[0]['name'])

# Alternative noted by the author: ast.literal_eval
# import ast

In [None]:
def data_parser(data_obj, data_key='name'):
    """Extract one field from every object in a JSON-encoded list.

    Args:
        data_obj: JSON string holding a list of objects, e.g.
            '[{"id": 28, "name": "Action"}]'.
        data_key: Key to read from each object. Defaults to 'name'.

    Returns:
        list: The ``data_key`` value of each object, in order. An empty list
        is returned when ``data_obj`` cannot be decoded or an object does not
        provide ``data_key``.
    """
    try:
        return [item[data_key] for item in json.loads(data_obj)]
    except (TypeError, ValueError, KeyError) as exc:
        # json.JSONDecodeError is a ValueError. The previous version appended a
        # None sentinel here, which crashed the later `.replace(" ", "")` cleanup;
        # an empty list flows through the rest of the pipeline safely.
        print(f"Error: could not parse value ({exc})")
        return []

In [None]:
movies.head(1)

In [None]:
# Decode the JSON 'genres' strings into lists of genre names.
movies['genres'] = movies['genres'].apply(data_parser)

In [None]:
# Decode the JSON 'keywords' strings into lists of keyword names.
movies['keywords'] = movies['keywords'].apply(data_parser)


In [None]:
movies.head(1)

In [None]:
def cast_parser(data_obj, data_key='name', limit=3):
    """Extract one field from the first ``limit`` objects of a JSON-encoded list.

    Args:
        data_obj: JSON string holding a list of objects (e.g. cast entries).
        data_key: Key to read from each object. Defaults to 'name'.
        limit: Number of leading entries to keep. Defaults to 3.

    Returns:
        list: The ``data_key`` value of the first ``limit`` objects, in order.
        An empty list is returned when ``data_obj`` cannot be decoded or an
        object does not provide ``data_key``.
    """
    try:
        return [item[data_key] for item in json.loads(data_obj)[:limit]]
    except (TypeError, ValueError, KeyError) as exc:
        # json.JSONDecodeError is a ValueError. The previous version appended a
        # None sentinel here, which crashed the later `.replace(" ", "")` cleanup;
        # an empty list flows through the rest of the pipeline safely.
        print(f"Error: could not parse value ({exc})")
        return []

In [None]:
# Decode the JSON 'cast' strings, keeping the names of the first three entries (cast_parser's default limit).
movies['cast'] = movies['cast'].apply(cast_parser)

In [None]:
movies.shape

In [None]:
movies['crew'][0]

In [None]:
def director_crew_parser(data_obj, data_key='name'):
    """Return the first crew member whose job is 'Director'.

    Args:
        data_obj: JSON string holding a list of crew objects, each with a
            'job' key.
        data_key: Key to read from the matching object. Defaults to 'name'.

    Returns:
        list: A one-element list with the director's ``data_key`` value, or an
        empty list when no director is present or ``data_obj`` cannot be
        parsed.
    """
    try:
        for member in json.loads(data_obj):
            if member['job'] == 'Director':
                # Only the first director is kept, matching the original `break`.
                return [member[data_key]]
    except (TypeError, ValueError, KeyError) as exc:
        # json.JSONDecodeError is a ValueError. The previous version appended a
        # None sentinel here, which crashed the later `.replace(" ", "")` cleanup.
        print(f"Error: could not parse value ({exc})")
    return []

In [None]:
# Reduce the JSON 'crew' strings to a list holding the director's name.
movies['crew'] = movies['crew'].apply(director_crew_parser)

In [None]:
movies.isna().sum()

In [None]:
movies.head()

In [None]:
# Split each overview on whitespace so it becomes a token list like the other tag columns.
movies['overview'] = movies['overview'].apply(str.split)

In [None]:
movies.head()

In [None]:
# Collapse multi-word names into single tokens ("Sam Worthington" -> "SamWorthington").
# None entries, if any, are dropped instead of crashing on `.replace`.
def _strip_spaces(names):
    """Return `names` with spaces removed from each entry and None entries dropped."""
    return [name.replace(" ", "") for name in names if name is not None]

for _column in ('genres', 'keywords', 'cast', 'crew'):
    movies.loc[:, _column] = movies[_column].apply(_strip_spaces)


In [None]:
movies.head()

In [None]:
# Each of these columns holds a list per row, so `+` concatenates them into one token list per movie.
movies.loc[:, 'tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [None]:
movies.head()

In [None]:
# Take an explicit copy: main_df's 'tags' column is reassigned in the following cells, and
# assigning into a bare column selection of `movies` triggers pandas' SettingWithCopyWarning.
main_df = movies[['id', 'title', 'tags']].copy()

In [None]:
main_df

In [None]:
# Join each token list into one space-separated string.
# The isinstance guard keeps the cell safe to re-run: joining an already-joined
# string would otherwise insert a space between every character.
main_df['tags'] = main_df['tags'].apply(
    lambda tokens: " ".join(tokens) if isinstance(tokens, list) else tokens
)

In [None]:
main_df['tags'][0]

In [None]:
# Lowercase the tag text here; as the author notes, CountVectorizer could be left to do this instead.
main_df['tags'] = main_df['tags'].apply(str.lower)


In [None]:
main_df

### Step 5

Vectorization

* Text vectorization
    * Text similarity between tags
    * Stop words removal
    * Vocabulary capped at 5000 words for now (`max_features=5000`)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer configured with a 5000-feature cap and English stop-word removal.
cv = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
vectors = cv.fit_transform(main_df['tags'])

In [None]:
cv.get_feature_names_out()[-5:]

In [None]:
vectors.shape

In [None]:
# Import PorterStemmer from the backend selected by USE_GENSIM, then build the shared stemmer `ps`.
if not USE_GENSIM:
    from nltk.stem.porter import PorterStemmer
else:
    from gensim.parsing.porter import PorterStemmer
ps = PorterStemmer()
print("gensim" if USE_GENSIM else "nltk")

In [None]:
def stem(input:str)->str:
    """Stem every whitespace-separated token of `input` with the shared stemmer `ps`.

    Returns the stemmed tokens joined back together with single spaces.
    """
    return " ".join(map(ps.stem, input.split()))
    

In [None]:
stem(main_df['tags'][0])

In [None]:
# Gensim
# 'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, 
# but becom torn between follow order and protect an alien civilization. action adventur fantasi
# sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi 
# marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [None]:
# nltk
# 'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission,
# but becom torn between follow order and protect an alien civilization. action adventur fantasi
# sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi
# marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [None]:
main_df.loc[:, 'tags'] = main_df['tags'].apply(stem)

In [None]:
vectors = cv.fit_transform(main_df['tags'])

In [None]:
cv.get_feature_names_out()[:40]

In [None]:
vectors.shape

### Step 6

Similarity between movies to other movies
some chart like heatmap
* Create similarity matrix
* cosine similarity -> uses angles
* using indexes to get similarity matrix
* return movie data using index and main df

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
similarity = cosine_similarity(vectors)


In [None]:
similarity.shape

In [None]:
# Bare expression instead of print(): the notebook renders the frame as a table.
main_df[main_df['title'] == 'Avatar']

In [None]:
def recommend_movie(movie_name, top_n=5, movies_df=None, similarity_matrix=None):
    """Recommend the movies most similar to the one titled ``movie_name``.

    Args:
        movie_name: Title to look up; compared case-insensitively.
        top_n: Maximum number of recommendations to return. Defaults to 5.
        movies_df: DataFrame with ``id`` and ``title`` columns whose row order
            matches ``similarity_matrix``. Defaults to the notebook's ``main_df``.
        similarity_matrix: Square array in which row i holds the similarity of
            the movie at row position i to every movie. Defaults to the
            notebook's ``similarity``.

    Returns:
        dict: ``{"status": "success", "data": [...]}``. ``data`` is empty when
        no title matches; otherwise it holds up to ``top_n`` dicts with the
        keys ``movie_id``, ``movie_name``, ``movie_index`` (row position) and
        ``movie_similarity``, ordered from most to least similar.
    """
    frame = main_df if movies_df is None else movies_df
    scores_matrix = similarity if similarity_matrix is None else similarity_matrix

    # Work with row POSITIONS throughout: the similarity matrix is positional,
    # while the frame's index labels may have gaps (e.g. after dropna) or be reordered.
    is_match = frame["title"].str.lower().eq(movie_name.lower())
    match_positions = np.flatnonzero(is_match.to_numpy(dtype=bool, na_value=False))
    if match_positions.size == 0:
        return {"status": "success", "data": []}
    movie_pos = int(match_positions[0])

    # Rank every movie by similarity to the query, highest first.
    ranked = sorted(
        enumerate(scores_matrix[movie_pos]), key=lambda pair: pair[1], reverse=True
    )
    # Exclude the query movie by position rather than assuming it sorts first
    # (another movie with identical tags can tie at similarity 1.0).
    closest = [(pos, score) for pos, score in ranked if pos != movie_pos][: max(top_n, 0)]

    recommended_movies = [
        {
            "movie_id": frame["id"].iloc[pos],
            "movie_name": frame["title"].iloc[pos],
            "movie_index": pos,
            "movie_similarity": score,
        }
        for pos, score in closest
    ]

    return {"status": "success", "data": recommended_movies}

In [None]:
recommend_movie('Moon knight')

In [None]:
def recommend_movie_by_id(movie_id, top_n=5, movies_df=None, similarity_matrix=None):
    """Recommend the movies most similar to the one whose ``id`` is ``movie_id``.

    Args:
        movie_id: Value compared for equality against the ``id`` column (no
            type coercion is applied).
        top_n: Maximum number of recommendations to return. Defaults to 5.
        movies_df: DataFrame with ``id`` and ``title`` columns whose row order
            matches ``similarity_matrix``. Defaults to the notebook's ``main_df``.
        similarity_matrix: Square array in which row i holds the similarity of
            the movie at row position i to every movie. Defaults to the
            notebook's ``similarity``.

    Returns:
        dict: ``{"status": "success", "data": [...]}``. ``data`` is empty when
        no id matches; otherwise it holds up to ``top_n`` dicts with the keys
        ``movie_id``, ``movie_name``, ``movie_index`` (row position) and
        ``movie_similarity``, ordered from most to least similar.
    """
    frame = main_df if movies_df is None else movies_df
    scores_matrix = similarity if similarity_matrix is None else similarity_matrix

    # Work with row POSITIONS throughout: the similarity matrix is positional,
    # while the frame's index labels may have gaps (e.g. after dropna) or be reordered.
    is_match = frame["id"].eq(movie_id)
    match_positions = np.flatnonzero(is_match.to_numpy(dtype=bool, na_value=False))
    if match_positions.size == 0:
        return {"status": "success", "data": []}
    movie_pos = int(match_positions[0])

    # Rank every movie by similarity to the query, highest first.
    ranked = sorted(
        enumerate(scores_matrix[movie_pos]), key=lambda pair: pair[1], reverse=True
    )
    # Exclude the query movie by position rather than assuming it sorts first
    # (another movie with identical tags can tie at similarity 1.0).
    closest = [(pos, score) for pos, score in ranked if pos != movie_pos][: max(top_n, 0)]

    recommended_movies = [
        {
            "movie_id": frame["id"].iloc[pos],
            "movie_name": frame["title"].iloc[pos],
            "movie_index": pos,
            "movie_similarity": score,
        }
        for pos, score in closest
    ]

    return {"status": "success", "data": recommended_movies}

In [None]:
recommend_movie_by_id(19995)

### Step 7

- backup for later use

In [None]:
import joblib
# Reset the index BEFORE sorting so every row's index label equals its row position in
# `similarity`; a consumer can then map a movie to its similarity row through the label
# even though the saved frame is ordered by title.
# reset_index()/sort_values() return new frames, which also avoids the SettingWithCopyWarning
# raised by sorting a column selection in place.
movies_df = main_df[['id', 'title']].reset_index(drop=True).sort_values(by='title', ascending=True)
joblib.dump((movies_df, similarity), 'movies_recommendations_assets_min.joblib')


In [None]:
# Load the asset file written by the dump cell above; the previous filename
# ("movie_recommendation_assets.joblib") did not match the one that cell writes.
# NOTE: joblib.load unpickles its input — only load files from a trusted source.
df, similarity_matrix = joblib.load('movies_recommendations_assets_min.joblib')


In [None]:
_df = df[['id', 'title']]

In [None]:
# def recommend_movies():
#     movie_id = '19995'
#     main_df, similarity = joblib.load("movie_recommendation_assets.joblib")

#     # get movie data from df matching to id
#     movie = main_df[main_df["id"] == int(movie_id) ]

#     if movie.empty:
#         out = {"status": "success", "data": []}
#         return out

#     # get index to so similarity matrix can be used to get similarity based of index
#     movie_index = movie.index[0]

#     # get similarity matrix of this movie using movie_index
#     movie_similarity_matrix = similarity[movie_index]

#     # need to sort to get top highest similarity based movies and keeping the indexes at same place or maybe
#     # known
#     closest_movies_sorted = sorted(
#         enumerate(movie_similarity_matrix), reverse=True, key=lambda x: x[1]
#     )
#     similar_movies_five: list[tuple[int, np.float64]] = closest_movies_sorted[1:6]
#     # [(1213, np.float64(0.2847987184339659))] -> [(index, similarity)]

#     recommended_movies = [
#         {
#             "movie_id": main_df.loc[item[0]].id,
#             "movie_name": main_df.loc[item[0]].title,
#             "movie_index": item[0],
#             "movie_similarity": item[1],
#         }
#         for item in similar_movies_five
#     ]

#     out = {"status": "success", "data": recommended_movies}

#     return out


In [None]:
# recommend_movies()