In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Loading

In [4]:
# Load the dataset from a CSV file given by a relative path, then preview the first rows.
movies = pd.read_csv('n_movies.csv')
movies.head()

Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413


Menampilkan jumlah nilai unik pada kolom genre (kombinasi genre) yang ada pada dataset

In [5]:
# Count the distinct values of the 'genre' column; a missing value counts as one entry.
print(movies['genre'].nunique(dropna=False))

570


Menampilkan shape dari dataset. Dataset memiliki 9957 baris dan 9 kolom

In [6]:
# (rows, columns) of the loaded frame
movies.shape

(9957, 9)

In [56]:
# Per-column dtype and non-null count summary
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   title        9957 non-null   object 
 1   year         9430 non-null   object 
 2   certificate  6504 non-null   object 
 3   duration     7921 non-null   object 
 4   genre        9884 non-null   object 
 5   rating       8784 non-null   float64
 6   description  9957 non-null   object 
 7   stars        9957 non-null   object 
 8   votes        8784 non-null   object 
dtypes: float64(1), object(8)
memory usage: 700.2+ KB


# Data Preparation

Mengecek missing value

In [7]:
# Number of missing values in each column
movies.isna().sum()

title             0
year            527
certificate    3453
duration       2036
genre            73
rating         1173
description       0
stars             0
votes          1173
dtype: int64

In [9]:
# Drop every row that has a missing value in ANY column (mutates `movies` in place),
# then re-check the per-column missing counts.
# NOTE(review): the cells shown in this notebook only read 'title' and 'genre' afterwards;
# restricting the drop with subset=['title', 'genre'] would keep far more rows —
# confirm that dropping on all columns is intended.
movies.dropna(inplace=True)
movies.isna().sum()

title          0
year           0
certificate    0
duration       0
genre          0
rating         0
description    0
stars          0
votes          0
dtype: int64

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configured to filter the built-in English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')

# Replace any missing genre with an empty string so the vectorizer only receives strings.
movies['genre'] = movies['genre'].fillna('')

# Learn the vocabulary from the genre strings and build the TF-IDF matrix.
tfidf_matrix = tfidf.fit_transform(movies['genre'])

# Shape of the resulting matrix.
# NOTE(review): the feature names displayed further down include 'sci', 'film', 'game'
# and 'talk', which suggests hyphenated genres (e.g. 'Sci-Fi') are split into separate
# tokens by the default tokenizer — consider a comma-based tokenizer; TODO confirm.
tfidf_matrix.shape

(5754, 30)

In [11]:
# Dense representation of the TF-IDF matrix, shown for inspection only
tfidf_matrix.todense()

matrix([[0.65584709, 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        ...,
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ],
        [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
         0.        ]])

In [12]:
# Preview the TF-IDF weights as a table (rows labelled by title, columns by term):
# a random pick of 22 term columns, then a random pick of 10 title rows.
pd.DataFrame(
    data=tfidf_matrix.todense(),
    index=movies.title,
    columns=tfidf.get_feature_names_out(),
).sample(n=22, axis="columns").sample(n=10, axis="index")

Unnamed: 0_level_0,musical,history,mystery,game,biography,romance,talk,thriller,comedy,fantasy,...,news,music,short,sci,western,war,crime,drama,documentary,film
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Through My Father's Eyes,0.0,0.0,0.0,0.0,0.573678,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.431854,0.0
Parrot Heads,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.836215,0.0,0.0,0.0,0.0,0.0,0.0,0.548402,0.0
Chip and Potato,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.493112,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Rick and Morty,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.493112,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Lucifer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.763973,...,0.0,0.0,0.0,0.0,0.0,0.0,0.539018,0.354689,0.0,0.0
Money Heist,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.668852,0.440123,0.0,0.0
The Hater,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.880232,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.474544,0.0,0.0
Bleach,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Itaewon Class,0.0,0.0,0.0,0.0,0.0,0.884319,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.466883,0.0,0.0
Cleveland Abduction,0.0,0.0,0.0,0.0,0.788329,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.513963,0.338202,0.0,0.0


# Modelling

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the pairwise cosine similarity between the rows of the TF-IDF matrix
cosine_sim = cosine_similarity(tfidf_matrix)
cosine_sim

array([[1.        , 0.13936107, 0.26484661, ..., 0.13936107, 0.15153875,
        0.16002668],
       [0.13936107, 1.        , 0.15899709, ..., 1.        , 0.09097425,
        0.09606986],
       [0.26484661, 0.15899709, 1.        , ..., 0.15899709, 0.17289061,
        0.18257449],
       ...,
       [0.13936107, 1.        , 0.15899709, ..., 1.        , 0.09097425,
        0.09606986],
       [0.15153875, 0.09097425, 0.17289061, ..., 0.09097425, 1.        ,
        0.5713586 ],
       [0.16002668, 0.09606986, 0.18257449, ..., 0.09606986, 0.5713586 ,
        1.        ]])

In [14]:
# Label both axes of the similarity matrix with the titles so scores can be looked up by name.
cosine_sim_df = pd.DataFrame(cosine_sim, index=movies['title'], columns=movies['title'])
print('Shape:', cosine_sim_df.shape)

# Preview a random slice: 5 title columns, then 10 title rows
cosine_sim_df.sample(5, axis=1).sample(10, axis=0)

Shape: (5754, 5754)


title,The Stranded,A Jazzman's Blues,Tarung Sarung,Unbreakable Kimmy Schmidt,The Short Game
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Monty Python Live (Mostly),0.0,0.0,0.0,0.300391,0.230403
The Good Place,0.144684,0.375445,0.168054,0.452852,0.0
Muqaddar Ka Faisla,0.185673,0.481807,0.61527,0.581142,0.0
Jago: A Life Underwater,0.0,0.0,0.0,0.0,0.527242
To All the Boys: P.S. I Still Love You,0.156772,0.406812,0.182094,0.490686,0.0
Walking on Water,0.385367,1.0,0.447612,0.0,0.0
Conversations with a Killer: The Ted Bundy Tapes,0.0,0.0,0.0,0.0,0.281231
S.W.A.T.: Firefight,0.0,0.0,0.300792,0.0,0.0
The Movies That Made Us,0.0,0.0,0.0,0.0,0.247618
Thiago Ventura: Pokas,0.0,0.0,0.0,1.0,0.0


# Recommendation Result

In [15]:
def book_recommendation(nama_film, similarity_data=cosine_sim_df, items=movies[['title', 'genre']], k=10):
    """Return up to ``k`` titles that are most similar to ``nama_film``.

    Despite its name, the function works on whatever catalogue is passed in
    ``items``; by default that is the title/genre view of ``movies``.

    Parameters
    ----------
    nama_film : str
        Title to find recommendations for. It must be a column label of
        ``similarity_data``.
    similarity_data : pandas.DataFrame
        Square similarity matrix whose rows and columns are labelled by title.
    items : pandas.DataFrame
        Catalogue with a ``'title'`` column plus the descriptive columns that
        should be attached to each recommendation.
    k : int
        Maximum number of recommendations to return. Values below 1 yield an
        empty result.

    Returns
    -------
    pandas.DataFrame
        One row per recommended title, ordered from most to least similar,
        carrying the columns of ``items``. Fewer than ``k`` rows are returned
        only when there are not enough distinct titles available.

    Raises
    ------
    KeyError
        If ``nama_film`` is not a column of ``similarity_data``.
    """
    # .loc raises KeyError for an unknown title, exactly like the previous version.
    scores = similarity_data.loc[:, nama_film]

    # A title that occurs more than once yields a 2-D selection; keep the first
    # matching column so the ranking below always works on a 1-D Series.
    if isinstance(scores, pd.DataFrame):
        scores = scores.iloc[:, 0]

    # Never recommend the queried title itself (this also removes its duplicates).
    candidates = scores[scores.index != nama_film]

    # Rank every candidate. The earlier argpartition call only guaranteed the
    # order of the top k-1 positions while k+1 positions were read, so the last
    # picks could be arbitrary; a full stable sort is exact and deterministic.
    ranked = candidates.sort_values(ascending=False, kind='stable')

    # Keep each title once (its best score) BEFORE truncating, so duplicated
    # titles cannot shrink the result below k.
    ranked = ranked[~ranked.index.duplicated(keep='first')]
    top_titles = ranked.index[:max(int(k), 0)].to_numpy()

    # One catalogue row per title, so the merge cannot multiply rows.
    catalogue = items.drop_duplicates(subset='title', keep='first')

    # Inner merge keeps the left (ranked) order and attaches the item columns.
    return pd.DataFrame({'title': top_titles}).merge(catalogue, on='title', how='inner')

In [16]:
# Example: recommendations for the title 'Danur' using the default arguments
book_recommendation('Danur')

Unnamed: 0,title,genre
0,The Revenge of Robert the Doll,Horror
1,The Human Centipede 2 (Full Sequence),Horror
2,Robert Reborn,Horror
3,The Exorcism of Anna Ecklund,Horror
4,Rape Zombie: Lust of the Dead,Horror
5,Verónica,Horror
6,FirstBorn,Horror
7,23:59,Horror
8,Beast of Morocco,Horror
9,Pentagram,Horror
