In [None]:
# Download the TMDB movies dataset archive with the Kaggle CLI.
# NOTE(review): presumably requires Kaggle API credentials to be configured — confirm.
!kaggle datasets download -d asaniczka/tmdb-movies-dataset-2023-930k-movies

Dataset URL: https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies
License(s): ODC Attribution License (ODC-By)
Downloading tmdb-movies-dataset-2023-930k-movies.zip to /content
 99% 196M/198M [00:11<00:00, 22.3MB/s]
100% 198M/198M [00:11<00:00, 18.3MB/s]


In [None]:
!unzip *.zip

Archive:  tmdb-movies-dataset-2023-930k-movies.zip
  inflating: TMDB_movie_dataset_v11.csv  


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MultiLabelBinarizer, OneHotEncoder
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import random

In [None]:
# Load the extracted TMDB CSV and preview five random rows.
df = pd.read_csv('TMDB_movie_dataset_v11.csv')
df.sample(5)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,original_title,overview,popularity,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords
180393,188591,The Man Who Loves Women,9.5,2,Released,1988-01-01,0,94,True,,...,L'Homme qui était fou des femmes,"Serge Bresson, the famous erotic photographer ...",2.62,/tSWIrT0e112zF9l5xJJdfASW1RZ.jpg,,,Marc Dorcel,France,French,
84226,143573,Send Me to the 'Lectric Chair,6.2,9,Released,2009-01-29,0,7,False,,...,Send Me to the 'Lectric Chair,A woman sent is sent to an electric chair that...,0.959,/3wU5mntoskABZPTv3q8b9uObRtx.jpg,,,,Canada,English,
938004,768617,Dzieje grzechu,0.0,0,Released,1911-08-26,0,0,False,,...,Dzieje grzechu,,0.6,,,Drama,,Poland,,
372826,927756,Ballada o królu Piecuchu,0.0,0,Released,1977-01-01,0,7,False,,...,Ballada o królu Piecuchu,Short animated film from Poland,0.6,/geA6bpavKRr323rElhs6u1cdcTY.jpg,,"Animation, TV Movie",,,Polish,
1029394,621391,"Yes, Death",0.0,0,Released,2004-10-04,0,26,False,/lG7A8Gt6kbO8uurug55nxTT1YkU.jpg,...,"Да, смерть",Short movie shows us a life in the Moscow Head...,0.841,/jmtaoprZsPWAhOaDbGrlgeQ6IX0.jpg,,Documentary,VKSR,Russia,Russian,


In [None]:
# Columns that are not used as features further down; each name is listed
# exactly once ('spoken_languages' used to appear twice).
to_drop = ['id', 'status', 'original_title', 'homepage',
           'poster_path', 'tagline', 'vote_count',
           'release_date', 'revenue', 'backdrop_path', 'budget',
           'imdb_id', 'keywords',
           'spoken_languages', 'overview',
           'production_companies', 'production_countries']
df = df.drop(columns=to_drop)

In [None]:
# Preview five random rows of the reduced frame.
df.sample(5)

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,genres
197211,The Castellans,7.5,85,False,ro,0.736,Comedy
531712,Motu Patlu: The Secret Mission Of Motu Patlu,0.0,42,False,en,0.0,
128214,Jam Films 2,5.2,115,False,ja,1.013,
982266,Söderkåkar,0.0,0,False,sv,0.84,Comedy
334342,Heist,5.0,92,False,en,0.6,"Action, Crime"


In [None]:
# Inspect the distinct raw genre values (comma-separated genre strings).
df.genres.unique()

array(['Action, Science Fiction, Adventure',
       'Adventure, Drama, Science Fiction',
       'Drama, Action, Crime, Thriller', ...,
       'Action, Adventure, History, Romance', 'Animation, Crime, History',
       'Mystery, Fantasy, Music'], dtype=object)

In [None]:
# List the distinct original-language codes present in the data.
df.original_language.unique()

array(['en', 'ko', 'fr', 'ja', 'it', 'es', 'pl', 'pt', 'hi', 'tr', 'da',
       'de', 'cn', 'id', 'zh', 'sv', 'el', 'ru', 'sr', 'fa', 'th', 'ar',
       'no', 'nb', 'fi', 'te', 'la', 'nl', 'hu', 'he', 'is', 'ro', 'gl',
       'uk', 'eu', 'et', 'bs', 'bn', 'xx', 'sh', 'km', 'cs', 'tn', 'ml',
       'mk', 'ga', 'hy', 'ku', 'ka', 'ta', 'kn', 'tl', 'vi', 'ca', 'dz',
       'sw', 'wo', 'kk', 'sk', 'lv', 'mi', 'bo', 'ps', 'mn', 'lt', 'ur',
       'sl', 'sc', 'af', 'hr', 'iu', 'se', 'ms', 'bm', 'mr', 'bg', 'am',
       'lo', 'cy', 'xh', 'yi', 'qu', 'yo', 'sq', 'pa', 'eo', 'gu', 'zu',
       'st', 'ne', 'ak', 'mt', 'rw', 'as', 'ln', 'ay', 'si', 'mo', 'so',
       'ff', 'ky', 'ik', 'az', 'ab', 'kl', 'jv', 'fo', 'li', 'sn', 'tg',
       'su', 'ks', 'my', 'lb', 'ht', 'rm', 'sm', 'ha', 'sa', 'tk', 'fy',
       'gd', 'om', 'ny', 'or', 'bi', 'be', 'uz', 'cr', 'mg', 'mh', 'nn',
       'co', 'ia', 'tw', 'gn', 'ig', 'nv', 'ug', 'os', 'tt', 'ie', 'kw',
       'ba', 'nd', 'ty', 'sg', 'oc', 'dv', 'sd', 'c

In [None]:
# (rows, columns) of the current frame.
df.shape

(1101480, 7)

In [None]:
# Print the number of missing values found in every column.
for column, nan in df.isna().sum().items():
  print(f"{column}: {nan}")

title: 13
vote_average: 0
runtime: 0
adult: 0
original_language: 0
popularity: 0
genres: 435024


In [None]:
# Rows without a title cannot be looked up by name later, so remove them.
# Reassigning (instead of inplace=True) keeps the cell's data lineage explicit.
df = df.dropna(subset=['title'])

In [None]:
# Only `genres` is expected to contain missing values at this point; fill just
# that column so a stray NaN in a numeric column is never silently replaced by
# the string 'Unknown'.
df['genres'] = df['genres'].fillna('Unknown')

In [None]:
# Re-run the missing-value report to verify the cleaning steps.
missing_per_column = df.isna().sum()
for column in df.columns:
  nan = missing_per_column[column]
  print(f"{column}: {nan}")

title: 0
vote_average: 0
runtime: 0
adult: 0
original_language: 0
popularity: 0
genres: 0


In [None]:
# Encode the adult flag as 1 (equal to True) / 0 (anything else).
df['adult'] = df['adult'].eq(True).astype(int)

In [None]:
# Preview five random rows after encoding the adult flag.
df.sample(5)

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,genres
885736,Absolute Strangers,0.0,96,0,en,1.207,"Drama, TV Movie"
464526,Dixie Lynn in New Cuckquean,0.0,51,1,en,0.6,Unknown
849707,Muda Brasil,0.0,0,0,pt,0.6,Documentary
367254,Star Trek: The Next Generation - All Good Thi...,0.0,92,0,en,0.6,Science Fiction
1032957,Arden los juegos,0.0,12,0,es,0.626,"Science Fiction, Drama"


In [None]:
# How many movies have a vote_average of exactly 0.0.
(df['vote_average'] == 0.0).sum()

752415

In [None]:
# Count rows whose title repeats the title of an earlier row.
df.title.duplicated().sum()

154943

In [None]:
# Remove rows whose title repeats an earlier row, keeping the first occurrence.
# Calling drop_duplicates(inplace=True) on `df.title` only modified a temporary
# Series and left `df` untouched, so the rows themselves are dropped here.
# NOTE(review): movies that legitimately share a title (e.g. remakes) are
# collapsed to one entry — confirm this is acceptable for title-based lookup.
df = df.drop_duplicates(subset='title', keep='first')

In [None]:
# Count rows that are exact duplicates (all columns equal) of an earlier row.
df.duplicated().sum()

3451

In [None]:
# Re-count rows whose title repeats the title of an earlier row.
df.title.duplicated().sum()

154943

In [None]:
# Remove rows that are exact duplicates across every column.
# Reassigning (instead of inplace=True) keeps the cell's data lineage explicit.
df = df.drop_duplicates()

In [None]:
# Verify how many fully duplicated rows remain.
df.duplicated().sum()

0

In [None]:
# Standardise the continuous features (zero mean, unit variance) so that no
# single numeric column dominates the similarity computation.
to_scale = ['vote_average', 'runtime', 'popularity']
Sscaler = StandardScaler().fit(df[to_scale])
df[to_scale] = Sscaler.transform(df[to_scale])

In [None]:
# Preview five random rows with the standardised numeric columns.
df.sample(5)

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,genres
401239,"Romantické hrady, Rýn a dívka jménem Lorelei",-0.635977,-0.783303,0,cs,-0.085522,Documentary
552076,Ordet / Le Mot / The Word,-0.635977,0.68758,0,en,-0.013479,Unknown
514537,Vögelfrei,-0.635977,-0.346887,1,de,-0.163266,Unknown
506924,The Koi Video,-0.635977,-0.783303,0,en,-0.163266,Unknown
886817,Intellectual Scum,-0.635977,-0.540849,0,en,-0.085522,Drama


In [None]:
# Distribution of the binary adult flag.
df.adult.value_counts()

Unnamed: 0_level_0,count
adult,Unnamed: 1_level_1
0,996760
1,101256


In [None]:
# Turn the comma-separated genre string into a list of genre names.
df['genres'] = df['genres'].str.split(',')
# Trim surrounding whitespace and Title-Case every title so lookups by name
# can be normalised the same way.
df['title'] = df['title'].str.strip().str.title()

In [None]:
def strip_lower(lst):
  """Return a new list with every label trimmed, lower-cased and stripped of spaces.

  Args:
    lst: iterable of strings (e.g. genre names such as ' Science Fiction').

  Returns:
    list of normalised strings (e.g. 'sciencefiction'), in the input order.
  """
  return [label.strip().lower().replace(' ', '') for label in lst]

In [None]:
# Normalise every genre list element-wise with strip_lower.
df['genres'] = df['genres'].map(strip_lower)

In [None]:
# Spot-check five normalised genre lists.
df.genres.sample(5)

Unnamed: 0,genres
118558,[unknown]
194099,[unknown]
22993,[drama]
999507,[unknown]
176354,[comedy]


In [None]:
# Multi-hot encode the genre lists: one 0/1 column per distinct genre.
mlb = MultiLabelBinarizer()
binary_matrix = mlb.fit(df['genres']).transform(df['genres'])

In [None]:
# Wrap the multi-hot matrix in a DataFrame named after the genre classes;
# reset_index() adds a positional 'index' column next to the genre columns.
binary_df_genre = pd.DataFrame(binary_matrix, columns=mlb.classes_).reset_index()
binary_df_genre.sample(5)

Unnamed: 0,index,action,adventure,animation,comedy,crime,documentary,drama,family,fantasy,...,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,unknown,war,western
752910,752910,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
977689,977689,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
503067,503067,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
560047,560047,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
291293,291293,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Renumber rows 0..n-1; the previous row labels are kept in a new 'index' column.
df_all = df.reset_index()

In [None]:
# Place the genre indicator columns next to the movie columns, then discard
# every helper column named 'index'.
df_all = pd.concat([df_all, binary_df_genre], axis=1).drop(columns='index')

In [None]:
# Preview five random rows of the combined frame.
df_all.sample(5)

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,genres,action,adventure,animation,...,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,unknown,war,western
388378,Raul Vaiksoo: Pätt Või Pühak,-0.635977,0.18651,0,et,-0.085522,[documentary],0,0,0,...,0,0,0,0,0,0,0,0,0,0
800982,The Boys Of Mardi Gras '84,-0.635977,0.509781,1,en,-0.085522,[unknown],0,0,0,...,0,0,0,0,0,0,0,1,0,0
600321,Honey To The Moon,-0.635977,-0.637831,0,es,-0.085522,[unknown],0,0,0,...,0,0,0,0,0,0,0,1,0,0
292078,Puddin' Head,1.00573,0.509781,0,en,0.040682,"[comedy, music]",0,0,0,...,0,1,0,0,0,0,0,0,0,0
287005,Pepe Conde,1.990754,1.285631,0,es,-0.085522,[comedy],0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# The list-valued `genres` column is no longer needed once its indicator
# columns exist. Reassigning (instead of inplace=True) keeps lineage explicit.
df_all = df_all.drop(columns=['genres'])

In [None]:
# Preview five random rows of df_all.
df_all.sample(5)

Unnamed: 0,title,vote_average,runtime,adult,original_language,popularity,action,adventure,animation,comedy,...,horror,music,mystery,romance,sciencefiction,thriller,tvmovie,unknown,war,western
535846,За Нашим Домом Сад,-0.635977,-0.783303,0,en,-0.163266,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
320299,Texas To Bataan,0.677389,0.121856,0,en,-0.074767,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1067308,The Lost Land,-0.635977,0.283491,0,fa,-0.085522,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
441105,Faceless,-0.635977,-0.411541,0,fr,0.06258,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
123624,Shala,1.826583,1.495757,0,mr,0.017618,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Dense one-hot encoder instance.
# NOTE(review): `ohe` is not referenced by any other cell visible here —
# confirm whether it is still needed.
ohe = OneHotEncoder(sparse_output=False)

In [None]:
# One-hot encode the original language: one integer 0/1 column per language code.
df_lang = pd.get_dummies(df_all['original_language'], dtype=int)

In [None]:
# Swap the raw language column for its one-hot indicator columns.
df_final = pd.concat(
    [df_all.drop(columns=['original_language']), df_lang],
    axis='columns',
)

In [None]:
# Preview five random rows of the fully encoded frame.
df_final.sample(5)

Unnamed: 0,title,vote_average,runtime,adult,popularity,action,adventure,animation,comedy,crime,...,uz,vi,wo,xh,xx,yi,yo,za,zh,zu
586706,The Flourishing,-0.635977,-0.783303,0,-0.085522,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
261720,Joker Review,2.647437,-0.686321,0,-0.085522,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
277395,Replica,2.647437,-0.605504,0,-0.043929,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
815190,"Christmas Snows, Christmas Winds",-0.635977,-0.330723,0,-0.085522,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
935700,Headshot,-0.635977,0.18651,0,-0.085522,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Feature matrix: everything except the title column.
df_f = df_final.drop(columns='title')

In [None]:
# Preview five random rows of the feature matrix.
df_f.sample(5)

Unnamed: 0,vote_average,runtime,adult,popularity,action,adventure,animation,comedy,crime,documentary,...,uz,vi,wo,xh,xx,yi,yo,za,zh,zu
347319,1.990754,0.364309,0,-0.025011,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994991,-0.635977,0.671417,0,-0.085522,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
658087,-0.635977,0.525945,0,-0.163266,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1036632,-0.635977,0.639089,0,-0.044836,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
646724,-0.635977,-0.702485,0,-0.085522,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
#df_s = np.array_split(df_final, 52)

In [None]:
# Number of leading rows kept for the similarity computation. The pairwise
# similarity matrix has one row and one column per movie, so its size grows
# quadratically with this value.
N_SUBSET_ROWS = 10_000
df_s = df_final.iloc[:N_SUBSET_ROWS]

In [None]:
# Preview five random rows of the working subset.
df_s.sample(5)

Unnamed: 0,title,vote_average,runtime,adult,popularity,action,adventure,animation,comedy,crime,...,uz,vi,wo,xh,xx,yi,yo,za,zh,zu
2277,The Other Boleyn Girl,1.56391,1.075505,0,2.491938,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3242,Charlotte'S Web,1.398098,0.784561,0,3.249939,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4079,Invasion Of The Body Snatchers,1.818375,0.509781,0,1.632224,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5983,Third Person,1.29861,1.431103,0,1.823602,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4141,The Happytime Murders,1.247061,0.68758,0,1.615379,1,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0


**This would compute the similarity across all of the more than 1 million movies in the dataframe (left commented out)**

In [None]:
# similarity_matrix = cosine_similarity(df_f)
# similarity_df = pd.DataFrame(similarity_matrix, index=df_final['title'], columns=df_final['title'])

**This uses only a small part of our dataframe: the first 10,000 rows (`df_s`)**

In [None]:
# Numeric features of the subset (title removed) and their pairwise cosine
# similarity: entry [i, j] compares movie i with movie j.
df_f = df_s.drop(columns='title')
similarity_matrix = cosine_similarity(df_f)

In [None]:
# Label both axes of the similarity matrix with the movie titles so scores can
# be looked up by name.
# NOTE(review): if df_s contains repeated titles, the row/column labels are
# not unique and `similarity_df[title]` returns a DataFrame — verify.
similarity_df = pd.DataFrame(similarity_matrix, index=df_s['title'], columns=df_s['title'])

In [None]:
# Persist the title-by-title similarity table as CSV (one row and one column
# per movie in the subset, so the file can be large).
similarity_df.to_csv('Movies_similarity.csv')

In [None]:
def suggest_movie(name, sugg_number=3, similarity=None, pool_size=10):
  """Suggest movies similar to `name`.

  The `pool_size` most similar titles (excluding the queried movie itself) are
  collected, and `sugg_number` of them are drawn at random.

  Args:
    name: movie title; surrounding whitespace is trimmed and the title is
      Title-Cased before the lookup.
    sugg_number: number of suggestions to return. Clamped to the number of
      available candidates.
    similarity: square DataFrame of similarity scores labelled by title on
      both axes. Defaults to the notebook-level `similarity_df`.
    pool_size: how many top-ranked titles to sample from.

  Returns:
    list of suggested titles (possibly shorter than `sugg_number`).

  Raises:
    KeyError: if the title is not a column of the similarity table.
  """
  table = similarity_df if similarity is None else similarity
  name = name.strip().title()
  if name not in table.columns:
    raise KeyError(f"Movie {name!r} not found in the similarity table")

  scores = table[name]
  # Duplicate titles make the lookup return several columns; use the first
  # one instead of failing inside sort_values.
  if getattr(scores, 'ndim', 1) > 1:
    scores = scores.iloc[:, 0]

  # Walk the titles from most to least similar, skipping the queried movie
  # and repeated labels, until the candidate pool is full.
  candidates = []
  for title in scores.sort_values(ascending=False).index:
    if len(candidates) >= pool_size:
      break
    if title != name and title not in candidates:
      candidates.append(title)

  # random.sample raises if asked for more items than available.
  how_many = max(0, min(sugg_number, len(candidates)))
  return random.sample(candidates, how_many)

In [None]:
# Example lookup; suggest_movie Title-Cases the name before searching.
print(suggest_movie('Invasion of The Body Snatchers'))

['Inception', 'Interstellar', 'The Dark Knight']
