## 1. Set Up <a class="anchor"  id="h1"></a>

In [50]:
import numpy as np # linear algebra
import csv
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import time
import warnings
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

/kaggle/input/tmdb-movies-daily-updates/TMDB_all_movies.csv


In [51]:
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)  # Add this line to suppress FutureWarnings

In [52]:
# Show every column when a DataFrame is rendered (no horizontal truncation).
pd.options.display.max_columns = None
# pd.options.display.max_rows = None  # left disabled: would lift the row limit as well

# Render floats with a thousands separator and a single decimal place.
pd.set_option('display.float_format', '{:,.1f}'.format)

# Make every top-level expression in a cell display its value, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [59]:
# Main dataset: the full TMDB movie dump (df.info() below reports about 1.09 million rows).
tmdb_csv_path = "/kaggle/input/tmdb-movies-daily-updates/TMDB_all_movies.csv"
df = pd.read_csv(tmdb_csv_path)

In [60]:
# Overview of the loaded frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1091783 entries, 0 to 1091782
Data columns (total 28 columns):
 #   Column                   Non-Null Count    Dtype  
---  ------                   --------------    -----  
 0   id                       1091783 non-null  int64  
 1   title                    1091770 non-null  object 
 2   vote_average             1091781 non-null  float64
 3   vote_count               1091781 non-null  float64
 4   status                   1091781 non-null  object 
 5   release_date             966507 non-null   object 
 6   revenue                  1091781 non-null  float64
 7   runtime                  1091781 non-null  float64
 8   budget                   1091781 non-null  float64
 9   imdb_id                  617171 non-null   object 
 10  original_language        1091781 non-null  object 
 11  original_title           1091771 non-null  object 
 12  overview                 895324 non-null   object 
 13  popularity               1091781 non-null 

## 2. Data preprocessing <a class="anchor"  id="hdata"></a>

In [61]:
# Normalise column types before deriving new fields.
for money_col in ('revenue', 'budget'):
    df[money_col] = df[money_col].astype(float)
# df['vote_count'] = df['vote_count'].fillna(0).astype(int)

# Unparseable dates become NaT (errors='coerce') instead of raising.
release_dates = pd.to_datetime(df['release_date'], errors='coerce')
df['release_date'] = release_dates
# The nullable 'Int64' dtype keeps the year integral while still allowing missing values.
df['year'] = release_dates.dt.year.astype('Int64')

In [62]:
# Peek at the first and the last row; both render because ast_node_interactivity is set to "all".
df.head(1)
df.tail(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path,year
0,2,Ariel,7.1,346.0,Released,1988-10-21,0.0,73.0,0.0,tt0094675,fi,Ariel,A Finnish man goes to the city to find a job a...,1.8,,"Comedy, Drama, Romance, Crime",Villealfa Filmproductions,Finland,suomi,"Merja Pulkkinen, Eetu Hilkamo, Turo Pajala, Es...",Aki Kaurismäki,Timo Salminen,Aki Kaurismäki,Aki Kaurismäki,,7.4,9159.0,/ojDg0PGvs6R9xYFodRct2kdI6wC.jpg,1988


Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path,year
1091782,5180730,,,,,NaT,,,,,,,,,,,,,,"Hailee Steinfeld, Tim Martin Gleason, Holland ...",Travis Knight,Enrique Chediak,"Christina Hodson, Shoji Kawamori","Chris Brigham, Stephen Davis, Don Murphy, Bria...",Dario Marianelli,,,,


In [63]:
# Profit is revenue minus budget; ROI expresses that profit as a percentage of the budget.
# Rows with a zero budget yield inf (non-zero profit) or NaN (zero profit) in 'roi'.
df['profit'] = df['revenue'].sub(df['budget'])
df['roi'] = df['profit'].div(df['budget']).mul(100)

In [64]:
def _imdb_title_url(imdb_id):
    """Return the IMDb title page URL for an id, or NaN when the id is missing or the literal 'nan'."""
    if pd.isnull(imdb_id) or imdb_id == 'nan':
        return np.nan
    return f"https://www.imdb.com/title/{imdb_id}/"


# Build a clickable IMDb link for every movie that has an IMDb id.
df['imdb_url'] = df['imdb_id'].map(_imdb_title_url)

In [65]:
# Count missing values per column to see which fields are sparsely populated.
df.isna().sum()

id                               0
title                           13
vote_average                     2
vote_count                       2
status                           2
release_date                125276
revenue                          2
runtime                          2
budget                           2
imdb_id                     474612
original_language                2
original_title                  12
overview                    196459
popularity                       2
tagline                     926127
genres                      312441
production_companies        583696
production_countries        432245
spoken_languages            417062
cast                        359177
director                    194740
director_of_photography     822123
writers                     557038
producers                   737700
music_composer              980960
imdb_rating                 645415
imdb_votes                  645415
poster_path                 302669
year                

In [66]:
# Keep only rows without a missing value in ANY column (derived columns such as
# 'roi', 'year' and 'imdb_url' included), then draw a reproducible 1,000-row sample.
complete_rows = df.dropna()
df_sample = complete_rows.sample(n=1000, random_state=42)

In [67]:
# Sanity check: after dropna() every column of the sample should report zero missing values.
df_sample.isna().sum()

id                         0
title                      0
vote_average               0
vote_count                 0
status                     0
release_date               0
revenue                    0
runtime                    0
budget                     0
imdb_id                    0
original_language          0
original_title             0
overview                   0
popularity                 0
tagline                    0
genres                     0
production_companies       0
production_countries       0
spoken_languages           0
cast                       0
director                   0
director_of_photography    0
writers                    0
producers                  0
music_composer             0
imdb_rating                0
imdb_votes                 0
poster_path                0
year                       0
profit                     0
roi                        0
imdb_url                   0
dtype: int64

## 3. Extract Poster Path <a class="anchor"  id="extractposter"></a>

### Set up Logger

In [68]:
import logging
import sys

class JupyterLoggerHandler(logging.StreamHandler):
    """Stream handler bound to ``sys.stdout`` so log lines appear in the notebook cell output."""

    def __init__(self):
        # Bind explicitly to stdout (StreamHandler defaults to stderr).
        logging.StreamHandler.__init__(self, sys.stdout)

    def emit(self, record):
        """Format ``record``, write it followed by a newline, and flush immediately."""
        try:
            self.stream.write(self.format(record) + "\n")
            self.flush()
        except Exception:
            # Delegate to logging's standard error reporting rather than raising from a log call.
            self.handleError(record)

# Root logger: emit INFO and above through the notebook-friendly handler.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# Detach any handlers that were attached previously so messages are not duplicated.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)

# Timestamp - level - message layout for every log line.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = JupyterLoggerHandler()
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
logger.addHandler(handler)

# Smoke test: this line should show up in the cell output.
logging.info("Test logging message")

2025-05-06 20:06:44,401 - INFO - Test logging message


In [69]:
def get_poster_paths(df):
    """Return the rows that have a poster, with 'poster_path' expanded to a full image URL.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'poster_path' column holding TMDB-relative paths
        such as '/abc.jpg' (or missing values).

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is never modified) without the rows whose
        'poster_path' is missing. The 'poster_path' column itself is
        overwritten with the absolute URL; no extra column is added.
        Values that already start with http:// or https:// are left as they
        are, so calling the function twice does not prefix the base URL twice.
    """
    base_url = "https://www.themoviedb.org/t/p/w600_and_h900_bestv2"

    # Explicit copy: assigning a column on the raw result of dropna() can
    # trigger pandas' SettingWithCopyWarning.
    with_poster = df.dropna(subset=['poster_path']).copy()

    paths = with_poster['poster_path'].astype(str)
    is_absolute = paths.str.match(r'https?://')
    with_poster['poster_path'] = paths.where(is_absolute, base_url + paths)

    return with_poster

In [70]:
# Build the poster sample: rows that have a poster, with poster_path expanded to a full image URL.
df_poster_sample = get_poster_paths(df_sample)

In [71]:
# Replace the 'id' column with a sequential 0..n-1 id.
# NOTE(review): this discards the original TMDB identifier — confirm that is intended.
df_poster_sample['id'] = range(len(df_poster_sample))

In [75]:
import numpy as np

# Sanitise the financial columns in one vectorized pass:
#   1. +/-inf (produced by dividing by a zero budget) becomes NaN,
#   2. negative values are clipped up to 0,
#   3. remaining NaN values become 0.
# The result is always float, whatever the mix of values in the sample.
financial_cols = ['profit', 'roi']
df_poster_sample[financial_cols] = (
    df_poster_sample[financial_cols]
    .replace([np.inf, -np.inf], np.nan)
    .clip(lower=0)
    .fillna(0)
)

In [76]:
# Persist the prepared sample to CSV without writing the DataFrame index.
df_poster_sample.to_csv("testSample.csv",index=False)

In [77]:
# Restore pandas' default column width for rendered frames.
# (To show full cell contents instead: pd.set_option('display.max_colwidth', None))
pd.reset_option('display.max_colwidth')

In [78]:
# Display the prepared sample (pandas elides the middle rows of a long frame).
df_poster_sample

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,budget,imdb_id,original_language,original_title,overview,popularity,tagline,genres,production_companies,production_countries,spoken_languages,cast,director,director_of_photography,writers,producers,music_composer,imdb_rating,imdb_votes,poster_path,year,profit,roi,imdb_url
453576,0,Trauma Center,5.5,387.0,Released,2019-12-06,92968.0,87.0,0.0,tt9625664,en,Trauma Center,Lt. Wakes is a vengeful police detective deter...,1.4,The enemy is closer than you think.,"Action, Thriller","Lionsgate, Grindstone Entertainment Group, EFO...","Puerto Rico, United States of America","English, Español","Kathryn Wrenn Woods, Leslee Emmett, Carroll 'P...",Matt Eskandari,Bryan Koss,Paul J. Da Silva,"Stan Wertlieb, Alex Eckert, George Furla, Barr...",Nima Fakhrara,4.0,12165.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,2019,92968.0,0.0,https://www.imdb.com/title/tt9625664/
78646,1,Dawn of the Planet of the Apes,7.3,11649.0,Released,2014-07-08,710644566.0,130.0,170000000.0,tt2103281,en,Dawn of the Planet of the Apes,A group of scientists in San Francisco struggl...,10.2,One last chance for peace.,"Science Fiction, Action, Drama, Thriller","TSG Entertainment, Chernin Entertainment, Inge...","United Kingdom, United States of America",English,"Keir O'Donnell, Doc Shaw, Michael Papajohn, Me...",Matt Reeves,Michael Seresin,"Rick Jaffa, Amanda Silver, Mark Bomback","Amanda Silver, Dylan Clark, Peter Chernin, Tho...",Michael Giacchino,7.6,499687.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,2014,540644566.0,318.0,https://www.imdb.com/title/tt2103281/
4855,2,Gone in Sixty Seconds,6.4,4540.0,Released,2000-06-09,237200000.0,118.0,90000000.0,tt0187078,en,Gone in Sixty Seconds,Upon learning that he has to come out of retir...,10.0,"Ice Cold, Hot Wired.","Action, Crime, Thriller","Touchstone Pictures, Jerry Bruckheimer Films",United States of America,English,"Grace Zabriskie, Douglas Bennett, Cosimo Fusco...",Dominic Sena,Paul Cameron,"H.B. Halicki, Scott Rosenberg","Barry H. Waldman, Chad Oman, Mike Stenson, Rob...",Trevor Rabin,6.5,302778.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,2000,147200000.0,163.6,https://www.imdb.com/title/tt0187078/
514066,3,Malcolm & Marie,7.0,1391.0,Released,2021-01-29,0.0,106.0,2500000.0,tt12676326,en,Malcolm & Marie,As a filmmaker and his girlfriend return home ...,15.3,Madly in love.,"Drama, Romance",Little Lamb Productions,United States of America,English,"Zendaya, John David Washington",Sam Levinson,Marcell Rév,Sam Levinson,"Yariv Milchan, Zendaya, Kevin Turen, Ashley Le...",Labrinth,6.6,42464.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,2021,0.0,0.0,https://www.imdb.com/title/tt12676326/
26069,4,Norma Rae,7.2,179.0,Released,1979-03-02,22228000.0,110.0,4500000.0,tt0079638,en,Norma Rae,Norma Rae is a southern textile worker employe...,1.2,The story of a woman with the courage to risk ...,Drama,20th Century Fox,United States of America,English,"Bill Pannell, Barbara Baxley, Lee de Broux, Jo...",Martin Ritt,John A. Alonzo,"Harriet Frank Jr., Irving Ravetch, Norman Gimbel","Alexandra Rose, Tamara Asseyev",David Shire,7.3,13414.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,1979,17728000.0,394.0,https://www.imdb.com/title/tt0079638/
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13389,995,Come September,6.8,73.0,Released,1961-01-01,14170000.0,112.0,0.0,tt0054759,en,Come September,"Robert Talbot, an American millionaire, arrive...",8.3,A Quiet hideaway... A Secret rendezvous... the...,"Comedy, Romance","Universal International Pictures, Raoul Walsh ...",United States of America,"English, Italiano","John Stacy, Joan Freeman, Liliana Celli, Giaco...",Robert Mulligan,William H. Daniels,"Maurice Richlin, Stanley Roberts, Robert Russe...","Robert Arthur, Raoul Walsh",Hans J. Salter,6.9,4235.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,1961,14170000.0,0.0,https://www.imdb.com/title/tt0054759/
338784,996,Manikarnika: The Queen of Jhansi,6.6,51.0,Released,2019-01-25,0.0,148.0,17500000.0,tt6903440,hi,iमणिकर्णिका,"Story of Rani Lakshmibai, one of the leading f...",0.8,The Queen of Jhansi,"History, Drama, Action","Kairos Kontent Studios, Zee Studios",India,हिन्दी,"Kangana Ranaut, Edward Sonnenblick, Nalneesh N...",Kangana Ranaut,"Gnana Shekar V. S., Kiran Deohans, Sachin K. K...","Prasoon Joshi, Vijayendra Prasad","Roshan Singh, Ajay R Yadav, Ravindar Thakur, K...","Ehsaan Noorani, Loy Mendonsa, Ankit Balhara, S...",6.4,16512.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,2019,0.0,0.0,https://www.imdb.com/title/tt6903440/
11452,997,Ernest Goes to Camp,5.2,159.0,Released,1987-05-22,23509382.0,92.0,3000000.0,tt0092974,en,Ernest Goes to Camp,"Ernest, a lovable loser who works as a summer ...",7.6,"Welcome to Kamp Kikakee, the craziest summer c...","Comedy, Family","Silver Screen Partners II, Touchstone Pictures...",United States of America,English,"Eddy Schumacher, Brenda Haynes, Mike Hutchinso...",John Cherry,"Harry Mathias, Jim May","John Cherry, Coke Sams","Elmo Williams, Stacy Williams, Martin Erlichman",Shane Keister,5.5,12892.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,1987,20509382.0,683.6,https://www.imdb.com/title/tt0092974/
50644,998,Unconquered,6.2,54.0,Released,1947-10-10,0.0,147.0,5000000.0,tt0039931,en,Unconquered,"England, 1763. After being convicted of a crim...",9.4,I bought this woman for my own… and I'll kill ...,"Drama, Adventure, History",Paramount Pictures,United States of America,English,"Porter Hall, Raymond Hatton, Iron Eyes Cody, V...",Cecil B. DeMille,Ray Rennahan,"Fredric M. Frank, Charles Bennett, Jesse Lasky...",Cecil B. DeMille,Victor Young,6.9,3113.0,https://www.themoviedb.org/t/p/w600_and_h900_b...,1947,0.0,0.0,https://www.imdb.com/title/tt0039931/


## 4. Show Posters with Details Function <a class="anchor"  id="showposter"></a>

In [29]:
# Show Posters Function

from IPython.display import HTML

def show_posters(df, poster_path_column='poster_path', title_column='title', date_column='release_date',
                 overview_column='overview', director_column='director', status_column='status',
                 genre_column='genres', output_file='posters.html'):
    """Render movies as a wrapping grid of poster cards and display it in the notebook.

    Each row of ``df`` becomes one card. Cards with a poster show the image and
    reveal title, director, release date, genre and overview when hovered;
    cards without a poster show those details permanently.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per movie.
    poster_path_column, title_column, date_column, overview_column,
    director_column, genre_column : str
        Names of the columns to read. Date values that offer ``strftime`` are
        shown as YYYY-MM-DD; other non-missing values are shown as text.
    status_column : str
        Accepted for backward compatibility; the status is not rendered.
    output_file : str or None
        Path the generated HTML is also written to (UTF-8). Pass ``None`` to
        skip writing a file.

    Notes
    -----
    * Every value is HTML-escaped before insertion, so markup characters in
      the data (``<``, ``&``, quotes) cannot break the page or inject script.
    * Missing values are shown as 'N/A'.
    * The hover effect is pure CSS scoped to this widget's own classes, so it
      cannot affect other elements of the notebook page.
    * ``HTML`` and ``display`` are expected to be available in the notebook
      namespace, as in the original cell.
    """
    # Imported locally under its own name so it cannot be confused with HTML text variables.
    from html import escape

    def _text(value):
        """HTML-escaped text for a cell value, or 'N/A' when it is missing."""
        return 'N/A' if pd.isnull(value) else escape(str(value))

    def _date(value):
        """Date formatted as YYYY-MM-DD when possible, else escaped text or 'N/A'."""
        if pd.isnull(value):
            return 'N/A'
        if hasattr(value, 'strftime'):
            return value.strftime('%Y-%m-%d')
        return escape(str(value))

    # Plain (non f-string) CSS block: the overlay is hidden until its card is hovered.
    parts = ["""<div style='display: flex; flex-wrap: wrap;'>
    <style>
    .tmdb-card { position: relative; margin: 10px; }
    .tmdb-card-empty { width: 200px; height: 300px; background-color: rgba(0, 0, 0, 0.7); color: white; }
    .tmdb-card-overlay { position: absolute; top: 0; left: 0; width: 200px; height: 300px; background-color: rgba(0, 0, 0, 0.7); color: white; opacity: 0; transition: opacity 0.5s; }
    .tmdb-card:hover .tmdb-card-overlay { opacity: 1; }
    </style>
    """]

    for _, row in df.iterrows():
        poster_path = row[poster_path_column]
        title = _text(row[title_column])
        details = f"""
                    <div style="padding: 10px;">
                        <strong>{title}</strong><br>
                        Director: {_text(row[director_column])}<br>
                        Release: {_date(row[date_column])}<br>
                        Genre: {_text(row[genre_column])}<br>
                        <p style="font-size: 11px;">{_text(row[overview_column])}</p>
                    </div>"""

        if pd.notnull(poster_path):
            parts.append(f"""
            <div class="tmdb-card">
                <img src="{escape(str(poster_path))}" alt="{title}" style="width: 200px; height: auto;"/>
                <div class="tmdb-card-overlay">{details}
                </div>
            </div>
            """)
        else:
            parts.append(f"""
            <div class="tmdb-card tmdb-card-empty">{details}
            </div>
            """)

    parts.append("</div>")
    markup = "".join(parts)

    if output_file is not None:
        # Explicit UTF-8 so non-ASCII titles are written correctly on every platform.
        with open(output_file, 'w', encoding='utf-8') as file:
            file.write(markup)
    display(HTML(markup))

In [30]:
# Show the 20 movies of the sample with the highest vote_count.
show_posters(df_poster_sample.sort_values(by='vote_count', ascending= False).head(20))