In [165]:
# import all necessary library

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import sklearn
import re
from dateutil import parser

In [166]:
# load dataset
# CSV uses semicolon delimiter and non-UTF8 encoding (common in scraped datasets)

df = pd.read_csv(
    "../data/raw/messy_IMDB_dataset.csv",
    sep=";",
    encoding="latin1"
)

In [167]:
# 1. Basic overview of the dataframe
# ----------------------------------
# Check number of rows and columns
print(df.shape)
# Display column names
print(df.columns.tolist())
# Preview first 5 rows
df.head()

(101, 12)
['IMBD title ID', 'Original titlÊ', 'Release year', 'Genrë¨', 'Duration', 'Country', 'Content Rating', 'Director', 'Unnamed: 8', 'Income', ' Votes ', 'Score']


Unnamed: 0,IMBD title ID,Original titlÊ,Release year,Genrë¨,Duration,Country,Content Rating,Director,Unnamed: 8,Income,Votes,Score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,,$ 222831817,1.780.147,"8,9f"


In [168]:
# 2. Standardize column headers

df = df.rename(columns={
    'IMBD title ID': 'imdb_title_id',
    'Original titlÊ': 'original_title',
    'Release year': 'release_year',
    'Genrë¨': 'genre',
    'Duration': 'duration',
    'Country': 'country',
    'Content Rating': 'content_rating',
    'Director': 'director',
    'Unnamed: 8': 'unnamed_8',   # bisa diganti atau di-drop kalau tidak berguna
    'Income': 'income',
    ' Votes ': 'votes',
    'Score': 'score'
})

print(df.columns)

Index(['imdb_title_id', 'original_title', 'release_year', 'genre', 'duration',
       'country', 'content_rating', 'director', 'unnamed_8', 'income', 'votes',
       'score'],
      dtype='object')


In [169]:
# 3. Data types & missing values
# -------------------------------
# Check data types and non-null counts
df.info()

# Count missing values per column
df.isna().sum()

# Percentage of missing values per column
(df.isna().mean() * 100).round(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   imdb_title_id   100 non-null    object 
 1   original_title  100 non-null    object 
 2   release_year    100 non-null    object 
 3   genre           100 non-null    object 
 4   duration        99 non-null     object 
 5   country         100 non-null    object 
 6   content_rating  77 non-null     object 
 7   director        100 non-null    object 
 8   unnamed_8       0 non-null      float64
 9   income          100 non-null    object 
 10  votes           100 non-null    object 
 11  score           100 non-null    object 
dtypes: float64(1), object(11)
memory usage: 9.6+ KB


imdb_title_id       0.99
original_title      0.99
release_year        0.99
genre               0.99
duration            1.98
country             0.99
content_rating     23.76
director            0.99
unnamed_8         100.00
income              0.99
votes               0.99
score               0.99
dtype: float64

In [170]:
# Drop the 'unnamed_8' column (empty data)
df = df.drop(columns=['unnamed_8'])

In [171]:
# 3. Summary statistics
# ---------------------
# Numerical columns summary
df.describe()

# Include categorical columns
df.describe(include="object")

Unnamed: 0,imdb_title_id,original_title,release_year,genre,duration,country,content_rating,director,income,votes,score
count,100,100,100,100,99,100,77,100,100,100,100.0
unique,100,100,99,59,71,18,7,64,100,100,28.0
top,tt0111161,The Shawshank Redemption,2000-05-19,Drama,130,USA,R,Christopher Nolan,$ 28815245,2.278.845,8.6
freq,1,1,2,9,3,62,45,6,1,1,11.0


In [172]:
# 3. Duplicate rows
# -----------------
# Check number of duplicated rows
print("number of duplicated rows: ", df.duplicated().sum())

# Cek jumlah missing values per kolom
print(df.isnull().sum())

# Cek total missing values di seluruh dataframe
print("Total missing values:", df.isnull().sum().sum())

# Jumlah baris yang mengandung missing values
print("Number of rows with missing values:", missing_rows.shape[0])



number of duplicated rows:  0
imdb_title_id      1
original_title     1
release_year       1
genre              1
duration           2
country            1
content_rating    24
director           1
income             1
votes              1
score              1
dtype: int64
Total missing values: 35
Number of rows with missing values: 25


In [173]:
# Filter rows where imdb_title_id is missing
missing_imdb = df[df['imdb_title_id'].isnull()]
print(missing_imdb)

# Jumlah baris yang missing
print("Number of rows with missing imdb_title_id:", missing_imdb.shape[0])


   imdb_title_id original_title release_year genre duration country  \
13           NaN            NaN          NaN   NaN      NaN     NaN   

   content_rating director income votes score  
13            NaN      NaN    NaN   NaN   NaN  
Number of rows with missing imdb_title_id: 1


In [174]:
# Drop rows where imdb_title_id is missing
df = df.dropna(subset=['imdb_title_id'])

# Cek ukuran dataframe setelah drop
print(df.shape)

(100, 11)


In [175]:
df.describe()

Unnamed: 0,imdb_title_id,original_title,release_year,genre,duration,country,content_rating,director,income,votes,score
count,100,100,100,100,99,100,77,100,100,100,100.0
unique,100,100,99,59,71,18,7,64,100,100,28.0
top,tt0111161,The Shawshank Redemption,2000-05-19,Drama,130,USA,R,Christopher Nolan,$ 28815245,2.278.845,8.6
freq,1,1,2,9,3,62,45,6,1,1,11.0


In [176]:
# Count empty strings in object columns
for col in df.select_dtypes(include="object").columns:
    empty_count = (df[col].astype(str).str.strip() == "").sum()
    print(f"{col}: {empty_count} empty/whitespace values")

imdb_title_id: 0 empty/whitespace values
original_title: 0 empty/whitespace values
release_year: 0 empty/whitespace values
genre: 0 empty/whitespace values
duration: 1 empty/whitespace values
country: 0 empty/whitespace values
content_rating: 0 empty/whitespace values
director: 0 empty/whitespace values
income: 0 empty/whitespace values
votes: 0 empty/whitespace values
score: 0 empty/whitespace values


In [177]:
# Hitung mode dari kolom 'duration'
mode_value = df['duration'].mode()[0]

# Isi cell kosong dengan mode
df['duration'] = df['duration'].fillna(mode_value)

# Cek hasil
print("Mode value used:", mode_value)
print(df['duration'].isnull().sum())  # harusnya 0

Mode value used: 116
0


In [178]:
df.describe()

Unnamed: 0,imdb_title_id,original_title,release_year,genre,duration,country,content_rating,director,income,votes,score
count,100,100,100,100,100,100,77,100,100,100,100.0
unique,100,100,99,59,71,18,7,64,100,100,28.0
top,tt0111161,The Shawshank Redemption,2000-05-19,Drama,116,USA,R,Christopher Nolan,$ 28815245,2.278.845,8.6
freq,1,1,2,9,4,62,45,6,1,1,11.0


In [179]:
# Print unique values and their counts from 'content_rating'
print(df['content_rating'].value_counts())

# Kalau mau termasuk NaN (missing values) juga dihitung:
print(df['content_rating'].value_counts(dropna=False))


content_rating
R            45
PG-13        12
PG           11
G             6
Not Rated     1
Approved      1
Unrated       1
Name: count, dtype: int64
content_rating
R            45
NaN          23
PG-13        12
PG           11
G             6
Not Rated     1
Approved      1
Unrated       1
Name: count, dtype: int64


In [180]:
# Gabungkan kategori
df['content_rating'] = df['content_rating'].replace({
    'Unrated': 'Not Rated',
    'Approved': 'Not Rated'   # atau biarkan 'Approved' kalau mau analisis historis
})

# Isi NaN dengan 'Not Rated'
df['content_rating'] = df['content_rating'].fillna('Not Rated')

# Cek hasil distribusi baru
print(df['content_rating'].value_counts())


content_rating
R            45
Not Rated    26
PG-13        12
PG           11
G             6
Name: count, dtype: int64


In [181]:
df.head()

Unnamed: 0,imdb_title_id,original_title,release_year,genre,duration,country,content_rating,director,income,votes,score
0,tt0111161,The Shawshank Redemption,1995-02-10,Drama,142.0,USA,R,Frank Darabont,$ 28815245,2.278.845,9.3
1,tt0068646,The Godfather,09 21 1972,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,$ 246120974,1.572.674,9.2
2,tt0468569,The Dark Knight,23 -07-2008,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,$ 1005455211,2.241.615,9.
3,tt0071562,The Godfather: Part II,1975-09-25,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,"$ 4o8,035,783",1.098.714,9.0
4,tt0110912,Pulp Fiction,1994-10-28,"Crime, Drama",,USA,R,Quentin Tarantino,$ 222831817,1.780.147,"8,9f"


In [182]:
# Regex pattern untuk IMDb ID: 'tt' + digit (minimal 7 angka)
pattern = r'^tt\d{7,}$'

# Cari baris yang tidak sesuai format
outliers = df[~df['imdb_title_id'].astype(str).str.match(pattern)]

print("Number of outliers:", outliers.shape[0])
print(outliers[['imdb_title_id']])


Number of outliers: 0
Empty DataFrame
Columns: [imdb_title_id]
Index: []


In [183]:
# 1. Cek apakah ada nilai kosong
missing_titles = df[df['original_title'].isnull()]
print("Missing titles:", missing_titles.shape[0])

# 2. Cek apakah ada nilai yang bukan string
non_string_titles = df[~df['original_title'].apply(lambda x: isinstance(x, str))]
print("Non-string titles:", non_string_titles.shape[0])

# 3. Cek apakah ada judul yang terlalu pendek (misalnya 1 huruf atau kosong string)
short_titles = df[df['original_title'].str.len() < 2]
print("Suspiciously short titles:", short_titles.shape[0])

# 4. Cek karakter aneh (misalnya hanya angka atau simbol)
weird_titles = df[df['original_title'].str.match(r'^[^A-Za-z]+$', na=False)]
print("Weird titles:", weird_titles.shape[0])
print(weird_titles[['original_title']])

Missing titles: 0
Non-string titles: 0
Suspiciously short titles: 0
Weird titles: 1
   original_title
90           1917


In [184]:
# 10 unique values (acak urutan)
print(df['release_year'].unique()[:10])

# 10 unique values paling sering muncul
print(df['release_year'].value_counts().head(10))


['1995-02-10' '09 21 1972' ' 23 -07-2008' '1975-09-25' '1994-10-28'
 '22 Feb 04' '1994-03-11' '1957-09-04' '2010-09-24' '10-29-99']
release_year
2000-05-19                     2
1995-02-10                     1
2007-04-06                     1
1983-10-21                     1
1995-12-01                     1
2000-01-21                     1
2009-10-02                     1
The 6th of marzo, year 1951    1
1945-06-25                     1
1984-09-28                     1
Name: count, dtype: int64


In [185]:
# Tambahkan kolom baru hasil parsing
df['release_year_parsed'] = pd.to_datetime(
    df['release_year'], 
    errors='coerce', 
    infer_datetime_format=True
)

# Lihat semua baris yang hasil parsing-nya NaT
naT_rows = df[df['release_year_parsed'].isna()]
print(naT_rows[['release_year', 'release_year_parsed']])

                   release_year release_year_parsed
1                    09 21 1972                 NaT
2                   23 -07-2008                 NaT
5                     22 Feb 04                 NaT
9                      10-29-99                 NaT
12       23rd December of 1966                  NaT
15                     01/16-03                 NaT
18                   18/11/1976                 NaT
45                     21-11-46                 NaT
70  The 6th of marzo, year 1951                 NaT
83                   1984-02-34                 NaT
84                   1976-13-24                 NaT


  df['release_year_parsed'] = pd.to_datetime(


In [186]:
# Mapping bulan non-standar ke bahasa Inggris
month_map = {
    "marzo": "March",
    # tambahkan jika ada bulan lain
}

def fix_date(val):
    if pd.isnull(val):
        return pd.NaT
    s = str(val).strip()
    
    # ubah kata bulan non-standar
    for k, v in month_map.items():
        if k.lower() in s.lower():
            s = s.lower().replace(k.lower(), v)
    
    try:
        # parsing fleksibel
        dt = parser.parse(s, dayfirst=True, yearfirst=False)
        return dt
    except:
        return pd.NaT

# 1. Tambahkan kolom hasil parsing awal
df['release_year_parsed'] = pd.to_datetime(df['release_year'], errors='coerce', infer_datetime_format=True)

# 2. Perbaiki baris yang NaT dengan fungsi fix_date
mask_nat = df['release_year_parsed'].isna()
df.loc[mask_nat, 'release_year_parsed'] = df.loc[mask_nat, 'release_year'].apply(fix_date)

# 3. Pisahkan tahunnya saja ke kolom release_year tanpa desimal
df['release_year'] = df['release_year_parsed'].dt.year.astype('Int64')  # nullable integer, aman kalau ada NaT


# 4. Ganti nama kolom tanggal lengkap jadi release_date
df = df.rename(columns={'release_year_parsed': 'release_date'})

# Cek hasil
print(df[['release_year', 'release_date']].head(20))


    release_year release_date
0           1995   1995-02-10
1           1972   1972-09-21
2           2008   2008-07-23
3           1975   1975-09-25
4           1994   1994-10-28
5           2004   2004-02-22
6           1994   1994-03-11
7           1957   1957-09-04
8           2010   2010-09-24
9           1999   1999-10-29
10          1994   1994-10-06
11          2002   2002-01-18
12          1966   1966-12-23
14          1999   1999-05-07
15          2003   2003-01-16
16          1980   1980-09-19
17          1990   1990-09-20
18          1976   1976-11-18
19          2014   2014-11-06
20          1995   1995-12-15


  df['release_year_parsed'] = pd.to_datetime(df['release_year'], errors='coerce', infer_datetime_format=True)


In [187]:
df.head()

Unnamed: 0,imdb_title_id,original_title,release_year,genre,duration,country,content_rating,director,income,votes,score,release_date
0,tt0111161,The Shawshank Redemption,1995,Drama,142.0,USA,R,Frank Darabont,$ 28815245,2.278.845,9.3,1995-02-10
1,tt0068646,The Godfather,1972,"Crime, Drama",175.0,USA,R,Francis Ford Coppola,$ 246120974,1.572.674,9.2,1972-09-21
2,tt0468569,The Dark Knight,2008,"Action, Crime, Drama",152.0,US,PG-13,Christopher Nolan,$ 1005455211,2.241.615,9.,2008-07-23
3,tt0071562,The Godfather: Part II,1975,"Crime, Drama",220.0,USA,R,Francis Ford Coppola,"$ 4o8,035,783",1.098.714,9.0,1975-09-25
4,tt0110912,Pulp Fiction,1994,"Crime, Drama",,USA,R,Quentin Tarantino,$ 222831817,1.780.147,"8,9f",1994-10-28
