

# **1. Import Libraries**



In [1]:
import pandas as pd

In [2]:
# Langkah 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [11]:
# Load dataset
file_path = "/content/drive/MyDrive/Colab Notebooks/netflix_dirty.csv"# ganti sesuai path di Colab
df_dirty = pd.read_csv(file_path)


# **2. Baca Dataset pelanggan yang kotor**

In [4]:
# Tampilkan 5 baris pertama
print("=== Dataset Asli ===")
print(df.head())
print("\nInfo Dataset:")
print(df.info())
print("\nJumlah Data & Kolom:", df.shape)

=== Dataset Asli ===
  show_id     type                             title         director  \
0      s1    Movie              Dick Johnson Is Dead  Kirsten Johnson   
1      s3  TV Show                         Ganglands  Julien Leclercq   
2      s6  TV Show                     Midnight Mass    Mike Flanagan   
3     s14    Movie  Confessions of an Invisible Girl    Bruno Garotti   
4      s8    Movie                           Sankofa     Haile Gerima   

         country date_added  release_year rating  duration  \
0  United States  9/25/2021          2020  PG-13    90 min   
1         France  9/24/2021          2021  TV-MA  1 Season   
2  United States  9/24/2021          2021  TV-MA  1 Season   
3         Brazil  9/22/2021          2021  TV-PG    91 min   
4  United States  9/24/2021          1993  TV-MA   125 min   

                                           listed_in  cast  
0                                      Documentaries   NaN  
1  Crime TV Shows, International TV Shows, TV

# **3. Mengecek Missing Values**

In [5]:
print("\n=== Cek Missing Values ===")
print(df.isnull().sum())


=== Cek Missing Values ===
show_id            0
type               0
title              0
director          25
country           25
date_added         0
release_year       0
rating            25
duration           0
listed_in          0
cast            8800
dtype: int64


# **4. Perbaikan Missing Values**

In [12]:
# Kolom 'director' → isi dengan 'Unknown'
df_dirty['director'].fillna("Unknown", inplace=True)

# Kolom 'country' → isi dengan 'Unknown'
df_dirty['country'].fillna("Unknown", inplace=True)

# Kolom 'rating' → isi dengan modus (nilai yang paling sering muncul)
df_dirty['rating'].fillna(df_dirty['rating'].mode()[0], inplace=True)

# Kolom 'cast' → karena NaN sangat banyak, isi dengan 'Not Available'
df_dirty['cast'].fillna("Not Available", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dirty['director'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_dirty['country'].fillna("Unknown", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are set

# **5. Cek Ulang Perbaikan Mising Velue**

In [13]:
print("\n=== Missing Values Setelah Perbaikan ===")
print(df_dirty.isnull().sum())


=== Missing Values Setelah Perbaikan ===
show_id         0
type            0
title           0
director        0
country         0
date_added      0
release_year    0
rating          0
duration        0
listed_in       0
cast            0
dtype: int64


# **6. Cek & Hapus Duplikasi**

In [14]:
# Cell 5: Cek data duplikat
print("=== Cek Data Duplikat ===")
dup_count = df_dirty.duplicated().sum()
print(f"Jumlah baris duplikat: {dup_count}")

# Jika ada duplikat, tampilkan contoh 5 baris pertama yang duplikat
if dup_count > 0:
    print("\nContoh data duplikat:")
    display(df_dirty[df_dirty.duplicated()].head())
else:
    print("Tidak ada data duplikat.")

=== Cek Data Duplikat ===
Jumlah baris duplikat: 10

Contoh data duplikat:


Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in,cast
8790,s5505,Movie,Kiss & Cry,Sean Cisterna,Canada,5/1/2017,2017,TV-MA,93 min,"Dramas, International Movies, Romantic Movies",Not Available
8791,s6264,TV Show,Beating Again,Not Given,South Korea,5/22/2017,2015,TV-14,1 Season,"International TV Shows, Korean TV Shows, Roman...",Not Available
8792,s1463,TV Show,Running Man,Not Given,Pakistan,1/1/2021,2020,TV-Y7,1 Season,"Kids' TV, TV Comedies",Not Available
8793,s6741,Movie,Farce,Shadi Ali,Egypt,6/6/2019,2017,TV-MA,94 min,"Comedies, International Movies",Not Available
8794,s2758,Movie,Pretty Little Stalker,Sam Irvin,United States,3/31/2020,2018,TV-14,84 min,Thrillers,Not Available


In [15]:
# === Hapus Data Duplikat ===
print("=== Hapus Data Duplikat ===")

# Hitung jumlah data sebelum
before = df_dirty.shape[0]

# Hapus baris yang duplikat
df_clean = df_dirty.drop_duplicates()

# Hitung jumlah data sesudah
after = df_clean.shape[0]

print(f"Jumlah baris sebelum cleaning: {before}")
print(f"Jumlah baris setelah cleaning: {after}")
print(f"Jumlah baris yang terhapus (duplikat): {before - after}")

=== Hapus Data Duplikat ===
Jumlah baris sebelum cleaning: 8800
Jumlah baris setelah cleaning: 8790
Jumlah baris yang terhapus (duplikat): 10


# **7. Cek Outlier Pada Kolom Realese_Tahun**

In [27]:
# 11. Cek outlier pada kolom release_year
print("\n=== Nilai Unik release_year ===")
print(sorted(df_clean['release_year'].unique()))


=== Nilai Unik release_year ===
[np.int64(1925), np.int64(1942), np.int64(1943), np.int64(1944), np.int64(1945), np.int64(1946), np.int64(1947), np.int64(1954), np.int64(1955), np.int64(1956), np.int64(1958), np.int64(1959), np.int64(1960), np.int64(1961), np.int64(1962), np.int64(1963), np.int64(1964), np.int64(1965), np.int64(1966), np.int64(1967), np.int64(1968), np.int64(1969), np.int64(1970), np.int64(1971), np.int64(1972), np.int64(1973), np.int64(1974), np.int64(1975), np.int64(1976), np.int64(1977), np.int64(1978), np.int64(1979), np.int64(1980), np.int64(1981), np.int64(1982), np.int64(1983), np.int64(1984), np.int64(1985), np.int64(1986), np.int64(1987), np.int64(1988), np.int64(1989), np.int64(1990), np.int64(1991), np.int64(1992), np.int64(1993), np.int64(1994), np.int64(1995), np.int64(1996), np.int64(1997), np.int64(1998), np.int64(1999), np.int64(2000), np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int

# **8. Standarisasi Format Data**

In [32]:
# 1. Kolom title → huruf kecil + hapus spasi berlebih
df['title'] = df['title'].str.strip().str.lower()

In [33]:
# Cek hasil standarisasi kolom title
print("=== Contoh kolom title setelah standarisasi ===")
print(df['title'].head(10))


=== Contoh kolom title setelah standarisasi ===
0                dick johnson is dead
1                           ganglands
2                       midnight mass
3    confessions of an invisible girl
4                             sankofa
5       the great british baking show
6                        the starling
7     motu patlu in the game of zones
8                        je suis karl
9            motu patlu in wonderland
Name: title, dtype: object


In [34]:
# 4. Kolom rating → seragamkan kapitalisasi
df['rating'] = df['rating'].str.upper().str.strip()

In [35]:
# Cek hasil standarisasi kolom rating
print("\n=== Nilai unik pada kolom rating setelah standarisasi ===")
print(df['rating'].unique())


=== Nilai unik pada kolom rating setelah standarisasi ===
['PG-13' 'TV-MA' 'TV-PG' 'TV-14' 'TV-Y7' 'TV-Y' 'PG' 'TV-G' nan 'R' 'G'
 'NC-17' 'NR' 'TV-Y7-FV' 'UR']


# **9. Cek Nilai Tidak Valid**

In [36]:
# 1. Cek nilai tidak valid pada release_year
print("=== Nilai tidak valid pada release_year ===")
invalid_years = df[(df['release_year'] < 1900) | (df['release_year'] > 2025)]
print(invalid_years['release_year'].unique())

=== Nilai tidak valid pada release_year ===
[]


# **10. Cek Kolom Tidak Relevan**

In [39]:
# 1. Cek jumlah nilai unik per kolom
print("=== Jumlah nilai unik per kolom ===")
print(df.nunique())

=== Jumlah nilai unik per kolom ===
show_id         8790
type               2
title           8781
director        4518
country           86
date_added      1713
release_year      74
rating            14
duration         220
listed_in        513
cast               0
dtype: int64


In [40]:
# Identifikasi kolom kandidat tidak relevan
irrelevant_cols = []

# Kolom dengan 1 nilai unik
single_value_cols = df.nunique()[df.nunique() == 1].index.tolist()
irrelevant_cols.extend(single_value_cols)

In [41]:
# Hapus kolom tidak relevan
df = df.drop(columns=set(irrelevant_cols), errors='ignore')

print("\n=== Setelah perbaikan ===")
print("Kolom dataset sekarang:", df.columns.tolist())


=== Setelah perbaikan ===
Kolom dataset sekarang: ['show_id', 'type', 'title', 'director', 'country', 'date_added', 'release_year', 'rating', 'duration', 'listed_in', 'cast']


# **11. Dataset Setelah Cleansing**
Menampilkan dataset akhir setelah proses cleansing:  
- Tidak ada missing value penting  
- Tidak ada duplikasi  
- Standarisasi (Kolom title → huruf kecil + hapus spasi berlebih)  
- Standarisasi (Kolom rating → seragamkan kapitalisasi)
- Tidak ada nilai yang tidak valid
- Tidak ada Kolong Tidak Relevan

In [42]:
# === Tampilkan hasil dataset setelah cleansing ===

print("=== Preview Dataset Bersih ===")
display(df.head(10))   # tampilkan 10 baris pertama

print("\n=== Info Dataset Bersih ===")
print(df.info())

print("\n=== Jumlah Baris & Kolom Dataset Bersih ===")
print(df.shape)

=== Preview Dataset Bersih ===


Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in,cast
0,s1,Movie,dick johnson is dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries,
1,s3,TV Show,ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",
2,s6,TV Show,midnight mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries",
3,s14,Movie,confessions of an invisible girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies",
4,s8,Movie,sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies",
5,s9,TV Show,the great british baking show,Andy Devonshire,United Kingdom,9/24/2021,2021,TV-14,9 Seasons,"British TV Shows, Reality TV",
6,s10,Movie,the starling,Theodore Melfi,United States,9/24/2021,2021,PG-13,104 min,"Comedies, Dramas",
7,s939,Movie,motu patlu in the game of zones,Suhas Kadav,India,5/1/2021,2019,TV-Y7,87 min,"Children & Family Movies, Comedies, Music & Mu...",
8,s13,Movie,je suis karl,Christian Schwochow,Germany,9/23/2021,2021,TV-MA,127 min,"Dramas, International Movies",
9,s940,Movie,motu patlu in wonderland,Suhas Kadav,India,5/1/2021,2013,TV-Y7,76 min,"Children & Family Movies, Music & Musicals",



=== Info Dataset Bersih ===
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8800 entries, 0 to 8799
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       8800 non-null   object 
 1   type          8800 non-null   object 
 2   title         8800 non-null   object 
 3   director      8775 non-null   object 
 4   country       8775 non-null   object 
 5   date_added    8800 non-null   object 
 6   release_year  8800 non-null   int64  
 7   rating        8775 non-null   object 
 8   duration      8800 non-null   object 
 9   listed_in     8800 non-null   object 
 10  cast          0 non-null      float64
dtypes: float64(1), int64(1), object(9)
memory usage: 756.4+ KB
None

=== Jumlah Baris & Kolom Dataset Bersih ===
(8800, 11)


# **12. Kesimpulan**
