In [1]:
# Exercise 1: Identifying and Handling Missing Data
import pandas as pd

# Sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', None],
    'Age': [24, 30, None, 22, 35],
    'Salary': [48000, None, 57000, None, 60000]
}
df = pd.DataFrame(data)

# Filling missing values and dropping rows
df['Age'].fillna(df['Age'].mean(), inplace=True)
df['Salary'].fillna(df['Salary'].median(), inplace=True)
df.dropna(subset=['Name'], inplace=True)
print('After cleaning:\n', df)


After cleaning:
       Name    Age   Salary
0    Alice  24.00  48000.0
1      Bob  30.00  57000.0
2  Charlie  27.75  57000.0
3    David  22.00  57000.0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Salary'].fillna(df['Salary'].median(), inplace=True)


In [2]:
# Exercise 2: Standardizing Categorical Data
# Sample dataset with inconsistent categorical values
data = {
    'Product': ['Laptop', 'Laptop', 'Desktop', 'Tablet', 'Tablet'],
    'Category': ['Electronics', 'electronics', 'Electronics', 'Gadgets', 'gadgets']
}
df = pd.DataFrame(data)

# Standardize category values
df['Category'] = df['Category'].str.capitalize()
print('Standardized Data:\n', df)


Standardized Data:
    Product     Category
0   Laptop  Electronics
1   Laptop  Electronics
2  Desktop  Electronics
3   Tablet      Gadgets
4   Tablet      Gadgets


In [8]:
# Pratctice Tasks 1
# Load a dataset of your choice and identify missing values 

import pandas as pd

# Ganti path di bawah ini sesuai hasil dari output kagglehub kamu
path = "/home/nadhifa/Downloads/archive/Titanic-Dataset.csv"

# Membaca dataset Titanic
df = df[['Name', 'Sex', 'Age', 'Pclass', 'Survived', 'PassengerId', 
         'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']]

# Menampilkan 5 baris pertama untuk melihat isi data
df.head()


Unnamed: 0,Name,Sex,Age,Pclass,Survived,PassengerId,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,"Braund, Mr. Owen Harris",male,22.0,3,0,1,1,0,A/5 21171,7.25,,S
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,1,2,1,0,PC 17599,71.2833,C85,C
2,"Heikkinen, Miss. Laina",female,26.0,3,1,3,0,0,STON/O2. 3101282,7.925,,S
3,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,1,4,1,0,113803,53.1,C123,S
4,"Allen, Mr. William Henry",male,35.0,3,0,5,0,0,373450,8.05,,S


In [10]:
# Practice Task 2
# Normalizing numerical columns (Age & Fare)

from sklearn.preprocessing import MinMaxScaler

# Menampilkan kolom numerik yang akan dinormalisasi
print("Kolom numerik sebelum normalisasi:")
print(df[['Age', 'Fare']].head())

# Daftar kolom numerik
numeric_columns = ['Age', 'Fare']

# Membuat objek scaler
scaler = MinMaxScaler()

# Menerapkan normalisasi Min-Max
df[numeric_columns] = scaler.fit_transform(df[numeric_columns])

print("\nKolom numerik setelah normalisasi:")
print(df[['Age', 'Fare']].head())


Kolom numerik sebelum normalisasi:
        Age      Fare
0  0.271174  0.014151
1  0.472229  0.139136
2  0.321438  0.015469
3  0.434531  0.103644
4  0.434531  0.015713

Kolom numerik setelah normalisasi:
        Age      Fare
0  0.271174  0.014151
1  0.472229  0.139136
2  0.321438  0.015469
3  0.434531  0.103644
4  0.434531  0.015713


In [11]:
# Practice Task 3
# Standardizing categorical columns & removing duplicate rows

# Menampilkan contoh data sebelum standardisasi
print("Sebelum standardisasi kolom kategori:")
print(df[['Sex', 'Embarked']].head())

# Standarisasi kolom kategorikal (huruf awal kapital)
df['Sex'] = df['Sex'].str.capitalize()
df['Embarked'] = df['Embarked'].str.capitalize()

print("\nSetelah standardisasi kolom kategori:")
print(df[['Sex', 'Embarked']].head())

# Mengecek jumlah data sebelum hapus duplikasi
print("\nJumlah data sebelum menghapus duplikasi:", len(df))

# Menghapus baris duplikat
df = df.drop_duplicates()

# Mengecek jumlah data setelah hapus duplikasi
print("Jumlah data setelah menghapus duplikasi:", len(df))


Sebelum standardisasi kolom kategori:
      Sex Embarked
0    male        S
1  female        C
2  female        S
3  female        S
4    male        S

Setelah standardisasi kolom kategori:
      Sex Embarked
0    Male        S
1  Female        C
2  Female        S
3  Female        S
4    Male        S

Jumlah data sebelum menghapus duplikasi: 891
Jumlah data setelah menghapus duplikasi: 891
