In [1]:
import pandas as pd

In [2]:
# sample dataset
data = {
    'Name': ['Ali', 'Sara', 'John',None, 'Mary'],
    'Age': [25, None, 30, 22, None],
    'City': ['KL', 'Penang', None, 'Johor','KL']
}

df = pd.DataFrame(data)
print('Original DataFrame:')
print(df)

# Check the missing values Row by Row

In [6]:
for index, row in df.iterrows():
    print(f"Row {index}: Missing = {row.isnull().any()}, Details = {row.isnull().to_dict()}")

Row 0: Missing = False, Details = {'Name': False, 'Age': False, 'City': False}
Row 1: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}
Row 2: Missing = True, Details = {'Name': False, 'Age': False, 'City': True}
Row 3: Missing = True, Details = {'Name': True, 'Age': False, 'City': False}
Row 4: Missing = True, Details = {'Name': False, 'Age': True, 'City': False}


In [7]:
# show only row with missing data
missing_rows = df[df.isnull().any(axis=1)]
print('Rows with missing data:')
print(missing_rows)

Rows with missing data:
   Name   Age    City
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [8]:
# Simulate row that would be dropped using dropna()
to_drop = df[df.isnull().any(axis=1)]
print('These rows would be dropped using dropna():')

These rows would be dropped using dropna():


In [9]:
df_cleaned=df.dropna()
print('\nAfter dropna():')
print(df_cleaned)


After dropna():
  Name   Age City
0  Ali  25.0   KL


In [10]:
print('Before fillna():')
print(df)

Before fillna():
   Name   Age    City
0   Ali  25.0      KL
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


In [12]:
df_filled=df.fillna({
    'Name': 'Unknown',
    'Age': df['Age'].mean(),
    'City':'Not Available'
})

In [13]:
print('\nAfter fillna():')
print(df_filled)


After fillna():
      Name        Age           City
0      Ali  25.000000             KL
1     Sara  25.666667         Penang
2     John  30.000000  Not Available
3  Unknown  22.000000          Johor
4     Mary  25.666667             KL


In [19]:
df_sfill=df.fillna({
    'Name': 'Unknown',
    'Age': round (df['Age'].mean(),2),
    'City':'Not Available'
})

In [20]:
print('\nAfter fillna(): age upto decimal digit')
print(df_sfill)


After fillna(): age upto decimal digit
      Name    Age           City
0      Ali  25.00             KL
1     Sara  25.67         Penang
2     John  30.00  Not Available
3  Unknown  22.00          Johor
4     Mary  25.67             KL


In [21]:
import math

In [22]:
df_sfill=df.fillna({
    'Name': 'Unknown',
    'Age': math.floor (df['Age'].mean()),
    'City':'Not Available'
})

In [27]:
print('\nAfter fillna(): floor without decimal')
print(df)


After fillna(): floor without decimal
   Name   Age    City
0   Ali  25.0      KL
1  Sara   NaN  Penang
2  John  30.0    None
3  None  22.0   Johor
4  Mary   NaN      KL


## Data Cleaning Exercise
## Step 1:Use Day-15 students_performance_dirty.csv , You can download from GitHub (if you miss out Day-15)
## Step 2: Get basic information about dataset
## Step 3: print('Missing values per column:')
## Step 4: Check missing data line by line
## Step 5: Drop missing rows (if any)
## Step 6: Compare before and after¶
## Step 7: Fill missing values
## Step 8: Compare before and after

In [48]:
import pandas as pd

In [49]:
df = pd.read_csv('students_performance_dirty(1).csv')
print("Dataset loaded successfully!")
print(df.head())

Dataset loaded successfully!
   gender  study_hours  attendance_pct  math_score  reading_score  final_score
0    male          1.7            83.6        62.0           91.0         38.9
1     NaN          2.6            91.2        82.0           60.0         37.5
2  female          2.9            97.5        69.0           57.0         35.0
3  female          4.8            85.7        78.0           62.0         36.5
4    male          3.9           -10.0        64.0           95.0         30.9


In [50]:
for index, row in df.iterrows():
    print(f"Row {index}: Missing = {row.isnull().any()}, Details = {row.isnull().to_dict()}")

Row 0: Missing = False, Details = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 1: Missing = True, Details = {'gender': True, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 2: Missing = False, Details = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 3: Missing = False, Details = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 4: Missing = False, Details = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 5: Missing = False, Details = {'gender': False, 'study_hours': False, 'attendance_pct': False, 'math_score': False, 'reading_score': False, 'final_score': False}
Row 6:

In [51]:
missing_rows = df[df.isnull().any(axis=1)]
print('Rows with missing data:')
print(missing_rows)

Rows with missing data:
    gender  study_hours  attendance_pct  math_score  reading_score  \
1      NaN          2.6            91.2        82.0           60.0   
6     male          NaN            87.1        95.0           78.0   
7    femle          NaN            70.6        59.0           70.0   
11     NaN          4.6            86.2        63.0           78.0   
12   femle          NaN            95.0        59.0           92.0   
13    male          NaN            86.5        61.0           50.0   
20  female          NaN            93.5        65.0           54.0   
28  female          NaN            71.7        65.0           53.0   
33     NaN          NaN            96.0        39.0           96.0   
34     NaN          3.6            91.3        64.0           62.0   
43     NaN          3.9            76.5        78.0           73.0   
50     NaN          3.0            84.9        78.0           85.0   
52     NaN          3.9            91.1        60.0           68.0

In [52]:
to_drop = df[df.isnull().any(axis=1)]
print('These rows would be dropped using dropna():')

These rows would be dropped using dropna():


In [55]:
df_dropped =df.dropna()
print('Shape before dropna():' ,df.shape)
print('Shape before dropna():' ,df_dropped.shape)

Shape before dropna(): (60, 6)
Shape before dropna(): (45, 6)


In [56]:
df_cleaned=df.dropna()
print('\nAfter dropna():')
print(df_cleaned)


After dropna():
    gender  study_hours  attendance_pct  math_score  reading_score  \
0     male          1.7            83.6        62.0           91.0   
2   female          2.9            97.5        69.0           57.0   
3   female          4.8            85.7        78.0           62.0   
4     male          3.9           -10.0        64.0           95.0   
5     male          1.4            93.7        77.0           72.0   
8   female          2.7            74.7        68.0           71.0   
9   female          4.2            92.4        63.0           66.0   
10    male          4.7            67.9        94.0           73.0   
14    male          3.9            79.0        68.0           91.0   
15    male          4.7            76.5        58.0           78.0   
16    male          2.9            93.2        64.0           68.0   
17  female          3.3            89.7        72.0           65.0   
18  female          2.2           150.0        68.0           85.0   
19 

In [58]:
df

Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
5,male,1.4,93.7,77.0,72.0,39.0
6,male,,87.1,95.0,78.0,43.4
7,femle,,70.6,59.0,70.0,32.8
8,female,2.7,74.7,68.0,71.0,35.3
9,female,4.2,92.4,63.0,66.0,35.0


In [61]:
# STEP 7: Fill missing values
df_filled=df.fillna({
    'gender':'Unknown',
    'study_hours': df['study_hours'].mean()
})

In [62]:
df_filled

Unnamed: 0,gender,study_hours,attendance_pct,math_score,reading_score,final_score
0,male,1.7,83.6,62.0,91.0,38.9
1,Unknown,2.6,91.2,82.0,60.0,37.5
2,female,2.9,97.5,69.0,57.0,35.0
3,female,4.8,85.7,78.0,62.0,36.5
4,male,3.9,-10.0,64.0,95.0,30.9
5,male,1.4,93.7,77.0,72.0,39.0
6,male,3.413462,87.1,95.0,78.0,43.4
7,femle,3.413462,70.6,59.0,70.0,32.8
8,female,2.7,74.7,68.0,71.0,35.3
9,female,4.2,92.4,63.0,66.0,35.0


In [66]:
# STEP8 Compare before and after
# Compare each row before and afer fillna()
for i in range(len(df)):
    print(f"\nRow {i} Before: {df.iloc[i].to_dict()}")
    print(f"Row {i} After: {df_filled.iloc[i].to_dict()}")


Row 0 Before: {'gender': 'male', 'study_hours': 1.7, 'attendance_pct': 83.6, 'math_score': 62.0, 'reading_score': 91.0, 'final_score': 38.9}
Row 0 After: {'gender': 'male', 'study_hours': 1.7, 'attendance_pct': 83.6, 'math_score': 62.0, 'reading_score': 91.0, 'final_score': 38.9}

Row 1 Before: {'gender': nan, 'study_hours': 2.6, 'attendance_pct': 91.2, 'math_score': 82.0, 'reading_score': 60.0, 'final_score': 37.5}
Row 1 After: {'gender': 'Unknown', 'study_hours': 2.6, 'attendance_pct': 91.2, 'math_score': 82.0, 'reading_score': 60.0, 'final_score': 37.5}

Row 2 Before: {'gender': 'female', 'study_hours': 2.9, 'attendance_pct': 97.5, 'math_score': 69.0, 'reading_score': 57.0, 'final_score': 35.0}
Row 2 After: {'gender': 'female', 'study_hours': 2.9, 'attendance_pct': 97.5, 'math_score': 69.0, 'reading_score': 57.0, 'final_score': 35.0}

Row 3 Before: {'gender': 'female', 'study_hours': 4.8, 'attendance_pct': 85.7, 'math_score': 78.0, 'reading_score': 62.0, 'final_score': 36.5}
Row 3 