In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the passenger table from 'test.csv' (resolved against the current
# working directory) and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='test.csv')
df.head(n=5)


Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [3]:
# Per-column count of missing cells, taken before any cleaning step runs.
# A single print with a newline separator emits the header and the counts
# on consecutive lines.
print("Missing values before cleaning:", df.isna().sum(), sep="\n")

Missing values before cleaning:
PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64


In [4]:
# Comparison only: discard every row that has a missing value in any column
# and report how many rows survive. `df` itself is not modified here.
df_clean = df.dropna(axis='index', how='any')
print(f"Rows after dropping ALL missing values: {len(df_clean)}")

Rows after dropping ALL missing values: 87


In [5]:
# Impute missing 'Age' with the column median (more robust than the mean when
# the distribution is skewed).
# The filled column is assigned back instead of calling
# `df['Age'].fillna(..., inplace=True)`: that form operates on an intermediate
# Series, triggers a FutureWarning in pandas 2.x, and leaves `df` unchanged
# once Copy-on-Write is enabled.
df['Age'] = df['Age'].fillna(df['Age'].median())

# Impute missing 'Fare' with the column median, same pattern as above.
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# 'Cabin' has too many missing values -> drop the column.
# errors='ignore' keeps this cell re-runnable: a second execution no longer
# raises KeyError because 'Cabin' is already gone.
df = df.drop(columns='Cabin', errors='ignore')

print("Missing values after cleaning:")
print(df.isnull().sum())

Missing values after cleaning:
PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


In [6]:
# Rows whose PassengerId has already appeared earlier in the frame; the first
# occurrence of each id is not counted as a duplicate.
duplicates = df.loc[df['PassengerId'].duplicated(keep='first')]
print(f"Duplicate Passenger IDs: {len(duplicates)}")

Duplicate Passenger IDs: 0


In [7]:
# Rows whose 'Sex' value is anything other than the two expected labels
# (missing values are also selected, since they match neither label).
invalid_sex = df.loc[
    ~df['Sex'].isin(['male', 'female'])
]
print(f"Invalid 'Sex' entries: {len(invalid_sex)}")

Invalid 'Sex' entries: 0


In [8]:
# Rows whose 'Pclass' is not one of the three accepted class codes.
invalid_pclass = df.loc[
    ~df['Pclass'].isin([1, 2, 3])
]
print(f"Invalid 'Pclass' entries: {len(invalid_pclass)}")

Invalid 'Pclass' entries: 0


In [9]:
# Range check: select rows with a negative 'Age' or a negative 'Fare'.
# Missing values compare as False and are therefore never selected here.
invalid_age = df.loc[df['Age'].lt(0)]
invalid_fare = df.loc[df['Fare'].lt(0)]
print(
    f"Invalid 'Age' entries: {len(invalid_age)}",
    f"Invalid 'Fare' entries: {len(invalid_fare)}",
    sep="\n",
)

Invalid 'Age' entries: 0
Invalid 'Fare' entries: 0
