<a href="https://colab.research.google.com/github/m18158923-ux/AI-Assisted-Data-Cleaning-Lab/blob/main/Data_Cleaning_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import pandas as pd

# Use the full path to the file
df = pd.read_csv('/tested.csv')

# Step 2: Discovery (The Initial Scan)
print("Dataset Summary:")
df.info()

print("\nMissing Values Per Column:")
print(df.isnull().sum())

Dataset Summary:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB

Missing Values Per Column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin    

In [16]:
# 1. Fill 'Age' with the median (middle value)
df['Age'] = df['Age'].fillna(df['Age'].median())

# 2. Fill the single missing 'Fare' with the median
df['Fare'] = df['Fare'].fillna(df['Fare'].median())

# 3. For 'Cabin', since so many are missing, we'll fill them with 'Unknown'
df['Cabin'] = df['Cabin'].fillna('Unknown')

# Verify that the holes are filled
print("Missing values after fixing:")
print(df.isnull().sum())


Missing values after fixing:
PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Cabin          0
Embarked       0
dtype: int64


In [17]:
# Convert 'Sex' into numbers (0 and 1)
# Convert 'Embarked' (the port where they got on) into numbers
df_encoded = pd.get_dummies(df, columns=['Sex', 'Embarked'])

# Show the new columns created
print("New columns after One-Hot Encoding:")
print(df_encoded.columns)

# Look at the first few rows to see the change
df_encoded.head()

New columns after One-Hot Encoding:
Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Sex_female', 'Sex_male', 'Embarked_C',
       'Embarked_Q', 'Embarked_S'],
      dtype='object')


Unnamed: 0,PassengerId,Survived,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,892,0,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,Unknown,False,True,False,True,False
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,Unknown,True,False,False,False,True
2,894,0,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,Unknown,False,True,False,True,False
3,895,0,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,Unknown,False,True,False,False,True
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,Unknown,True,False,False,False,True


In [18]:
from sklearn.preprocessing import StandardScaler

# We pick columns where the numbers are on different scales
cols_to_scale = ['Age', 'Fare', 'Pclass']

scaler = StandardScaler()

# This transforms the numbers so they all have a mean of 0
df_encoded[cols_to_scale] = scaler.fit_transform(df_encoded[cols_to_scale])

print("Standardization complete!")
df_encoded[cols_to_scale].head()


Standardization complete!


Unnamed: 0,Age,Fare,Pclass
0,0.386231,-0.497413,0.873482
1,1.37137,-0.512278,0.873482
2,2.553537,-0.4641,-0.315819
3,-0.204852,-0.482475,0.873482
4,-0.598908,-0.417492,0.873482


In [19]:
# Check for any remaining missing values one last time
print("Final Missing Value Count:", df_encoded.isnull().sum().sum())

# Save your cleaned dataset to a new CSV file
df_encoded.to_csv('Cleaned_Titanic_Data.csv', index=False)
print("File 'Cleaned_Titanic_Data.csv' is ready for download!")


Final Missing Value Count: 0
File 'Cleaned_Titanic_Data.csv' is ready for download!
