In [1]:
import pandas as pd
import numpy as np


In [2]:
df = pd.read_csv("titanic.csv")


In [3]:
df.head()


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [4]:
df.info()



<class 'pandas.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    str    
 4   Sex          418 non-null    str    
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    str    
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     str    
 11  Embarked     418 non-null    str    
dtypes: float64(2), int64(5), str(5)
memory usage: 39.3 KB


In [5]:
df.isnull().sum()


PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [6]:
df.drop(columns=["Cabin"], inplace=True)


In [7]:
df["Age"].fillna(df["Age"].median(), inplace=True)


C:\Users\0000\AppData\Local\Temp\ipykernel_43836\1567474574.py:1: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df["Age"].fillna(df["Age"].median(), inplace=True)


0      34.5
1      47.0
2      62.0
3      27.0
4      22.0
       ... 
413    27.0
414    39.0
415    38.5
416    27.0
417    27.0
Name: Age, Length: 418, dtype: float64

In [8]:
df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)


C:\Users\0000\AppData\Local\Temp\ipykernel_43836\585847971.py:1: ChainedAssignmentError: A value is being set on a copy of a DataFrame or Series through chained assignment using an inplace method.
Such inplace method never works to update the original DataFrame or Series, because the intermediate object on which we are setting values always behaves as a copy (due to Copy-on-Write).

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' instead, to perform the operation inplace on the original object, or try to avoid an inplace operation using 'df[col] = df[col].method(value)'.

See the documentation for a more detailed explanation: https://pandas.pydata.org/pandas-docs/stable/user_guide/copy_on_write.html
  df["Embarked"].fillna(df["Embarked"].mode()[0], inplace=True)


0      Q
1      S
2      Q
3      S
4      S
      ..
413    S
414    C
415    S
416    S
417    C
Name: Embarked, Length: 418, dtype: str

In [9]:
before = df.shape[0]
df.drop_duplicates(inplace=True)
after = df.shape[0]

print("Rows before:", before)
print("Rows after:", after)


Rows before: 418
Rows after: 418


In [10]:
df["Sex"] = df["Sex"].map({"male": 0, "female": 1})


In [11]:
df["Fare"] = df["Fare"].astype(float)


In [12]:
def age_group(age):
    if age < 18:
        return "Child"
    elif age < 60:
        return "Adult"
    else:
        return "Senior"

df["Age_Group"] = df["Age"].apply(age_group)


In [13]:
df["Fare_Band"] = pd.qcut(df["Fare"], q=4, labels=["Low", "Medium", "High", "Very High"])


In [14]:
df.to_csv("cleaned_data.csv", index=False)


In [15]:
import os
os.getcwd()


'F:\\jupyter notebook\\task 5'

In [16]:
os.listdir()


['.ipynb_checkpoints', 'cleaned_data.csv', 'titanic.csv', 'Untitled.ipynb']

In [None]:
Data Cleaning Notes

1.Loaded Titanic dataset using pandas read_csv.
2.Inspected structure using head() and info().
3.Checked missing values using isnull().sum().
4.Dropped Cabin due to excessive missing data.
5.Filled Age with median to handle outliers.
6.Filled Embarked with mode as categorical data.
7.Removed duplicate rows for consistency.
8.Converted Sex column to numeric.
9.Created Age_Group feature.
10.Created Fare_Band using quartiles