In [1]:
import pandas as pd
import scipy as stats

In [2]:
df = pd.read_csv('/content/drive/MyDrive/ML/data preprocessing/titanic-train.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
df.shape

(891, 12)

In [4]:
df.dtypes

Unnamed: 0,0
PassengerId,int64
Survived,int64
Pclass,int64
Name,object
Gender,object
Age,float64
SibSp,int64
Parch,int64
Ticket,object
Fare,float64


## Remove Duplicates Rows

In [5]:
df_cleaned = df.drop_duplicates()

df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Handling Missing Values

In [6]:
print(df_cleaned.isnull().sum()) # show missing values

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [7]:
df_cleaned['Age'] = df_cleaned['Age'].fillna(df_cleaned['Age'].median()) # fill missing value of

# show missing values again
print(df_cleaned.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Gender           0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


## Remove Column

In [8]:
df_cleaned = df_cleaned.drop(['Cabin'], axis= 1)

df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [9]:
df_cleaned = df_cleaned.drop(['Embarked'], axis=1)

df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Gender,Age,SibSp,Parch,Ticket,Fare
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05


In [10]:
print(df_cleaned.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Gender         0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
dtype: int64


# Normalization:
Rescales the data to a fixed range, usually [0, 1].

##📌 When to Use:
When your data has different scales (like age in years vs. income in dollars).

When using distance-based algorithms like KNN or K-Means that depend on how close points are.

-----------------------------------
# Scaling (Standardization):
Transforms data to have zero mean and unit variance (standard normal distribution).

##📌 When to Use:
When the algorithm assumes normally distributed data (like Logistic Regression or SVM).

When features have very different units (e.g., temperature vs. salary).



In [14]:
# Nomarlization

from sklearn.preprocessing import MinMaxScaler

#Initialize the scaler
Scaler = MinMaxScaler()

# Apply Min-Max Scaling to 'Age'
df_cleaned['Age'] = Scaler.fit_transform(df_cleaned[['Age']])

df_cleaned['Age'].head()

Unnamed: 0,Age
0,0.271174
1,0.472229
2,0.321438
3,0.434531
4,0.434531


In [15]:
# Scaling

from sklearn.preprocessing import StandardScaler

Scaler  = StandardScaler()

df_cleaned['Age'] = Scaler.fit_transform(df_cleaned[['Age']])

df_cleaned['Age'].head()

Unnamed: 0,Age
0,-0.565736
1,0.663861
2,-0.258337
3,0.433312
4,0.433312


# Label Encoding

In [20]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

df_cleaned['Gender'] = label_encoder.fit_transform(df_cleaned['Gender'])

df_cleaned['Gender'].head()

Unnamed: 0,Gender
0,1
1,0
2,0
3,0
4,1


# One Hot Encoding

In [22]:
# apply one hot encoding on all categorical columns
df_cleaned = pd.get_dummies(df_cleaned, drop_first=True)

df_cleaned.head()

Unnamed: 0,PassengerId,Survived,Pclass,Gender,Age,SibSp,Parch,Fare,Gender_Encoded,"Name_Abbott, Mr. Rossmore Edward",...,Ticket_STON/O2. 3101290,Ticket_SW/PP 751,Ticket_W./C. 14258,Ticket_W./C. 14263,Ticket_W./C. 6607,Ticket_W./C. 6608,Ticket_W./C. 6609,Ticket_W.E.P. 5734,Ticket_W/C 14208,Ticket_WE/P 5735
0,1,0,3,1,-0.565736,1,0,7.25,1,False,...,False,False,False,False,False,False,False,False,False,False
1,2,1,1,0,0.663861,1,0,71.2833,0,False,...,False,False,False,False,False,False,False,False,False,False
2,3,1,3,0,-0.258337,0,0,7.925,0,False,...,False,False,False,False,False,False,False,False,False,False
3,4,1,1,0,0.433312,1,0,53.1,0,False,...,False,False,False,False,False,False,False,False,False,False
4,5,0,3,1,0.433312,0,0,8.05,1,False,...,False,False,False,False,False,False,False,False,False,False
