# **LAB 1- Data Preprocessing**

## Task 1 – Dataset Cleaning 

In [66]:
import pandas as pd
import numpy as np

### Load the Titanic dataset (train.csv)

In [67]:
titanic = pd.read_csv("train.csv")

###  Display the first 10 rows

In [68]:
titanic.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


### Check for missing values in each column

In [69]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


### Fill missing values in the "Age" column with the mean age

In [70]:
age_mean = np.mean(titanic['Age'])
print("Age Mean:",age_mean)
titanic["Age"] = titanic["Age"].fillna(age_mean)

Age Mean: 29.69911764705882


No Null values

In [71]:
titanic.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

### Drop rows where the "Embarked" column is missing

In [72]:
titanic.shape

(891, 12)

In [73]:
titanic.dropna(subset=['Embarked'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.000000,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.000000,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.000000,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.000000,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.000000,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.000000,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.000000,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,29.699118,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.000000,0,0,111369,30.0000,C148,C


## Task 2 – Encoding Categorical Data 

### Convert the "Sex" column into numeric (0 = Male, 1 = Female)

In [74]:
#  sex column before pandas mapping
before_mapping = pd.DataFrame(titanic['Sex']) 

titanic['Sex'] = titanic['Sex'].map({'male': 0, 'female': 1})


#  sex column after pandas mapping
after_mapping =  pd.DataFrame(titanic['Sex']) 

In [75]:
pd.concat([before_mapping, after_mapping], axis=1, keys=['Before ', 'After '])

Unnamed: 0_level_0,Before,After
Unnamed: 0_level_1,Sex,Sex
0,male,0
1,female,1
2,female,1
3,female,1
4,male,0
...,...,...
886,male,0
887,female,1
888,female,1
889,male,0


## Apply One-Hot Encoding on the "Embarked" column

In [78]:
# Apply One-Hot Encoding on the "Embarked" column
embarked_dummies = pd.get_dummies(titanic['Embarked'], prefix='Embarked')
titanic = pd.concat([titanic, embarked_dummies], axis=1)
titanic


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,Embarked_C,Embarked_Q,Embarked_S,Embarked_C.1,Embarked_Q.1,Embarked_S.1,Embarked_C.2,Embarked_Q.2,Embarked_S.2
0,1,0,3,"Braund, Mr. Owen Harris",0,22.000000,1,0,A/5 21171,7.2500,...,S,False,False,True,False,False,True,False,False,True
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.000000,1,0,PC 17599,71.2833,...,C,True,False,False,True,False,False,True,False,False
2,3,1,3,"Heikkinen, Miss. Laina",1,26.000000,0,0,STON/O2. 3101282,7.9250,...,S,False,False,True,False,False,True,False,False,True
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.000000,1,0,113803,53.1000,...,S,False,False,True,False,False,True,False,False,True
4,5,0,3,"Allen, Mr. William Henry",0,35.000000,0,0,373450,8.0500,...,S,False,False,True,False,False,True,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",0,27.000000,0,0,211536,13.0000,...,S,False,False,True,False,False,True,False,False,True
887,888,1,1,"Graham, Miss. Margaret Edith",1,19.000000,0,0,112053,30.0000,...,S,False,False,True,False,False,True,False,False,True
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,29.699118,1,2,W./C. 6607,23.4500,...,S,False,False,True,False,False,True,False,False,True
889,890,1,1,"Behr, Mr. Karl Howell",0,26.000000,0,0,111369,30.0000,...,C,True,False,False,True,False,False,True,False,False


# Task 3 –Feature Scaling & Splitting

## Select features: Age, Fare, Sex, Pclass.

In [80]:
selected_features = titanic[['Age','Fare','Sex','Pclass']]
selected_features

Unnamed: 0,Age,Fare,Sex,Pclass
0,22.000000,7.2500,0,3
1,38.000000,71.2833,1,1
2,26.000000,7.9250,1,3
3,35.000000,53.1000,1,1
4,35.000000,8.0500,0,3
...,...,...,...,...
886,27.000000,13.0000,0,2
887,19.000000,30.0000,1,1
888,29.699118,23.4500,1,3
889,26.000000,30.0000,0,1


## Apply StandardScaler to normalize them

In [81]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled_features = scaler.fit_transform(selected_features)
scaled_features

array([[-0.5924806 , -0.50244517, -0.73769513,  0.82737724],
       [ 0.63878901,  0.78684529,  1.35557354, -1.56610693],
       [-0.2846632 , -0.48885426,  1.35557354,  0.82737724],
       ...,
       [ 0.        , -0.17626324,  1.35557354,  0.82737724],
       [-0.2846632 , -0.04438104, -0.73769513, -1.56610693],
       [ 0.17706291, -0.49237783, -0.73769513,  0.82737724]],
      shape=(891, 4))

# Split data into 80% training and 20% testing.

In [82]:
from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(scaled_features, test_size=0.2, random_state=42)
X_train.shape, X_test.shape


((712, 4), (179, 4))