## Import necessary libraries

In [62]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder


## Load and explore the data

In [9]:
test_data = pd.read_csv('../data/test.csv')

In [10]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [11]:
test_data.shape

(418, 11)

In [12]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Pclass       418 non-null    int64  
 2   Name         418 non-null    object 
 3   Sex          418 non-null    object 
 4   Age          332 non-null    float64
 5   SibSp        418 non-null    int64  
 6   Parch        418 non-null    int64  
 7   Ticket       418 non-null    object 
 8   Fare         417 non-null    float64
 9   Cabin        91 non-null     object 
 10  Embarked     418 non-null    object 
dtypes: float64(2), int64(4), object(5)
memory usage: 36.1+ KB


In [13]:
test_data.isnull().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

## Data preprocessing

### Handling missing values

In [16]:
# Convert 'Age' to numeric, forcing invalid parsing to NaN
test_data['Age'] = pd.to_numeric(test_data['Age'], errors='coerce')

# Fill missing ages with the median age
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].median())

In [17]:
# drop the `Cabin` column because of it's too many missing values
test_data.drop(columns='Cabin', axis=1, inplace=True)

In [18]:
# handling the missing values in `Fare` column with the mean value of the corresponding `Pclass` of the passengers
mean_fare_by_class = test_data.groupby('Pclass')['Fare'].mean()

print(mean_fare_by_class)

test_data.loc[test_data['Fare'].isnull(), 'Fare'] = test_data.loc[test_data['Fare'].isnull(), 'Pclass'].map(mean_fare_by_class)

Pclass
1    94.280297
2    22.202104
3    12.459678
Name: Fare, dtype: float64


In [19]:
test_data.isnull().sum()

PassengerId    0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [20]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,S


### Create new features

**a. Extract the title from the Name**

In [23]:
test_data['Title'] = test_data['Name'].apply(lambda name: name.split(',')[1].split('.')[0].strip())

In [24]:
test_data['Title'].value_counts()

Title
Mr        240
Miss       78
Mrs        72
Master     21
Col         2
Rev         2
Ms          1
Dr          1
Dona        1
Name: count, dtype: int64

In [25]:
title_mapping = {
    'Mlle': 'Miss', 'Ms': 'Miss', 'Mme': 'Mrs',
    'Lady': 'Royalty', 'Countess': 'Royalty', 'Sir': 'Royalty', 'Jonkheer': 'Royalty', 'Don': 'Royalty',
    'Dona': 'Royalty', 'Capt': 'Officer', 'Col': 'Officer', 'Major': 'Officer', 'Dr': 'Officer', 'Rev': 'Officer'
}

test_data['Title'] = test_data['Title'].replace(title_mapping)

In [26]:
test_data['Title'].value_counts()

Title
Mr         240
Miss        79
Mrs         72
Master      21
Officer      5
Royalty      1
Name: count, dtype: int64

**b. Create Family Size and isAlone feature**

In [28]:
# Create FamilySize feature: total number of family members (siblings/spouses + parents/children + self)
test_data['FamilySize'] = test_data['SibSp'] + test_data['Parch'] + 1

# Create IsAlone feature: flag to indicate if the passenger is traveling alone (1 = Alone, 0 = Not Alone)
test_data['IsAlone'] = 1  # Initialize to 1 (indicating alone)
test_data['IsAlone'].loc[test_data['FamilySize'] > 1] = 0  # Set to 0 if FamilySize > 1 (not alone)

# Check the new features in the dataset
test_data[['SibSp', 'Parch', 'FamilySize', 'IsAlone']].head()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  test_data['IsAlone'].loc[test_data['FamilySize'] > 1] = 0  # Set to 0 if FamilySize > 1 (not alone)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.h

Unnamed: 0,SibSp,Parch,FamilySize,IsAlone
0,0,0,1,1
1,1,0,2,0
2,0,0,1,1
3,0,0,1,1
4,1,1,3,0


**c. Extract Ticket Prefix**

In [30]:
# # Extract Ticket Prefix (if any): split the 'Ticket' column and get the prefix (part before space), else 'None'
# test_data['TicketPrefix'] = test_data['Ticket'].apply(lambda x: x.split()[0] if len(x.split()) > 1 else 'None')

# # Check the counts of each TicketPrefix
# # test_data['TicketPrefix'].value_counts()

**d. Fare Binning**

In [32]:
# Create FareBin by grouping Fare into quartiles: categorizes 'Fare' into 4 quartiles
test_data['FareBin'] = pd.qcut(test_data['Fare'], 4, labels=['Low', 'Mid', 'High', 'VeryHigh'])

# Check the distribution of Fare and FareBin
test_data[['Fare', 'FareBin']].head()

Unnamed: 0,Fare,FareBin
0,7.8292,Low
1,7.0,Low
2,9.6875,Mid
3,8.6625,Mid
4,12.2875,Mid


**e. Age Binning**

In [34]:
# Create AgeBin by grouping Age into specific bins: categorizes 'Age' into 5 age groups
test_data['AgeBin'] = pd.cut(test_data['Age'], bins=[0, 12, 18, 35, 60, 120], 
                             labels=['Child', 'Teenager', 'Adult', 'Middle Age', 'Senior'])

# Check the distribution of Age and AgeBin
test_data[['Age', 'AgeBin']].head()

Unnamed: 0,Age,AgeBin
0,34.5,Adult
1,47.0,Middle Age
2,62.0,Senior
3,27.0,Adult
4,22.0,Adult


### Encoding categorical data

**a. One-hot encoding categorical variables**

In [37]:
# One-Hot Encode categorical variables like 'Sex', 'Embarked', 'Title', and 'TicketPrefix'
categorical_features = ['Sex', 'Embarked', 'Title', 'TicketPrefix']

# Apply one-hot encoding
test_data = pd.get_dummies(test_data, columns=categorical_features, drop_first=True)

# Check the updated dataset
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,FamilySize,IsAlone,...,TicketPrefix_SC/PARIS,TicketPrefix_SC/Paris,TicketPrefix_SOTON/O.Q.,TicketPrefix_SOTON/O2,TicketPrefix_SOTON/OQ,TicketPrefix_STON/O,TicketPrefix_STON/O2.,TicketPrefix_STON/OQ.,TicketPrefix_W./C.,TicketPrefix_W.E.P.
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,1,1,...,False,False,False,False,False,False,False,False,False,False
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,2,0,...,False,False,False,False,False,False,False,False,False,False
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,1,1,...,False,False,False,False,False,False,False,False,False,False
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,1,1,...,False,False,False,False,False,False,False,False,False,False
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,3,0,...,False,False,False,False,False,False,False,False,False,False


**b. Label encoding for ordinal variables**

In [64]:
# Label encode AgeBin and FareBin 
label_encoder = LabelEncoder()

# Encoding 'AgeBin' and 'FareBin'
test_data['AgeBin_Label'] = label_encoder.fit_transform(test_data['AgeBin'])
test_data['FareBin_Label'] = label_encoder.fit_transform(test_data['FareBin'])

# Drop the original AgeBin and FareBin (optional)
test_data.drop(['AgeBin', 'FareBin'], axis=1, inplace=True)

# Check the encoded labels
test_data[['AgeBin_Label', 'FareBin_Label']].head()

Unnamed: 0,AgeBin_Label,FareBin_Label
0,0,1
1,2,1
2,3,2
3,0,2
4,0,2


In [68]:
# Save the cleaned and encoded DataFrame to a CSV file
test_data.to_csv('../data/test_data_cleaned_encoded.csv', index=False)

In [71]:
test_data.head()

Unnamed: 0,PassengerId,Pclass,Name,Age,SibSp,Parch,Ticket,Fare,FamilySize,IsAlone,...,TicketPrefix_SOTON/O.Q.,TicketPrefix_SOTON/O2,TicketPrefix_SOTON/OQ,TicketPrefix_STON/O,TicketPrefix_STON/O2.,TicketPrefix_STON/OQ.,TicketPrefix_W./C.,TicketPrefix_W.E.P.,AgeBin_Label,FareBin_Label
0,892,3,"Kelly, Mr. James",34.5,0,0,330911,7.8292,1,1,...,False,False,False,False,False,False,False,False,0,1
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,2,0,...,False,False,False,False,False,False,False,False,2,1
2,894,2,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,1,1,...,False,False,False,False,False,False,False,False,3,2
3,895,3,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,1,1,...,False,False,False,False,False,False,False,False,0,2
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,3,0,...,False,False,False,False,False,False,False,False,0,2


In [73]:
test_data.columns

Index(['PassengerId', 'Pclass', 'Name', 'Age', 'SibSp', 'Parch', 'Ticket',
       'Fare', 'FamilySize', 'IsAlone', 'Sex_male', 'Embarked_Q', 'Embarked_S',
       'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty',
       'TicketPrefix_A./5.', 'TicketPrefix_A.5.', 'TicketPrefix_A/4',
       'TicketPrefix_A/5', 'TicketPrefix_A/5.', 'TicketPrefix_AQ/3.',
       'TicketPrefix_AQ/4', 'TicketPrefix_C', 'TicketPrefix_C.A.',
       'TicketPrefix_CA', 'TicketPrefix_CA.', 'TicketPrefix_F.C.',
       'TicketPrefix_F.C.C.', 'TicketPrefix_LP', 'TicketPrefix_None',
       'TicketPrefix_PC', 'TicketPrefix_PP', 'TicketPrefix_S.C./PARIS',
       'TicketPrefix_S.O./P.P.', 'TicketPrefix_S.O.C.', 'TicketPrefix_SC',
       'TicketPrefix_SC/A.3', 'TicketPrefix_SC/A4', 'TicketPrefix_SC/AH',
       'TicketPrefix_SC/PARIS', 'TicketPrefix_SC/Paris',
       'TicketPrefix_SOTON/O.Q.', 'TicketPrefix_SOTON/O2',
       'TicketPrefix_SOTON/OQ', 'TicketPrefix_STON/O', 'TicketPrefix_STON/O2.',
    

In [75]:
test_data.isnull().sum()

PassengerId                0
Pclass                     0
Name                       0
Age                        0
SibSp                      0
Parch                      0
Ticket                     0
Fare                       0
FamilySize                 0
IsAlone                    0
Sex_male                   0
Embarked_Q                 0
Embarked_S                 0
Title_Miss                 0
Title_Mr                   0
Title_Mrs                  0
Title_Officer              0
Title_Royalty              0
TicketPrefix_A./5.         0
TicketPrefix_A.5.          0
TicketPrefix_A/4           0
TicketPrefix_A/5           0
TicketPrefix_A/5.          0
TicketPrefix_AQ/3.         0
TicketPrefix_AQ/4          0
TicketPrefix_C             0
TicketPrefix_C.A.          0
TicketPrefix_CA            0
TicketPrefix_CA.           0
TicketPrefix_F.C.          0
TicketPrefix_F.C.C.        0
TicketPrefix_LP            0
TicketPrefix_None          0
TicketPrefix_PC            0
TicketPrefix_P

In [77]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 54 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   PassengerId              418 non-null    int64  
 1   Pclass                   418 non-null    int64  
 2   Name                     418 non-null    object 
 3   Age                      418 non-null    float64
 4   SibSp                    418 non-null    int64  
 5   Parch                    418 non-null    int64  
 6   Ticket                   418 non-null    object 
 7   Fare                     418 non-null    float64
 8   FamilySize               418 non-null    int64  
 9   IsAlone                  418 non-null    int64  
 10  Sex_male                 418 non-null    bool   
 11  Embarked_Q               418 non-null    bool   
 12  Embarked_S               418 non-null    bool   
 13  Title_Miss               418 non-null    bool   
 14  Title_Mr                 4