In [40]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, StandardScaler

**Exercise 1**

In [20]:
df = pd.read_csv('Titanic-Dataset.csv')
print(df.head())
print(df.tail())
df.info()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  
  

In [22]:
print(df.isnull().sum())

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


**Exercise 2**

In [36]:
# df = df.drop(['Cabin'], axis=1)
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
print(df.isnull().sum())

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)


**Exercise 3.  Data Transformation**

In [41]:
numerical_columns = ['Age', 'Fare']

scaler = MinMaxScaler()
df_minmax_scaled = df.copy()
df_minmax_scaled[numerical_columns] = scaler.fit_transform(df[numerical_columns])

standardizer = StandardScaler()
df_standardized = df.copy()
df_standardized[numerical_columns] = standardizer.fit_transform(df[numerical_columns])

print(df_minmax_scaled[numerical_columns].head())
print(df_standardized[numerical_columns].head())

        Age      Fare
0  0.271174  0.014151
1  0.472229  0.139136
2  0.321438  0.015469
3  0.434531  0.103644
4  0.434531  0.015713
        Age      Fare
0 -0.565736 -0.502445
1  0.663861  0.786845
2 -0.258337 -0.488854
3  0.433312  0.420730
4  0.433312 -0.486337


In [50]:
df_onehot = pd.get_dummies(df, columns=['Sex', 'Embarked'], drop_first=True)

print(df_onehot[['Sex_male', 'Embarked_Q', 'Embarked_S']].head())

   Sex_male  Embarked_Q  Embarked_S
0      True       False        True
1     False       False       False
2     False       False        True
3     False       False        True
4      True       False        True


In [60]:
df_binned = df.copy()
df_binned['Age_binned'] = pd.cut(df['Age'], bins=4, labels=['Young', 'Middle-Aged', 'Senior', 'Elderly'])

print(df_binned[['Age', 'Age_binned']])

      Age   Age_binned
0    22.0  Middle-Aged
1    38.0  Middle-Aged
2    26.0  Middle-Aged
3    35.0  Middle-Aged
4    35.0  Middle-Aged
..    ...          ...
886  27.0  Middle-Aged
887  19.0        Young
888  28.0  Middle-Aged
889  26.0  Middle-Aged
890  32.0  Middle-Aged

[891 rows x 2 columns]


**Exercise 4.  Feature Engineering**

In [64]:
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df['FarePerPerson'] = df['Fare'] / df['FamilySize']

print(df[['SibSp', 'Parch', 'FamilySize', 'Fare', 'FarePerPerson']].head())
df['IsAlone'] = (df['FamilySize'] == 1).astype(int)  # 1 if alone, 0 otherwise

print(df[['FamilySize', 'IsAlone']].head())

   SibSp  Parch  FamilySize     Fare  FarePerPerson
0      1      0           2   7.2500        3.62500
1      1      0           2  71.2833       35.64165
2      0      0           1   7.9250        7.92500
3      1      0           2  53.1000       26.55000
4      0      0           1   8.0500        8.05000
   FamilySize  IsAlone
0           2        0
1           2        0
2           1        1
3           2        0
4           1        1


**Exercise 5. Data Cleaning**

In [77]:
duplicates = df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
df_cleaned = df.drop_duplicates()

Number of duplicate rows: 0


In [90]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,891.0,29.361582,13.019697,0.42,22.0,28.0,35.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292
FamilySize,891.0,1.904602,1.613459,1.0,1.0,1.0,2.0,11.0
FarePerPerson,891.0,19.916375,35.841257,0.0,7.25,8.3,23.666667,512.3292
IsAlone,891.0,0.602694,0.489615,0.0,0.0,1.0,1.0,1.0


In [79]:
from scipy import stats
import numpy as np

z_scores = np.abs(stats.zscore(df_cleaned['Fare']))

threshold = 3
outliers_z = np.where(z_scores > threshold)

df_no_outliers_z = df_cleaned.drop(outliers_z[0], axis=0)

print(f"Data shape after removing outliers (Z-score): {df_no_outliers_z.shape}")

Data shape after removing outliers (Z-score): (871, 14)


In [83]:
df_no_outliers_z


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,FamilySize,FarePerPerson,IsAlone
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,S,2,3.62500,0
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C,2,35.64165,0
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,S,1,7.92500,1
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,S,2,26.55000,0
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,S,1,8.05000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,S,1,13.00000,1
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,S,1,30.00000,1
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,28.0,1,2,W./C. 6607,23.4500,S,4,5.86250,0
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C,1,30.00000,1


**Exercise 6**

In [86]:
from sklearn.model_selection import train_test_split

X = df_cleaned.drop('Survived', axis=1)
y = df_cleaned['Survived']

scaler = MinMaxScaler()
X = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")

Training set size: (712, 13)
Testing set size: (179, 13)
