# Loading Necessary Libraries

In [1]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, KBinsDiscretizer

# Creating Helper Functions

## drop column method

In [2]:
def drop_unused_columns(df, columns):
    """Remove the given columns from ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify; it is mutated directly.
    columns : list
        Column labels to drop.

    Returns
    -------
    None
        The drop happens in place, so there is nothing to return. Do not
        assign the result back to ``df``.
    """
    df.drop(columns=columns, inplace=True)
    return None

## Encoding Methods

#### LabelEncoding Method

In [3]:
def labelencoder(df, columns):
    """Replace each listed column with integer labels from a LabelEncoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten in place.
    columns : iterable
        Labels of the columns to encode.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns encoded.
    """
    encoder = LabelEncoder()
    for name in columns:
        # fit_transform is called per column, so the encoder is fitted anew each time
        df[name] = encoder.fit_transform(df[name])
    return df

#### One-Hot Encoder

In [4]:
def one_hot(df, columns):
    """One-hot encode the listed columns with ``pd.get_dummies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Source frame; it is not modified.
    columns : list
        Labels of the columns to expand into indicator columns.

    Returns
    -------
    pandas.DataFrame
        A new frame in which each listed column is replaced by its
        indicator columns.
    """
    encoded = pd.get_dummies(data=df, columns=columns)
    return encoded

## Discretization Method

In [5]:
def k_bins_discretizer(df, columns, n_bins, encode, strategy):
    """Bin each listed column with a KBinsDiscretizer and cast it to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to discretize.
    columns : iterable
        Labels of the columns to bin, one at a time.
    n_bins, encode, strategy
        Passed straight through to ``KBinsDiscretizer``.

    Returns
    -------
    pandas.DataFrame
        Frame with the listed columns replaced by integer values.
    """
    # NOTE(review): the result is written into a single column, which
    # presumably only works with encode='ordinal' -- confirm with callers.
    binner = KBinsDiscretizer(n_bins=n_bins, encode=encode, strategy=strategy)
    for name in columns:
        # The transformer is given a one-column frame (df[[name]])
        df[name] = binner.fit_transform(df[[name]])
        # astype returns a new frame, so `df` is rebound on every pass
        df = df.astype({name: 'int'})
    return df

# Normalization Method

In [49]:
def min_max_scaler(df, columns):
    """Rescale the listed columns with scikit-learn's ``MinMaxScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten in place.
    columns : iterable
        Labels of the numeric columns to rescale; every other column is
        left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the listed columns rescaled.
    """
    scaler = MinMaxScaler()
    for col in columns:
        # scikit-learn transformers expect 2-D input, so pass a one-column
        # frame (df[[col]]) and flatten the (n, 1) result back to 1-D.
        df[col] = scaler.fit_transform(df[[col]]).ravel()
    # Column labels are deliberately left as they are: assigning `columns`
    # to df.columns would rename (or fail on) the columns not being scaled.
    return df

# Exploring & Cleaning Columns

In [7]:
# Loading dataset
DATA_URL = 'https://raw.githubusercontent.com/PeterLOVANAS/Titanic-machine-learning-project/main/datasets/Titanic_dataset_com.csv'
df = pd.read_csv(DATA_URL)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1310 entries, 0 to 1309
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   float64
 1   pclass       1309 non-null   float64
 2   survived     1309 non-null   float64
 3   name         1309 non-null   object 
 4   sex          1309 non-null   object 
 5   age          1046 non-null   float64
 6   sibsp        1309 non-null   float64
 7   parch        1309 non-null   float64
 8   ticket       1309 non-null   object 
 9   fare         1308 non-null   float64
 10  cabin        295 non-null    object 
 11  embarked     1307 non-null   object 
 12  boat         486 non-null    object 
 13  body         121 non-null    float64
 14  home.dest    745 non-null    object 
dtypes: float64(8), object(7)
memory usage: 153.6+ KB


# Drop rows where every value is NaN

In [9]:
# Remove rows in which every column is missing
df = df.dropna(how='all')

# age column

In [10]:
# Median age for every (sex, pclass) group, with the group keys as ordinary columns
df_age_median = (
    df.groupby(['sex', 'pclass'])['age']
    .median()
    .reset_index()
)
df_age_median

Unnamed: 0,sex,pclass,age
0,female,1.0,36.0
1,female,2.0,28.0
2,female,3.0,22.0
3,male,1.0,42.0
4,male,2.0,29.5
5,male,3.0,25.0


In [11]:
def impute_age(row, medians=None):
    """Look up the median age for a passenger's (sex, pclass) group.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide the keys 'sex' and 'pclass'.
    medians : pandas.DataFrame, optional
        Lookup table with 'sex', 'pclass' and 'age' columns. When omitted,
        the notebook-level ``df_age_median`` table is used.

    Returns
    -------
    The 'age' value of the first row of ``medians`` matching the
    passenger's sex and pclass, or None when no row matches.
    """
    if medians is None:
        # Fall back to the table built earlier in the notebook
        medians = df_age_median
    condition = (
        (medians['sex'] == row['sex']) &
        (medians['pclass'] == row['pclass'])
    )
    # No group with this sex/pclass combination: nothing to impute with
    if not condition.any():
        return None
    # .loc selects rows and column in one step (no chained indexing)
    return medians.loc[condition, 'age'].values[0]

# Keep known ages; fall back to impute_age only for rows whose 'age' is missing
def _age_or_imputed(row):
    return row['age'] if not pd.isnull(row['age']) else impute_age(row)

df['age'] = df.apply(_age_or_imputed, axis=1)

# sex column

In [12]:
# Fit the encoder on the text labels, then replace them with integer codes
le = LabelEncoder().fit(df['sex'])
df['sex'] = le.transform(df['sex'])

In [13]:
df.head()

Unnamed: 0,PassengerId,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,1.0,"Allen, Miss. Elisabeth Walton",0,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,2.0,1.0,1.0,"Allison, Master. Hudson Trevor",1,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,3.0,1.0,0.0,"Allison, Miss. Helen Loraine",0,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,4.0,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,5.0,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# cabin column

In [14]:
# Fill missing "Cabin" values with the most frequent cabin of the same passenger class
def _fill_with_class_mode(cabins):
    return cabins.fillna(cabins.mode()[0])

class_cabin_mapping = df.groupby('pclass')['cabin'].transform(_fill_with_class_mode)
# Write the imputed values back into the "Cabin" column
df['cabin'] = class_cabin_mapping
df.head()

Unnamed: 0,PassengerId,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,1.0,"Allen, Miss. Elisabeth Walton",0,29.0,0.0,0.0,24160,211.3375,B5,S,2.0,,"St Louis, MO"
1,2.0,1.0,1.0,"Allison, Master. Hudson Trevor",1,0.9167,1.0,2.0,113781,151.55,C22 C26,S,11.0,,"Montreal, PQ / Chesterville, ON"
2,3.0,1.0,0.0,"Allison, Miss. Helen Loraine",0,2.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"
3,4.0,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1.0,2.0,113781,151.55,C22 C26,S,,135.0,"Montreal, PQ / Chesterville, ON"
4,5.0,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1.0,2.0,113781,151.55,C22 C26,S,,,"Montreal, PQ / Chesterville, ON"


# boat column

In [15]:
df['boat'].isnull().sum()

823

#### Missing boat values where survived = 1

In [16]:
# Passengers who survived but have no boat recorded
survived_without_boat = (df['survived'] == 1) & (df['boat'].isnull())
df[survived_without_boat]

Unnamed: 0,PassengerId,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home.dest
192,193.0,1.0,1.0,"Lurette, Miss. Elise",0,58.0,0.0,0.0,PC 17569,146.5208,B80,C,,,
358,359.0,2.0,1.0,"Bystrom, Mrs. (Karolina)",0,42.0,0.0,0.0,236852,13.0,D,S,,,"New York, NY"
395,396.0,2.0,1.0,"Doling, Miss. Elsie",0,18.0,0.0,1.0,231919,23.0,D,S,,,Southampton
396,397.0,2.0,1.0,"Doling, Mrs. John T (Ada Julia Bone)",0,34.0,0.0,1.0,231919,23.0,D,S,,,Southampton
458,459.0,2.0,1.0,"Ilett, Miss. Bertha",0,17.0,0.0,0.0,SO/C 14885,10.5,D,S,,,Guernsey
489,490.0,2.0,1.0,"Louch, Mrs. Charles Alexander (Alice Adelaide ...",0,42.0,1.0,0.0,SC/AH 3085,26.0,D,S,,,"Weston-Super-Mare, Somerset"
513,514.0,2.0,1.0,"Nasser, Mrs. Nicholas (Adele Achem)",0,14.0,1.0,0.0,237736,30.0708,D,C,,,"New York, NY"
545,546.0,2.0,1.0,"Renouf, Mrs. Peter Henry (Lillian Jefferys)",0,30.0,3.0,0.0,31027,21.0,D,S,,,"Elizabeth, NJ"
572,573.0,2.0,1.0,"Trout, Mrs. William H (Jessie L)",0,28.0,0.0,0.0,240929,12.65,D,S,,,"Columbus, OH"
656,657.0,3.0,1.0,"Backstrom, Mrs. Karl Alfred (Maria Mathilda Gu...",0,33.0,3.0,0.0,3101278,15.85,G6,S,,,"Ruotsinphytaa, Finland New York, NY"


In [17]:
# Most frequent value of the "boat" column over the whole data set
boat_mode = df['boat'].mode()[0]

# Rows to fill: passengers who survived but have no boat recorded
condition = df['boat'].isnull() & (df['survived'] == 1)

# Assign the overall mode to exactly those rows
df.loc[condition, 'boat'] = boat_mode

In [18]:
df['boat'].isnull().sum()

800

#### Missing boat values where survived = 0

In [19]:
# Impute the remaining missing boats with the most frequent boat among
# passengers of the same pclass and sex
boat_mode_by_class_sex = df.groupby(['pclass', 'sex'])['boat'].transform(lambda x: x.mode()[0])
# Assign the filled column back instead of calling fillna(inplace=True) on
# df['boat']: the in-place call acts on an intermediate Series and does not
# update df when pandas copy-on-write is active.
df['boat'] = df['boat'].fillna(boat_mode_by_class_sex)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   float64
 1   pclass       1309 non-null   float64
 2   survived     1309 non-null   float64
 3   name         1309 non-null   object 
 4   sex          1309 non-null   int32  
 5   age          1309 non-null   float64
 6   sibsp        1309 non-null   float64
 7   parch        1309 non-null   float64
 8   ticket       1309 non-null   object 
 9   fare         1308 non-null   float64
 10  cabin        1309 non-null   object 
 11  embarked     1307 non-null   object 
 12  boat         1309 non-null   object 
 13  body         121 non-null    float64
 14  home.dest    745 non-null    object 
dtypes: float64(8), int32(1), object(6)
memory usage: 158.5+ KB


# ticket column

In [21]:
drop_unused_columns(df, columns=['ticket'])

# fare column

In [22]:
# Back-fill the missing fare from the next row that has one.
# Series.bfill() replaces the deprecated fillna(method='bfill'), and the result
# is assigned back because an in-place call on df['fare'] acts on an
# intermediate Series and does not update df under pandas copy-on-write.
df['fare'] = df['fare'].bfill()

In [52]:
# min_max_scaler(df, ['fare'])

In [24]:
df.head()

Unnamed: 0,PassengerId,pclass,survived,name,sex,age,sibsp,parch,fare,cabin,embarked,boat,body,home.dest
0,1.0,1.0,1.0,"Allen, Miss. Elisabeth Walton",0,29.0,0.0,0.0,211.3375,B5,S,2,,"St Louis, MO"
1,2.0,1.0,1.0,"Allison, Master. Hudson Trevor",1,0.9167,1.0,2.0,151.55,C22 C26,S,11,,"Montreal, PQ / Chesterville, ON"
2,3.0,1.0,0.0,"Allison, Miss. Helen Loraine",0,2.0,1.0,2.0,151.55,C22 C26,S,8,,"Montreal, PQ / Chesterville, ON"
3,4.0,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1.0,2.0,151.55,C22 C26,S,3,135.0,"Montreal, PQ / Chesterville, ON"
4,5.0,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1.0,2.0,151.55,C22 C26,S,8,,"Montreal, PQ / Chesterville, ON"


# cabin column

In [25]:
# Cabin_Deck is the first character of the cabin string
df['Cabin_Deck'] = df['cabin'].str.get(0)

In [26]:
# Flag rows whose 'cabin' value is present (1) or missing (0).
# NOTE(review): missing cabins were already filled per pclass in the earlier
# "cabin column" cell, so notna() is True for every row here and Has_Cabin is
# constant 1 (visible in the df.head() output below). To keep the signal,
# this flag would have to be computed before that imputation -- confirm intent.
df['Has_Cabin'] = df['cabin'].notna().astype(int)

In [27]:
drop_unused_columns(df, columns=['cabin'])

In [28]:
df.head()

Unnamed: 0,PassengerId,pclass,survived,name,sex,age,sibsp,parch,fare,embarked,boat,body,home.dest,Cabin_Deck,Has_Cabin
0,1.0,1.0,1.0,"Allen, Miss. Elisabeth Walton",0,29.0,0.0,0.0,211.3375,S,2,,"St Louis, MO",B,1
1,2.0,1.0,1.0,"Allison, Master. Hudson Trevor",1,0.9167,1.0,2.0,151.55,S,11,,"Montreal, PQ / Chesterville, ON",C,1
2,3.0,1.0,0.0,"Allison, Miss. Helen Loraine",0,2.0,1.0,2.0,151.55,S,8,,"Montreal, PQ / Chesterville, ON",C,1
3,4.0,1.0,0.0,"Allison, Mr. Hudson Joshua Creighton",1,30.0,1.0,2.0,151.55,S,3,135.0,"Montreal, PQ / Chesterville, ON",C,1
4,5.0,1.0,0.0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25.0,1.0,2.0,151.55,S,8,,"Montreal, PQ / Chesterville, ON",C,1


# Cabin_Deck Column

In [29]:
df = labelencoder(df, columns=['Cabin_Deck'])

# home.dest column

In [30]:
drop_unused_columns(df, columns=['home.dest'])

# body column

In [31]:
drop_unused_columns(df, columns=['body'])

# change data type

In [32]:
def change_data_type(df):
    """Round 'fare' to whole units and cast the count/id columns to int.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'fare', 'PassengerId', 'pclass', 'survived', 'age',
        'sibsp' and 'parch', with no missing values in the columns cast to int.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    int_columns = ['pclass', 'survived', 'PassengerId', 'age', 'sibsp', 'parch']
    # assign() builds a new frame, so the caller's 'fare' column is not
    # overwritten as a side effect of calling this function.
    rounded = df.assign(fare=df['fare'].round(0))
    # Casting to int truncates fractional ages (e.g. 0.9167 becomes 0).
    return rounded.astype({col: 'int' for col in int_columns})

In [33]:
df = change_data_type(df)

# embarked column

In [34]:
# df = labelencoder(df, columns=['embarked'])

In [35]:
df.head()

Unnamed: 0,PassengerId,pclass,survived,name,sex,age,sibsp,parch,fare,embarked,boat,Cabin_Deck,Has_Cabin
0,1,1,1,"Allen, Miss. Elisabeth Walton",0,29,0,0,211.0,S,2,1,1
1,2,1,1,"Allison, Master. Hudson Trevor",1,0,1,2,152.0,S,11,2,1
2,3,1,0,"Allison, Miss. Helen Loraine",0,2,1,2,152.0,S,8,2,1
3,4,1,0,"Allison, Mr. Hudson Joshua Creighton",1,30,1,2,152.0,S,3,2,1
4,5,1,0,"Allison, Mrs. Hudson J C (Bessie Waldo Daniels)",0,25,1,2,152.0,S,8,2,1


# name column

#### extract title

In [36]:
# Split "Last, Title. First ..." on ', ' or '.' into separate columns.
# The pattern is a raw string so that '\.' is not an invalid escape sequence,
# and regex=True states explicitly that it is a regular expression.
# NOTE(review): the four target columns assume no name splits into more than
# four pieces -- verify if the data set changes.
df[['last_name', 'title', 'first_name', 'middle_name']] = df['name'].str.split(r', |\.', regex=True, expand=True)

In [37]:
drop_unused_columns(df, columns=['name', 'middle_name'])

In [38]:
df.head()

Unnamed: 0,PassengerId,pclass,survived,sex,age,sibsp,parch,fare,embarked,boat,Cabin_Deck,Has_Cabin,last_name,title,first_name
0,1,1,1,0,29,0,0,211.0,S,2,1,1,Allen,Miss,Elisabeth Walton
1,2,1,1,1,0,1,2,152.0,S,11,2,1,Allison,Master,Hudson Trevor
2,3,1,0,0,2,1,2,152.0,S,8,2,1,Allison,Miss,Helen Loraine
3,4,1,0,1,30,1,2,152.0,S,3,2,1,Allison,Mr,Hudson Joshua Creighton
4,5,1,0,0,25,1,2,152.0,S,8,2,1,Allison,Mrs,Hudson J C (Bessie Waldo Daniels)


# change columns order

In [39]:
# Final column order: identifiers and name parts first, then the features
column_order = [
    'PassengerId', 'title', 'last_name', 'first_name',
    'pclass', 'survived', 'sex', 'age', 'sibsp', 'parch',
    'fare', 'embarked', 'boat', 'Cabin_Deck', 'Has_Cabin',
]
df = df[column_order]

In [40]:
df.head()

Unnamed: 0,PassengerId,title,last_name,first_name,pclass,survived,sex,age,sibsp,parch,fare,embarked,boat,Cabin_Deck,Has_Cabin
0,1,Miss,Allen,Elisabeth Walton,1,1,0,29,0,0,211.0,S,2,1,1
1,2,Master,Allison,Hudson Trevor,1,1,1,0,1,2,152.0,S,11,2,1
2,3,Miss,Allison,Helen Loraine,1,0,0,2,1,2,152.0,S,8,2,1
3,4,Mr,Allison,Hudson Joshua Creighton,1,0,1,30,1,2,152.0,S,3,2,1
4,5,Mrs,Allison,Hudson J C (Bessie Waldo Daniels),1,0,0,25,1,2,152.0,S,8,2,1


In [41]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1309 entries, 0 to 1308
Data columns (total 15 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int32  
 1   title        1309 non-null   object 
 2   last_name    1309 non-null   object 
 3   first_name   1309 non-null   object 
 4   pclass       1309 non-null   int32  
 5   survived     1309 non-null   int32  
 6   sex          1309 non-null   int32  
 7   age          1309 non-null   int32  
 8   sibsp        1309 non-null   int32  
 9   parch        1309 non-null   int32  
 10  fare         1309 non-null   float64
 11  embarked     1307 non-null   object 
 12  boat         1309 non-null   object 
 13  Cabin_Deck   1309 non-null   int32  
 14  Has_Cabin    1309 non-null   int32  
dtypes: float64(1), int32(9), object(5)
memory usage: 117.6+ KB
