In [371]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

In [372]:
train_csv = 'C:/Users/nasir/OneDrive/Documents/VS Code/Python/ML/Titanic/.venv/Data/train.csv'
data = pd.read_csv(train_csv)
data 

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [373]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [374]:
target = 'Survived'
y = data[target]
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

In [375]:
X = data.drop(columns=[target])
X

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [376]:
data

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


<h3>Pipelining steps</h3>
<ol>
    <li>Drop Name and Ticket</li>
    <li>Set PassengerId as index</li>
    <li>Convert Sex to integer column with name 'isFemale' (1 for female, 0 for male)</li>
    <li>Convert Cabin to binary (0 for null, 1 otherwise)</li>
    <li>Impute Age, Pclass, Sex, Sibsp with strategy most frequent</li>
    <li>Impute Fare with strategy mean</li>
    <li>One Hot Encode Embarked</li>
    <li>Apply z normalization</li>
</ol>

<h3>Transformer class to drop columns</h3>

In [377]:
class DropColumns(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.drop(columns=self.columns)
    
drop_columns = DropColumns(['Ticket', 'Name'])
drop_columns.fit_transform(X)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,1,3,male,22.0,1,0,7.2500,,S
1,2,1,female,38.0,1,0,71.2833,C85,C
2,3,3,female,26.0,0,0,7.9250,,S
3,4,1,female,35.0,1,0,53.1000,C123,S
4,5,3,male,35.0,0,0,8.0500,,S
...,...,...,...,...,...,...,...,...,...
886,887,2,male,27.0,0,0,13.0000,,S
887,888,1,female,19.0,0,0,30.0000,B42,S
888,889,3,female,,1,2,23.4500,,S
889,890,1,male,26.0,0,0,30.0000,C148,C


<h3>Transformer class to set index</h3>

In [378]:
class SetIndex(BaseEstimator, TransformerMixin):
    def __init__(self, index):
        self.index=index

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X.set_index(self.index)
    
index_setter = SetIndex('PassengerId')

index_setter.fit_transform(X)

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...
887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


<h3>Transformer to convert Sex to binary</h3>

In [379]:
class SextoIsFemale(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        x = X.copy()
        x.rename(columns={'Sex':'isFemale'}, inplace=True)
        x['isFemale'] = x['isFemale'].map({'female':1, 'male' : 0})
        return x
    
sex_to_is_female = SextoIsFemale()

sex_to_is_female.fit_transform(X)

Unnamed: 0,PassengerId,Pclass,Name,isFemale,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",0,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,38.0,1,0,PC 17599,71.2833,C85,C
2,3,3,"Heikkinen, Miss. Laina",1,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,35.0,1,0,113803,53.1000,C123,S
4,5,3,"Allen, Mr. William Henry",0,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",0,27.0,0,0,211536,13.0000,,S
887,888,1,"Graham, Miss. Margaret Edith",1,19.0,0,0,112053,30.0000,B42,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",1,,1,2,W./C. 6607,23.4500,,S
889,890,1,"Behr, Mr. Karl Howell",0,26.0,0,0,111369,30.0000,C148,C


<h3>Transformer to convert cabin to binary integer (0 for null, 1 otherwise)</h3>

In [380]:
class CabinOrNot(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        x = X.copy()
        x['Cabin'] = x['Cabin'].map(lambda i: int(pd.notnull(i)))
        return x

cabin_or_not = CabinOrNot()

cabin_or_not.fit_transform(X)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,0,S
1,2,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,1,C
2,3,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,0,S
3,4,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,1,S
4,5,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,0,S
...,...,...,...,...,...,...,...,...,...,...,...
886,887,2,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,0,S
887,888,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,1,S
888,889,3,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,0,S
889,890,1,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,1,C


<h3>Creating pipeline</h3>

In [381]:
imputer_and_OH_encoder = ColumnTransformer(transformers=[
    ('impute_with_mean', SimpleImputer(strategy='mean'), ['Fare']),
    ('impute_with_most_frequent', SimpleImputer(strategy='most_frequent'), ['Age', 'Pclass', 'isFemale', 'SibSp']),
    ('OH_encode', OneHotEncoder(handle_unknown='ignore'), ['Embarked'])
])

preprocessor = Pipeline(steps=[
    ('Drop_name_ticket', drop_columns),
    ('Set_index', index_setter),
    ('Sex_to_isFemale', sex_to_is_female),
    ('Cabin_or_not', cabin_or_not),
    ('imputer_and_OH_encoder', imputer_and_OH_encoder),
    ('Standard_scaler', StandardScaler())
])

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(random_state=0, max_depth=5))
])

pipeline.fit(X, y)
y_hat = pipeline.predict(X)
accuracy_score(y_hat, y)

0.8473625140291807

In [382]:
training_data = pd.read_csv('C:/Users/nasir/OneDrive/Documents/VS Code/Python/ML/Titanic/.venv/Data/train.csv')
test_data = pd.read_csv('C:/Users/nasir/OneDrive/Documents/VS Code/Python/ML/Titanic/.venv/Data/test.csv')

target = 'Survived'

y = training_data[target]

X = training_data.drop(columns = [target])
X_test = test_data

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0, test_size=0.2)

pipeline.fit(X_train, y_train)

y_hat = pipeline.predict(X_train)
print('Accuracy on training data:', accuracy_score(y_hat, y_train))

y_hat = pipeline.predict(X_valid)
print('Accuracy on test data:', accuracy_score(y_hat, y_valid))

y_hat = pipeline.predict(X_test)

submission = pd.DataFrame({
    'PassengerId' : test_data['PassengerId'],
    'Survived' : y_hat
})

submission.to_csv('submission.csv', index=False)

score = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
score

Accuracy on training data: 0.851123595505618
Accuracy on test data: 0.8324022346368715


array([0.78212291, 0.82022472, 0.84831461, 0.78651685, 0.86516854])

<h3>Using XGboost</h3>

In [400]:
from xgboost import XGBClassifier

training_data = pd.read_csv('C:/Users/nasir/OneDrive/Documents/VS Code/Python/ML/Titanic/.venv/Data/train.csv')
test_data = pd.read_csv('C:/Users/nasir/OneDrive/Documents/VS Code/Python/ML/Titanic/.venv/Data/test.csv')

target = 'Survived'

y = training_data[target]

X = training_data.drop(columns = [target])
X_test = test_data

X_train, X_valid, y_train, y_valid = train_test_split(X, y, random_state=0, test_size=0.2)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBClassifier(n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5))
])

pipeline.fit(X_train, y_train,
             model__eval_set=[(preprocessor.transform(X_valid), y_valid)], 
             model__verbose=False)

y_hat = pipeline.predict(X_valid)

print(accuracy_score(y_hat, y_valid))

y_hat = pipeline.predict(X_test)

submission = pd.DataFrame({
    'PassengerId' : test_data['PassengerId'],
    'Survived' : y_hat
})

submission.to_csv('submission.csv', index=False)

0.8547486033519553
