Let's Import all the libraries needed

In [31]:
import tensorflow as tf
import tensorflow_decision_forests as tfdf
from keras import layers
import pandas as pd
import numpy as np
import os
from google.colab import files

First thing first, we need to import the data. Let's dowload it from my GitHub repository

In [7]:
### Get the data ###
dataTrain = pd.read_csv('https://raw.githubusercontent.com/pCroiz/AI-Titanic-RandomForest/main/data/train.csv')
dataTest = pd.read_csv('https://raw.githubusercontent.com/pCroiz/AI-Titanic-RandomForest/main/data/test.csv')

Now let's preprocess the data

In [8]:
### Preprocess the data ###
print(dataTrain)

def preprocess(df):
    df = df.copy()

    def normalize_name(x):
        return " ".join([v.strip(",()[].\"'") for v in x.split(" ")])

    def ticket_number(x):
        return x.split(" ")[-1]

    def ticket_item(x):
        items = x.split(" ")
        if len(items) == 1:
            return "NONE"
        return "_".join(items[0:-1])

    df["Name"] = df["Name"].apply(normalize_name)
    df["Ticket_number"] = df["Ticket"].apply(ticket_number)
    df["Ticket_item"] = df["Ticket"].apply(ticket_item)

    # Error with the Cabin column
    df.drop('Cabin', axis=1, inplace=True)

    # Error with the Embarked column
    df.drop('Embarked', axis=1, inplace=True)

    return df

preProc_dataTrain = preprocess(dataTrain)
preProc_dataTest = preprocess(dataTest)

     PassengerId  Survived  Pclass  \
0              1         0       3   
1              2         1       1   
2              3         1       3   
3              4         1       1   
4              5         0       3   
..           ...       ...     ...   
886          887         0       2   
887          888         1       1   
888          889         0       3   
889          890         1       1   
890          891         0       3   

                                                  Name     Sex   Age  SibSp  \
0                              Braund, Mr. Owen Harris    male  22.0      1   
1    Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                               Heikkinen, Miss. Laina  female  26.0      0   
3         Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                             Allen, Mr. William Henry    male  35.0      0   
..                                                 ...     ...   ... 

Let's get the Input features

In [11]:
### Get the input features ###

features = []
removesFeature = ["Ticket","PassengerId","Survived"]

for key in preProc_dataTrain.keys():
    if key != 'Survived':
        features.append(tfdf.keras.FeatureUsage(name=key))

Let's define our input function. Usefull to convert pd data to tf data

In [17]:
### Definition of the Input Function ###

# Usefull to convert the pd dataset to an tf dataset
def tokenize_names(features, labels=None):
    """Divite the names into tokens. TF-DF can consume text tokens natively."""
    features["Name"] =  tf.strings.split(features["Name"])
    return features, labels

train_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preProc_dataTrain,label="Survived").map(tokenize_names)
test_ds = tfdf.keras.pd_dataframe_to_tf_dataset(preProc_dataTest).map(tokenize_names)

Now, let's create our model

In [18]:
model = tfdf.keras.GradientBoostedTreesModel(
    verbose=0, # Very few logs
    features=features,
    exclude_non_specified_features=True, # Only use the features in "features"
    random_seed=1234,
)


Now we format the training dataset and we train/fit the model

In [19]:
model.fit(train_ds)

<tf_keras.src.callbacks.History at 0x7f7a1c2fb250>

Let's evaluate the model

In [20]:
self_evaluation = model.make_inspector().evaluation()
print(f"Accuracy: {self_evaluation.accuracy} Loss:{self_evaluation.loss}")

Accuracy: 0.8260869383811951 Loss:0.8741847276687622


Let's make the prediction

In [25]:
def prediction_to_kaggle_format(model, threshold=0.5):
    proba_survive = model.predict(test_ds, verbose=0)[:,0]
    return pd.DataFrame({
        "PassengerId": dataTest["PassengerId"],
        "Survived": (proba_survive >= threshold).astype(int)
    })

kaggle_predictions = prediction_to_kaggle_format(model)
print(kaggle_predictions)



     PassengerId  Survived
0            892         0
1            893         0
2            894         0
3            895         0
4            896         1
..           ...       ...
413         1305         0
414         1306         1
415         1307         0
416         1308         0
417         1309         0

[418 rows x 2 columns]


In [32]:
def make_submission(kaggle_predictions):
    path="submission.csv"
    kaggle_predictions.to_csv(path, index=False,header=True)
    print(f"Submission exported to {path}")

make_submission(kaggle_predictions)

# Assuming 'filename.txt' is the name of your file
files.download('/content/submission.csv')

Submission exported to submission.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import drive
drive.mount('/content/drive')