## Import packages


In [1]:
try:
# Attempt to load TensorFlow
  import tensorflow as tf
  print("TensorFlow version:", tf.__version__)
  
except Exception as e: 
# In case of an error, Tensorflow is installed in your environment
  print(e)
  !pip3 install tensorflow
  print("TensorFlow version:", tf.__version__)

TensorFlow version: 2.11.0


In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals
import tensorflow as tf
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas_profiling
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import classification_report

  import pandas_profiling


## Load the data and get some statistics

In [3]:
## Download the Titanic train / eval splits, then detach the label column
dftrain, dfeval = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://storage.googleapis.com/tf-datasets/titanic/train.csv',  # training data
        'https://storage.googleapis.com/tf-datasets/titanic/eval.csv',  # testing data
    )
)

# pop() removes 'survived' from the feature frames and returns it as the label
y_train, y_eval = dftrain.pop('survived'), dfeval.pop('survived')

In [4]:
## Count the samples of each class to see whether the data are unbalanced
print(np.count_nonzero(y_train == 0))
print(np.count_nonzero(y_train == 1))

384
243


In [5]:
## Rebalance the classes so both end up with the same number of samples

# Fixed random_state so the resampling is reproducible
oversampler = RandomOverSampler(random_state=42)

# fit_resample returns the resampled features together with the matching labels
X_resampled, y_resampled = oversampler.fit_resample(dftrain, y_train)

# Rebind the training names to the balanced data (the right-hand side is
# evaluated first, so `dftrain.columns` still refers to the original frame)
dftrain, y_train = pd.DataFrame(X_resampled, columns=dftrain.columns), y_resampled

In [6]:
## After oversampling both classes should have the same count

print(np.count_nonzero(y_train == 0))
print(np.count_nonzero(y_train == 1))

384
384


In [7]:
# First rows, column dtypes and summary statistics, separated by blank lines
print(dftrain.head(), dftrain.dtypes, dftrain.describe(), sep="\n\n")

      sex   age  n_siblings_spouses  parch     fare  class     deck  \
0    male  22.0                   1      0   7.2500  Third  unknown   
1  female  38.0                   1      0  71.2833  First        C   
2  female  26.0                   0      0   7.9250  Third  unknown   
3  female  35.0                   1      0  53.1000  First        C   
4    male  28.0                   0      0   8.4583  Third  unknown   

   embark_town alone  
0  Southampton     n  
1    Cherbourg     n  
2  Southampton     y  
3  Southampton     n  
4   Queenstown     y  

sex                    object
age                   float64
n_siblings_spouses      int64
parch                   int64
fare                  float64
class                  object
deck                   object
embark_town            object
alone                  object
dtype: object

              age  n_siblings_spouses       parch        fare
count  768.000000          768.000000  768.000000  768.000000
mean    29.509544        

In [12]:
## Build an interactive profiling dashboard describing the training features

profile_options = dict(
    title='Exploratory Data Analysis Report',
    minimal=False,
    html={'style': {'full_width': True}},
)
report = pandas_profiling.ProfileReport(dftrain, **profile_options)

# Render the report inline as notebook widgets ...
report.to_widgets()

# ... and also export it as an HTML file
report.to_file("report/report.html")

  return func(*args, **kwargs)
Summarize dataset: 100%|██████████| 34/34 [00:09<00:00,  3.69it/s, Completed]                                     
Generate report structure: 100%|██████████| 1/1 [00:06<00:00,  6.52s/it]
Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

                                                             

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

Render HTML: 100%|██████████| 1/1 [00:03<00:00,  3.98s/it]
Export report to file: 100%|██████████| 1/1 [00:00<00:00, 76.60it/s]


## Some preprocessing to train a neural network with Tensorflow

In [13]:
object_to_int_columns = ['n_siblings_spouses', 'parch']

## Cast the count columns to int64 in both the training and the eval split
for frame in (dftrain, dfeval):
    frame[object_to_int_columns] = frame[object_to_int_columns].astype(np.int64)

In [14]:
string_to_int_columns = ['sex', 'class', 'deck', 'embark_town', 'alone']

## Cast the categorical columns to str in BOTH splits.
## dfeval is later fed to the same string-typed model inputs as dftrain, so it
## must go through the same conversion (the int cast above already treats
## both splits alike).
dftrain[string_to_int_columns] = dftrain[string_to_int_columns].astype(str)
dfeval[string_to_int_columns] = dfeval[string_to_int_columns].astype(str)

In [15]:
## Numeric/continuous columns (float); they are normalized further down.
## `numeric_features` stays a list of column NAMES: the normalization loop
## iterates over it to get names, and rebinding it to a DataFrame only worked
## because iterating a DataFrame happens to yield its column names.
numeric_features = ['age', 'fare']

### Cast the numeric columns to float64 and write the result back, in both
### splits (previously the cast result was stored under `numeric_features`
### and never applied to the data)
dftrain[numeric_features] = dftrain[numeric_features].astype(np.float64)
dfeval[numeric_features] = dfeval[numeric_features].astype(np.float64)

In [16]:
## tf.keras.Input creates a symbolic input of a Keras model.
## One scalar-per-sample input (shape=()) is declared for every column of
## dftrain, with a dtype chosen from the first value of that column.

inputs = {}

for name, column in dftrain.items():

    # Positional access: `column[0]` is a LABEL lookup and raises KeyError
    # as soon as the index does not contain the label 0.
    first_value = column.iloc[0]

    if isinstance(first_value, str):
        dtype = tf.string
    elif isinstance(first_value, np.integer):
        # Any NumPy integer width is declared as an int64 input
        dtype = tf.int64
    else:
        dtype = tf.float64

    inputs[name] = tf.keras.Input(shape=(), name=name, dtype=dtype)

In [17]:
# Display the dict of symbolic Keras inputs, keyed by column name
inputs

{'sex': <KerasTensor: shape=(None,) dtype=string (created by layer 'sex')>,
 'age': <KerasTensor: shape=(None,) dtype=float64 (created by layer 'age')>,
 'n_siblings_spouses': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'n_siblings_spouses')>,
 'parch': <KerasTensor: shape=(None,) dtype=int64 (created by layer 'parch')>,
 'fare': <KerasTensor: shape=(None,) dtype=float64 (created by layer 'fare')>,
 'class': <KerasTensor: shape=(None,) dtype=string (created by layer 'class')>,
 'deck': <KerasTensor: shape=(None,) dtype=string (created by layer 'deck')>,
 'embark_town': <KerasTensor: shape=(None,) dtype=string (created by layer 'embark_town')>,
 'alone': <KerasTensor: shape=(None,) dtype=string (created by layer 'alone')>}

In [18]:
## `preprocessed` collects one tensor per preprocessed feature; this cell
## resets it and adds the normalized numeric features
preprocessed = []

for name in numeric_features:
    # Column values reshaped to (n, 1); used only to fit the normalizer statistics
    column_values = dftrain[name].to_numpy().reshape(-1, 1)

    scaler = tf.keras.layers.Normalization(axis=-1)
    scaler.adapt(column_values)

    # Give the scalar symbolic input a trailing axis, then normalize it
    preprocessed.append(scaler(inputs[name][:, tf.newaxis]))

preprocessed

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_1')>]

In [19]:
categorical_columns = ['sex', 'n_siblings_spouses', 'parch', 'class', 'deck',
                       'embark_town', 'alone']

# NOTE: this cell appends to `preprocessed` without resetting it, so running
# it twice adds every categorical feature twice; re-run the cell that
# creates `preprocessed` first.
for name in categorical_columns:

    # The sorted distinct training values form the lookup vocabulary
    vocab = sorted(set(dftrain[name]))

    # String vocabularies use StringLookup, everything else IntegerLookup;
    # both emit a one-hot vector
    lookup_cls = (tf.keras.layers.StringLookup if type(vocab[0]) is str
                  else tf.keras.layers.IntegerLookup)
    lookup = lookup_cls(vocabulary=vocab, output_mode='one_hot')

    preprocessed.append(lookup(inputs[name][:, tf.newaxis]))

In [20]:
# Display the list of preprocessed feature tensors (numeric first, then categorical)
preprocessed

[<KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization')>,
 <KerasTensor: shape=(None, 1) dtype=float32 (created by layer 'normalization_1')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'string_lookup')>,
 <KerasTensor: shape=(None, 8) dtype=float32 (created by layer 'integer_lookup')>,
 <KerasTensor: shape=(None, 7) dtype=float32 (created by layer 'integer_lookup_1')>,
 <KerasTensor: shape=(None, 4) dtype=float32 (created by layer 'string_lookup_1')>,
 <KerasTensor: shape=(None, 9) dtype=float32 (created by layer 'string_lookup_2')>,
 <KerasTensor: shape=(None, 5) dtype=float32 (created by layer 'string_lookup_3')>,
 <KerasTensor: shape=(None, 3) dtype=float32 (created by layer 'string_lookup_4')>]

In [21]:
# Concatenate every preprocessed feature along the last axis and wrap the
# whole preprocessing graph (raw column inputs -> one feature vector) in a model.
preprocesssed_result = tf.concat(preprocessed, axis=-1)
preprocessor = tf.keras.Model(inputs=inputs, outputs=preprocesssed_result) 

# Calling the preprocessor on the symbolic inputs yields a SYMBOLIC tensor:
# it holds no data, it only describes the preprocessed feature vector.
x_train = preprocessor(inputs) ## symbolic output of the preprocessing model
x_train

<KerasTensor: shape=(None, 41) dtype=float32 (created by layer 'model')>

In [22]:
# One-hot encode the 0/1 labels of both splits into two columns
y_train_binary, y_eval_binary = (
    tf.keras.utils.to_categorical(list(labels), num_classes=2)
    for labels in (y_train, y_eval)
)

In [33]:
# Grid search over the width, dropout probability and batch size of the
# sequential network; the model with the lowest loss is kept.

## Fixed settings and bookkeeping for the best model found so far ##
least_value_loss = float('inf')
least_loss_model = None
nn_sequential_least = None
epochs = 10
validation_split = 0.2
N_train = dftrain.shape[0]

## Hyperparameter grid ##
nodes = [10, 15, 20]
dropout_probs = [0, 0.2, 0.4]
batch_sizes = [16, 32, 64]
kernel_regularizer_l2 = tf.keras.regularizers.l2(0.0001)


parameters_best_model = {}

# NOTE(review): the "best" model is selected on the loss measured on dfeval,
# the same split that is later used to report the final metrics, so those
# metrics are optimistic. The Keras validation split is not used for the
# selection; it is taken from the END of the training data, which presumably
# holds the rows appended by the oversampler -- confirm before switching to it.

# The loop variable must NOT be called `nodes`: `for nodes in nodes` replaced
# the grid list by an int, so the cell could not be executed a second time.
for num_nodes in nodes:
    for dropout_prob in dropout_probs:
        for batch_size in batch_sizes:

            initial_learning_rate = 0.01
            # About two epochs' worth of optimizer steps (validation split ignored)
            decay_steps = (N_train // batch_size) * 2
            decay_rate = 1
            staircase = False

            ## The learning rate decreases with time
            lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
                initial_learning_rate,
                decay_steps,
                decay_rate,
                staircase=staircase
            )

            # A fresh optimizer per model: optimizer state must not be shared
            optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)

            print(f"{num_nodes} nodes, {dropout_prob} dropout_prob, lr {None}, batch size {batch_size}")

            # Four hidden Dense layers, each followed by Dropout to limit
            # overfitting, then a 2-unit softmax output
            body = tf.keras.Sequential([

                tf.keras.layers.Dense(num_nodes, activation='relu', kernel_regularizer=kernel_regularizer_l2),
                tf.keras.layers.Dropout(dropout_prob),
                tf.keras.layers.Dense(num_nodes, activation='relu', kernel_regularizer=kernel_regularizer_l2),
                tf.keras.layers.Dropout(dropout_prob),
                tf.keras.layers.Dense(num_nodes, activation='relu', kernel_regularizer=kernel_regularizer_l2),
                tf.keras.layers.Dropout(dropout_prob),
                tf.keras.layers.Dense(num_nodes, activation='relu', kernel_regularizer=kernel_regularizer_l2),
                tf.keras.layers.Dropout(dropout_prob),

                tf.keras.layers.Dense(2, activation='softmax')

                ])

            # Chain preprocessing and classifier: raw columns -> class probabilities
            result = body(x_train)
            model = tf.keras.Model(inputs, result)

            model.compile(optimizer=optimizer, loss=tf.keras.losses.binary_crossentropy,
                          metrics=['accuracy'])

            history = model.fit(dict(dftrain), y_train_binary, epochs=epochs,
                                batch_size=batch_size, validation_split=validation_split, verbose=0)

            # evaluate() returns [loss, accuracy]; only the loss drives the selection
            value_loss = model.evaluate(dict(dfeval), y_eval_binary, verbose=1)[0]

            if value_loss < least_value_loss:

                least_value_loss = value_loss
                least_loss_model = model

                nn_sequential_least = body

                parameters_best_model['num_nodes'] = num_nodes
                parameters_best_model['dropout_prob'] = dropout_prob
                # The learning rate follows lr_schedule, so no single value is recorded
                parameters_best_model['lr'] = None
                parameters_best_model['batch_size'] = batch_size

10 nodes, 0 dropout_prob, lr None, batch size 16
10 nodes, 0 dropout_prob, lr None, batch size 32


Exception ignored in: <function WeakKeyDictionary.__init__.<locals>.remove at 0x000001B76A71BCA0>
Traceback (most recent call last):
  File "c:\Users\mfatt\anaconda3\lib\weakref.py", line 346, in remove
    self = selfref()
KeyboardInterrupt: 


10 nodes, 0 dropout_prob, lr None, batch size 64


In [None]:
# Headline, blank line, then the dict recorded by the grid search
print("The parameters of the model minimizing the loss :", "", parameters_best_model, sep="\n")

In [109]:
## Configuration of the neural network

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
nn_sequential_least.summary()


# Input shape of the network = input shape of its first layer
input_shape = nn_sequential_least.layers[0].input_shape
print("Taille des données en entrée :", input_shape)

print()

print("Configuration du réseau de neurones: ")
# Walk over every layer after the first one instead of a hard-coded index
# range, so the listing stays complete whatever the depth of the network.
for layer in nn_sequential_least.layers[1:]:
    print(layer.input_shape)

print()

# Output shape of the network = output shape of its LAST layer
# (a fixed index such as 4 points at a hidden layer of this network).
output_shape = nn_sequential_least.layers[-1].output_shape
print("Taille des données en sortie :", output_shape)

Model: "sequential_403"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1637 (Dense)          (None, 20)                840       
                                                                 
 dropout_1234 (Dropout)      (None, 20)                0         
                                                                 
 dense_1638 (Dense)          (None, 20)                420       
                                                                 
 dropout_1235 (Dropout)      (None, 20)                0         
                                                                 
 dense_1639 (Dense)          (None, 20)                420       
                                                                 
 dropout_1236 (Dropout)      (None, 20)                0         
                                                                 
 dense_1640 (Dense)          (None, 20)             

In [110]:
# evaluate() returns the loss followed by the compiled metric (accuracy)
eval_metrics = least_loss_model.evaluate(dict(dfeval), y_eval_binary)
eval_loss, eval_accuracy = eval_metrics

# A blank line before each metric, accuracy first
for label, value in (('Eval accuracy:', eval_accuracy), ('Eval loss:', eval_loss)):
    print()
    print(label, value)


Eval accuracy: 0.810606062412262

Eval loss: 0.43949180841445923


In [214]:
# NOTE(review): the save calls below are commented out, yet a later cell
# loads "./sequential_model" and "./sequential_nn_config". Presumably they
# were executed once and then disabled so that a new search does not
# overwrite the retained model -- confirm, because on a fresh checkout the
# load cell fails unless these directories already exist.

# # Save the best model
# least_loss_model.save(filepath="./sequential_model")


# # Save the configuration
# nn_sequential_least.save(filepath="./sequential_nn_config")



INFO:tensorflow:Assets written to: ./sequential_model\assets


INFO:tensorflow:Assets written to: ./sequential_model\assets






INFO:tensorflow:Assets written to: ./sequential_nn_config\assets


INFO:tensorflow:Assets written to: ./sequential_nn_config\assets


In [24]:
# Reload from disk the full model (preprocessing + classifier) ...
loaded_model = tf.keras.models.load_model("./sequential_model")

# ... and the classifier body on its own
loaded_config_nn = tf.keras.models.load_model("./sequential_nn_config")



In [25]:
## Configuration of the neural network

# summary() prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line.
loaded_config_nn.summary()


# Input shape of the network = input shape of its first layer
input_shape = loaded_config_nn.layers[0].input_shape
print("Taille des données en entrée :", input_shape)

print()

print("Configuration du réseau de neurones: ")
# Walk over every layer after the first one; the previous hard-coded
# range(1, 7) only matched a network with exactly seven layers.
for layer in loaded_config_nn.layers[1:]:
    print(layer.input_shape)

print()

# Output shape of the network = output shape of its LAST layer, whatever
# the depth of the saved network.
output_shape = loaded_config_nn.layers[-1].output_shape
print("Taille des données en sortie :", output_shape)

Model: "sequential_374"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1496 (Dense)          (None, 15)                630       
                                                                 
 dropout_1122 (Dropout)      (None, 15)                0         
                                                                 
 dense_1497 (Dense)          (None, 15)                240       
                                                                 
 dropout_1123 (Dropout)      (None, 15)                0         
                                                                 
 dense_1498 (Dense)          (None, 15)                240       
                                                                 
 dropout_1124 (Dropout)      (None, 15)                0         
                                                                 
 dense_1499 (Dense)          (None, 2)              

In [26]:
# Headline followed by a blank line
print("Features and theirs dimensions : ", end="\n\n")

# Layer 9 takes a dict of inputs; list each feature name with its shape
feature_shapes = loaded_model.layers[9].input_shape
for feature in feature_shapes:
    print(f"{feature} : {feature_shapes[feature]}")

Features and theirs dimensions : 

sex : (None,)
age : (None,)
n_siblings_spouses : (None,)
parch : (None,)
fare : (None,)
class : (None,)
deck : (None,)
embark_town : (None,)
alone : (None,)


In [27]:
## Labels: one-hot encode the 0/1 labels of both splits into two columns
y_train_binary, y_eval_binary = (
    tf.keras.utils.to_categorical(list(labels), num_classes=2)
    for labels in (y_train, y_eval)
)

In [28]:
# Retained model: '{'num_nodes': 15, 'dropout_prob': 0, 'lr': 0.005, 'batch_size': 32}

# evaluate() returns the loss followed by the compiled metric (accuracy)
eval_metrics = loaded_model.evaluate(dict(dfeval), y_eval_binary)
eval_loss, eval_accuracy = eval_metrics

# A blank line before each metric, accuracy first
for label, value in (('Eval accuracy:', eval_accuracy), ('Eval loss:', eval_loss)):
    print()
    print(label, value)


Eval accuracy: 0.8333333134651184

Eval loss: 0.41780975461006165


In [29]:
## dfeval contains the data of passengers present on board the Titanic
# during the sinking. This function asks for a passenger number, runs the
# saved neural network on that passenger and compares the prediction with
# the true value.

def input_and_predict(model=None, features=None, labels=None):
    """Interactively pick one passenger and compare prediction and truth.

    Parameters
    ----------
    model : object with a ``predict`` method, optional
        Must return one ``[p_not_survivor, p_survivor]`` row per input row.
        Defaults to the notebook global ``loaded_model``.
    features : pandas.DataFrame, optional
        One row per passenger. Defaults to the notebook global ``dfeval``.
    labels : pandas.Series, optional
        True 0/1 survival labels, row-aligned with ``features``.
        Defaults to the notebook global ``y_eval``.

    Raises
    ------
    ValueError
        If ``features`` has no rows (there is nothing to choose from).
    """
    # Fall back to the notebook globals so the zero-argument call keeps working
    model = loaded_model if model is None else model
    features = dfeval if features is None else features
    labels = y_eval if labels is None else labels

    if len(features) == 0:
        raise ValueError("features is empty: there is no passenger to select")

    # The valid range follows the data instead of a hard-coded 263
    max_index = len(features) - 1
    range_hint = f"Enter an integer between 0 and {max_index}."

    # Only the parsing is guarded: a ValueError raised later (for instance by
    # the model) must not be reported as a badly formatted number.
    while True:
        try:
            passenger_number = int(input(range_hint))
        except ValueError:
            print("Incorrect format. " + range_hint)
            continue

        if 0 <= passenger_number <= max_index:
            break
        print("Incorrect value. " + range_hint)

    print()

    # Predict the selected passenger only; the double brackets keep a one-row
    # DataFrame so that every model input is a batch of size 1.
    passenger_batch = features.iloc[[passenger_number]]
    prediction_passager = model.predict(dict(passenger_batch))[0]

    print()

    # Show results
    print("Passenger data : ")
    print(features.iloc[passenger_number])

    print()

    p_not_survivor, p_survivor = prediction_passager[0], prediction_passager[1]

    # Decide on the raw probability and round only for display: deciding on
    # the rounded value reported e.g. 0.496 (rounded to 0.5) as "Survivor".
    if p_survivor >= 0.5:
        print("Model prediction: Survivor with probability : ", round(p_survivor, 2))
    else:
        print("Model prediction: Not-survivor with probability : ", round(p_not_survivor, 2))

    print()

    # Positional lookup, consistent with the positional row selection above
    survival = "Survivor" if labels.iloc[passenger_number] == 1 else "Not-Survivor"

    print("Actual value : ", survival)

In [30]:
# Interactive: prompts for a passenger number, then prints prediction vs. truth
input_and_predict()



Passenger data : 
sex                       female
age                         22.0
n_siblings_spouses             0
parch                          0
fare                        7.75
class                      Third
deck                     unknown
embark_town           Queenstown
alone                          y
Name: 78, dtype: object

Model prediction: Survivor with probability :  0.83

Actual value :  Survivor


In [31]:
## Per-class precision / recall / F1 of the loaded model on the eval split

predictions = loaded_model.predict(dict(dfeval))

# Reduce the two-column probabilities and the one-hot labels to class indices
y_pred_classes = predictions.argmax(axis=1)
y_true_binary = y_eval_binary.argmax(axis=1)

report = classification_report(y_true_binary, y_pred_classes)

# Blank line, banner, blank line, then the report text
print("", "---------------- classification report ----------------", "", report, sep="\n")


---------------- classification report ----------------

              precision    recall  f1-score   support

           0       0.82      0.93      0.87       165
           1       0.86      0.67      0.75        99

    accuracy                           0.83       264
   macro avg       0.84      0.80      0.81       264
weighted avg       0.84      0.83      0.83       264

