In [29]:
import numpy as np
import pandas as pd

preprocessing : 

In [56]:
##### Preprocessing #####
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelBinarizer

def preprocess(data: pd.DataFrame):
    data.set_index("PassengerId", inplace=True)

    # Name and Ticket columns are useless here
    # Cabin contains too much missing values so let's remove it too.
    
    #data = handle_names(data)
    data = handle_family_size(data)
    data.drop(["Ticket", "Cabin", "Name"], axis=1, inplace=True)
    
    # Handle missing values
    median_age = data["Age"].mean()
    data["Age"] = data["Age"].fillna(median_age)
    data["Embarked"] = data["Embarked"].fillna("S")

    # encode "sex" column (1->male, 0->female)
    binarizer = LabelBinarizer()
    sex_encoded = binarizer.fit_transform(data['Sex'])
    data["Sex"] = sex_encoded

    # encode "Embarked" column (One-Hot Encoding)
    data = Encode_OHE(data, "Embarked")
    
    # make sure we don't have any NaN values     
    return data.fillna(data.median())

def Encode_OHE(data:pd.DataFrame, label:str) -> pd.DataFrame:
    encoder = OneHotEncoder()
    
    embarked_encoded = encoder.fit_transform(data[[label]]).toarray()
    embarked_encoded_df = pd.DataFrame(
        embarked_encoded,
        columns=encoder.get_feature_names_out([label]),
        index=data.index
    )
    
    data = pd.concat([data, embarked_encoded_df], axis=1)
    data.drop(label, axis=1,inplace=True)
    
    return data

def handle_cabin(data:pd.DataFrame) -> pd.DataFrame:
    data["Cabin"] = data["Cabin"].fillna("C")
    cabin_floor = []
    for cabin in data["Cabin"]:
        if cabin[0] == "A":
            cabin_floor.append("A")
            continue
        if cabin[0] == "G":
            cabin_floor.append("G")
            continue
            
        cabin_floor.append("C")
    
    data["Floor"] = pd.Series(cabin_floor, index=data.index)
    print()
    data.drop("Cabin",axis=1, inplace=True)
     
def handle_names(data:pd.DataFrame) -> pd.DataFrame:
    # extract the title from the name
    regex = r'(.+, (.+)\..+)'
    titles = data['Name'].str.extract(regex)[1]
    
    # encode the titles
    encoded = []
    common_titles = ["Mr", "rare title", "Master", "Mrs", "Miss"]
    for title in titles:
        if title in common_titles:
            encoded.append(common_titles.index(title))
            continue
        
        encoded.append(common_titles.index("rare title"))
            
    data["Title"] = pd.Series(encoded, index=data.index)
    data.drop("Name", axis=1, inplace=True)
    
    return data 

def handle_family_size(data: pd.DataFrame) -> pd.DataFrame:
    # add siblings, parents, childs, and wife/husband and the passenger themselves
    family_size = data['SibSp'] + data['Parch'] + 1
    family_size = family_size.values
    # disretize family size
    
    for i, size in enumerate(family_size):
        if size > 1 and size < 5:
            family_size[i] = 2
        if size > 4:
            family_size[i] = 3
    
    data['Family'] = pd.Series(family_size, index=data.index)
    data.drop(["SibSp", "Parch"], inplace=True, axis=1)
    return data

In [57]:
##### get test and train dataset #####

df_train = pd.read_csv("../data/train.csv")
train_data = preprocess(df_train)
y = train_data["Survived"]
X = train_data.drop("Survived", axis=1)

df = pd.read_csv("../data/test.csv")
X_test = preprocess(df)


In [22]:
###### save X #######
X.to_csv("../data/X.csv")
X_test.to_csv("../data/X_test.csv")

Trying to recognize a pattern (what kind of people have more chance to die/survive) :

In [8]:
# trying to identify the important factors
import matplotlib.pyplot as plt

def ratio(survived:pd.Series) -> float:
    return 100 * sum(survived)/len(survived)

df = pd.read_csv("../data/train.csv")

women = df[df["Sex"] == 'female']["Survived"]
p_class = df[df["Pclass"] == 1]["Survived"]
age = df[df["Age"] < 15]["Survived"]
embark = df[df["Embarked"] == "S"]["Survived"]

print("% of women who survived:", ratio(women))
print("% of first class", ratio(p_class))
print("age < 20", ratio(age))
print("embark at S : ", ratio(embark))

survivors_by_age = df[["Survived", "Age"]][df["Survived"] == 1] 


[0, 1, 2, 1, 0, 0, 0, 3, 1, 1, 2, 2, 0, 0, 2, 1, 3, 0, 1, 1, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 4, 1, 2, 0, 0, 0, 0, 0, 2, 2, 1, 1, 0, 2, 2, 0, 0, 2, 0, 1, 3, 0, 1, 1, 0, 0, 2, 0, 2, 3, 0, 2, 0, 3, 0, 3, 1, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 3, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 2, 0, 4, 4, 1, 0, 0, 0, 0, 2, 0, 0, 3, 0, 1, 0, 0, 3, 3, 1, 1, 0, 0, 0, 3, 2, 0, 0, 0, 3, 2, 0, 0, 2, 0, 3, 3, 2, 0, 1, 0, 0, 0, 1, 0, 2, 3, 1, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 4, 2, 1, 0, 4, 0, 1, 0, 0, 1, 1, 1, 2, 2, 1, 0, 3, 0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2, 2, 2, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 2, 0, 1, 2, 0, 0, 2, 0, 3, 2, 1, 0, 2, 2, 2, 1, 0, 0, 2, 1, 4, 2, 1, 0, 0, 2, 1, 0, 2, 0, 1, 1, 2, 2, 0, 0, 

ZeroDivisionError: division by zero

In [28]:
# analyse preproc data

df = preprocess(pd.read_csv('../data/train.csv'))

title = df[df["Title"] == 4]["Survived"]
print("titled : ", ratio(title))



[0, 1, 2, 1, 0, 0, 0, 3, 1, 1, 2, 2, 0, 0, 2, 1, 3, 0, 1, 1, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 4, 1, 2, 0, 0, 0, 0, 0, 2, 2, 1, 1, 0, 2, 2, 0, 0, 2, 0, 1, 3, 0, 1, 1, 0, 0, 2, 0, 2, 3, 0, 2, 0, 3, 0, 3, 1, 0, 2, 0, 0, 2, 0, 0, 0, 0, 0, 0, 3, 2, 0, 0, 2, 0, 2, 1, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 2, 0, 2, 2, 0, 0, 0, 0, 2, 0, 0, 0, 2, 0, 3, 0, 0, 2, 0, 0, 0, 1, 1, 0, 0, 2, 0, 0, 0, 1, 2, 1, 0, 0, 0, 0, 2, 0, 4, 4, 1, 0, 0, 0, 0, 2, 0, 0, 3, 0, 1, 0, 0, 3, 3, 1, 1, 0, 0, 0, 3, 2, 0, 0, 0, 3, 2, 0, 0, 2, 0, 3, 3, 2, 0, 1, 0, 0, 0, 1, 0, 2, 3, 1, 2, 0, 0, 2, 2, 0, 0, 0, 0, 0, 2, 0, 0, 2, 0, 0, 2, 0, 0, 0, 2, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 0, 0, 2, 0, 2, 0, 2, 0, 0, 2, 2, 0, 0, 0, 4, 2, 1, 0, 4, 0, 1, 0, 0, 1, 1, 1, 2, 2, 1, 0, 3, 0, 0, 2, 0, 0, 0, 1, 2, 0, 0, 1, 0, 2, 2, 2, 0, 3, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 1, 0, 2, 0, 0, 0, 2, 0, 1, 2, 0, 0, 2, 0, 3, 2, 1, 0, 2, 2, 2, 1, 0, 0, 2, 1, 4, 2, 1, 0, 0, 2, 1, 0, 2, 0, 1, 1, 2, 2, 0, 0, 

Every models I tried are below : 

In [79]:
####### 0 survivors prediction ######

zeros = np.zeros(len(X_test.index), dtype=int)
pred = pd.DataFrame(zeros, index=X_test.index, columns=["Survived"])
pred.to_csv("../data/the_reaper_prediction.csv")

In [4]:
######## simple regression #########
from sklearn.linear_model import LogisticRegression

# X and y are the preprocessed data

model = LogisticRegression()


model.fit(X,y)
pred = model.predict(X_test)

pred_df = pd.DataFrame(pred, index=X_test.index, columns=["Survived"])

print(pred_df)
pred_df.to_csv("../data/prediction.csv")

             Survived
PassengerId          
892                 0
893                 0
894                 0
895                 0
896                 1
...               ...
1305                0
1306                1
1307                0
1308                0
1309                0

[418 rows x 1 columns]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [86]:
# grid search (find the best model)
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
# X and y are the preprocessed data

# run a gridsearch
model = LogisticRegression()
param_grid = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]
}
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')

grid_search.fit(X,y)
best_model = grid_search.best_estimator_
pred = best_model.predict(X_test)


pred_df = pd.DataFrame(pred, index=X_test.index, columns=["Survived"])
pred_df.to_csv("../data/gridsearch_prediction.csv")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [82]:
# neural-network with logistic regression
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(32, input_shape=[X.shape[1]], activation='sigmoid'))
model.add(Dense(1, activation='sigmoid'))

model.compile(
    loss='accuracy',
    optimizer='adam',
)

indexs = X_test.index

scaler = StandardScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

model.fit(X, y, epochs=1000, batch_size=10)

pred = model.predict(X_test)
pred = np.round(pred)
print(pred)

pred_df = pd.DataFrame(pred, index=indexs, columns=["Survived"], dtype=int)
pred_df.to_csv("../data/neural_network_pred.csv")


Epoch 1/1000


ValueError: in user code:

    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1151, in train_step
        loss = self.compute_loss(x, y, y_pred, sample_weight)
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/training.py", line 1209, in compute_loss
        return self.compiled_loss(
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 252, in __call__
        self.build(y_pred)
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 194, in build
        self._losses = tf.nest.map_structure(
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/engine/compile_utils.py", line 365, in _get_loss_object
        loss = losses_mod.get(loss)
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/losses.py", line 2965, in get
        return deserialize(identifier, use_legacy_format=use_legacy_format)
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/losses.py", line 2912, in deserialize
        return legacy_serialization.deserialize_keras_object(
    File "/home/student/.local/lib/python3.10/site-packages/keras/src/saving/legacy/serialization.py", line 537, in deserialize_keras_object
        raise ValueError(

    ValueError: Unknown loss function: 'accuracy'. Please ensure you are using a `keras.utils.custom_object_scope` and that this object is included in the scope. See https://www.tensorflow.org/guide/keras/save_and_serialize#registering_the_custom_object for details.


In [58]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100)
model.fit(X,y)
pred = model.predict(X_test)

pred_df = pd.DataFrame(pred, index=X_test.index, columns=["Survived"])
pred_df.to_csv("../predictions/randomForest3.csv")