In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read the training and test CSV files (paths are relative to the notebook).
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/train.csv', '../data/test.csv')
)

In [3]:
# Median age per (Sex, Pclass) group, computed from the training frame only;
# the same table is then used to fill gaps in both the training and test frames.
df_median_age = (
    df_train.groupby(['Sex', 'Pclass'])['Age']
    .median()
    .rename('Median_Age')
    .reset_index()
)


def _fill_missing_age(frame, group_medians):
    """Return `frame` with missing 'Age' values replaced by the group median.

    `group_medians` must have the columns 'Sex', 'Pclass' and 'Median_Age'.
    The left merge keeps every row of `frame` in its original order; the
    temporary 'Median_Age' column is removed again before returning.
    """
    merged = frame.merge(group_medians, on=['Sex', 'Pclass'], how='left')
    merged['Age'] = merged['Age'].fillna(merged['Median_Age'])
    return merged.drop(columns='Median_Age')


df_train = _fill_missing_age(df_train, df_median_age)
df_test = _fill_missing_age(df_test, df_median_age)

# Derive two features from the raw 'Cabin' string, then discard the column:
#   Cabin_Type - first character of the cabin string
#   Cabin_Nos  - number of whitespace-separated entries in the cabin string
# Rows whose 'Cabin' is missing are left as NaN in both new columns
# (the assignment aligns on the index of the non-null subset).
for frame in (df_train, df_test):
    known_cabins = frame['Cabin'].dropna()
    frame['Cabin_Type'] = known_cabins.map(lambda cabin: cabin[0])
    frame['Cabin_Nos'] = known_cabins.map(lambda cabin: len(cabin.split()))

df_train = df_train.drop(columns='Cabin')
df_test = df_test.drop(columns='Cabin')

for frame in (df_train, df_test):
    # Binary-encode 'Sex': 1 for 'male', 0 for every other value.
    frame['Sex'] = (frame['Sex'] == 'male').astype('int64')
    # Missing 'Embarked' values are replaced with the constant 'S'.
    frame['Embarked'] = frame['Embarked'].fillna('S')

# One-hot encode 'Embarked': one 'Embarked_<value>' indicator column per
# distinct value, appended on the right; the original column is removed.
# NOTE(review): train and test are encoded independently, so each frame only
# gets indicator columns for the values it actually contains.
embarked_train = pd.get_dummies(df_train['Embarked'], prefix='Embarked')
df_train = pd.concat([df_train.drop(columns=['Embarked']), embarked_train], axis=1)

embarked_test = pd.get_dummies(df_test['Embarked'], prefix='Embarked')
df_test = pd.concat([df_test.drop(columns=['Embarked']), embarked_test], axis=1)

# One-hot encode 'Cabin_Type' into 'Cabin_Type_<letter>' indicator columns.
# Rows whose Cabin_Type is NaN get no indicator set (get_dummies ignores NaN
# unless dummy_na=True).
new_df_train = pd.get_dummies(df_train['Cabin_Type'], prefix='Cabin_Type')
df_train = pd.concat([df_train, new_df_train], axis=1)

# Encode the test frame against the TRAINING categories. Encoding the two
# frames independently yields different column sets whenever a cabin type
# occurs in only one of them, which shifts feature positions downstream.
# reindex() puts the test indicators in exactly the training column order,
# adds an all-zero column for any type missing from the test data, and drops
# types the training data never contained.
new_df_test = (
    pd.get_dummies(df_test['Cabin_Type'], prefix='Cabin_Type')
    .reindex(columns=new_df_train.columns, fill_value=0)
)
df_test = pd.concat([df_test, new_df_test], axis=1)

df_train = df_train.drop(columns=['Cabin_Type'])
df_test = df_test.drop(columns=['Cabin_Type'])

# One-hot encode 'Pclass': one 'Pclass_<value>' indicator column per distinct
# value, appended on the right; the original column is removed.
# NOTE(review): train and test are encoded independently, so each frame only
# gets indicator columns for the values it actually contains.
pclass_train = pd.get_dummies(df_train['Pclass'], prefix='Pclass')
df_train = pd.concat([df_train.drop(columns=['Pclass']), pclass_train], axis=1)

pclass_test = pd.get_dummies(df_test['Pclass'], prefix='Pclass')
df_test = pd.concat([df_test.drop(columns=['Pclass']), pclass_test], axis=1)

# Rows with no cabin information (NaN Cabin_Nos) default to a count of 1.
df_train['Cabin_Nos'] = df_train['Cabin_Nos'].fillna(1)
df_test['Cabin_Nos'] = df_test['Cabin_Nos'].fillna(1)

# Give the test frame an all-zero column for every Cabin_Type_* indicator that
# exists in the training frame but not in the test frame. The original code
# hard-coded 'Cabin_Type_T' and assigned it unconditionally, which would wipe
# real values if the column were already present and would miss any other
# absent indicator.
for col in df_train.columns:
    if str(col).startswith('Cabin_Type_') and col not in df_test.columns:
        df_test[col] = 0

In [17]:
# Identifier / free-text columns and the target are not model inputs.
non_feature_cols = ['PassengerId', 'Survived', 'Name', 'Ticket']
inp_col_list = [col for col in df_train.columns if col not in non_feature_cols]

X_train = df_train[inp_col_list]
Y_train = df_train['Survived']

# Build the test matrix from the SAME column list in the SAME order as the
# training matrix. Deriving the list from df_test's own columns is wrong here:
# 'Cabin_Type_T' is appended to df_test after the Pclass_* indicators, while in
# df_train it sits before them, so the positional arrays handed to the scaler
# and the model would pair weights with the wrong features.
# A column missing from df_test raises KeyError instead of misaligning silently.
X_test = df_test[inp_col_list]

In [18]:
# Learn the per-feature mean / standard deviation from the training data only.
standard_scaler = StandardScaler()
x_train_scaled = standard_scaler.fit_transform(X_train.values)
X_train_scaled = pd.DataFrame(x_train_scaled)

# Apply the training statistics to the test data. Calling fit_transform()
# here would re-fit the scaler on the test set, so the model would see test
# features on a different scale than the one it was trained on.
x_test_scaled = standard_scaler.transform(X_test.values)
X_test_scaled = pd.DataFrame(x_test_scaled)

# Hold out 20% of the scaled training rows for validation (fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    X_train_scaled, Y_train, test_size=0.2, random_state=0
)

In [46]:
# Model dimensions.
num_classes = 2                       # width of the softmax output
num_features = len(x_train.columns)   # one input per column of x_train

# Optimisation settings.
learning_rate = 3e-4
batch_size = 64
training_steps = 1000                 # number of mini-batches taken from train_data
display_step = 100                    # metrics are printed every display_step steps

In [47]:
# Endless stream of mini-batches built from the training split.
# NOTE(review): the shuffle buffer holds 100 elements and is applied after
# repeat(); if x_train has more than 100 rows the shuffling is only local.
# Consider shuffle(len(x_train)) placed before repeat().
train_data = (
    tf.data.Dataset.from_tensor_slices((x_train.values, y_train.values))
    .repeat()
    .shuffle(100)
    .batch(batch_size)
    .prefetch(1)
)

In [48]:
# Trainable parameters of the linear layer, kept in float64.
# W has shape (num_features, num_classes) and starts from random normal values;
# b has shape (num_classes,) and starts at zero.
# NOTE(review): no random seed is set, so W differs on every kernel restart.
W = tf.Variable(
    tf.random.normal(shape=[num_features, num_classes], dtype=tf.float64),
    name="weight",
)
b = tf.Variable(
    tf.zeros(shape=[num_classes], dtype=tf.float64),
    name="bias",
)

In [49]:
# Stochastic gradient descent using the configured step size.
optimizer = tf.optimizers.SGD(learning_rate=learning_rate)

In [50]:
def logistic_regression(x):
    """Return softmax class probabilities for the inputs `x`.

    Computes the linear scores ``x @ W + b`` with the notebook-level variables
    W and b, then normalises them with tf.nn.softmax.
    """
    logits = tf.matmul(x, W) + b
    return tf.nn.softmax(logits)

In [51]:
def cross_entropy(y_pred, y_true):
    """Cross-entropy between predicted probabilities and integer labels.

    y_pred: predicted class probabilities.
    y_true: integer class labels; one-hot encoded here with depth num_classes.

    Returns a scalar: the negative log-probability of the true class, SUMMED
    over every example passed in (not averaged).
    """
    one_hot_labels = tf.one_hot(y_true, depth=num_classes, dtype='float64')
    # Clip so that log() is never evaluated at 0.
    safe_probs = tf.clip_by_value(y_pred, 1e-9, 1.)
    # reduce_sum without an axis collapses classes AND examples into one scalar.
    # The original wrapped this in reduce_mean, which is a no-op on a scalar.
    # NOTE(review): a per-example mean (reduce_sum(..., axis=1) followed by
    # reduce_mean) would make the loss independent of batch size, but it also
    # divides the gradients by the batch size, so learning_rate would need
    # retuning at the same time.
    summed_log_likelihood = tf.reduce_sum(one_hot_labels * tf.math.log(safe_probs))
    return -summed_log_likelihood

In [52]:
def accuracy(y_pred, y_true):
    """Return the fraction of rows whose arg-max prediction equals the label.

    y_pred: per-class scores or probabilities; the predicted class is the
            arg-max along axis 1.
    y_true: class labels, cast to int64 before the comparison.
    """
    predicted_class = tf.argmax(y_pred, 1)
    true_class = tf.cast(y_true, tf.int64)
    is_correct = tf.equal(predicted_class, true_class)
    return tf.reduce_mean(tf.cast(is_correct, tf.float32))

In [53]:
def run_optimization(x, y):
    """Apply one gradient-descent update to W and b using the batch (x, y).

    x: batch of input features.
    y: batch of integer class labels.
    """
    # Only the forward pass has to be recorded by the tape.
    with tf.GradientTape() as g:
        pred = logistic_regression(x)
        loss = cross_entropy(pred, y)

    # Differentiate and update outside the tape context: nothing below needs
    # to be recorded, and this is the usage shown in the GradientTape docs.
    gradients = g.gradient(loss, [W, b])
    optimizer.apply_gradients(zip(gradients, [W, b]))

In [54]:
# Run `training_steps` mini-batch updates; every `display_step` steps, print
# the loss and accuracy measured on the batch that was just used for the update.
for step, (batch_x, batch_y) in enumerate(train_data.take(training_steps), start=1):
    run_optimization(batch_x, batch_y)
    if step % display_step:
        continue
    batch_pred = logistic_regression(batch_x)
    batch_loss = cross_entropy(batch_pred, batch_y)
    batch_acc = accuracy(batch_pred, batch_y)
    print("step: %i, loss: %f, accuracy: %f" % (step, batch_loss, batch_acc))

2022-05-24 22:29:44.932721: E tensorflow/stream_executor/cuda/cuda_blas.cc:226] failed to create cublas handle: CUBLAS_STATUS_NOT_INITIALIZED
2022-05-24 22:29:44.932784: W tensorflow/core/framework/op_kernel.cc:1745] OP_REQUIRES failed at matmul_op_impl.h:438 : INTERNAL: Attempting to perform BLAS operation using StreamExecutor without BLAS support


InternalError: Attempting to perform BLAS operation using StreamExecutor without BLAS support [Op:MatMul]

In [147]:
# Score the model on the held-out split (x_val / y_val).
# NOTE(review): the message says "Test", but the data is the validation split
# produced by train_test_split.
val_probs = logistic_regression(x_val.values)
val_accuracy = accuracy(val_probs, y_val)
print("Test Accuracy: %f" % val_accuracy)

Test Accuracy: 0.810056


In [148]:
# Class probabilities for the scaled test rows, reduced to the most likely
# class index per row and converted to a NumPy array.
test_probs = logistic_regression(X_test_scaled.values)
pred = tf.argmax(input=test_probs, axis=1).numpy()

In [149]:
# Attach the predictions to the test frame (aligned on the row index), keep
# only the id and prediction columns, and write them out without the index.
survived = pd.Series(pred, name='Survived')
submission = pd.concat([df_test, survived], axis=1)
submission = submission[['PassengerId', 'Survived']]
submission.to_csv('../submissions/tf_logreg2_submission.csv', index=False)