In [159]:
import pandas as pd
import scipy as sp

from sklearn.preprocessing import OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler


In [160]:
# training data
df=pd.read_csv("train.csv")
Y=df["Survived"]
# test data
test_df=pd.read_csv("test.csv")

validate_df=test_df.copy()
validate_df

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0000,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S
...,...,...,...,...,...,...,...,...,...,...,...
413,1305,3,"Spector, Mr. Woolf",male,,0,0,A.5. 3236,8.0500,,S
414,1306,1,"Oliva y Ocana, Dona. Fermina",female,39.0,0,0,PC 17758,108.9000,C105,C
415,1307,3,"Saether, Mr. Simon Sivertsen",male,38.5,0,0,SOTON/O.Q. 3101262,7.2500,,S
416,1308,3,"Ware, Mr. Frederick",male,,0,0,359309,8.0500,,S


In [161]:
# the function below performs all that entails with feature selection such as data cleaning and preprocessing
def dfPerform(df,type):
    # drop some columns that would not lead to people dying
    cols_to_drop=["Cabin","Ticket","Name"]
    df.drop(cols_to_drop,axis=1,inplace=True)
    # perform ordinal encoding on non numerical features
    df_objects=df.select_dtypes(include="object")
    # drop the object features and remain with a df with numerical features only
    df.drop(df_objects.columns,axis=1,inplace=True)
    col_names=df_objects.columns
    # instantiate the ordinal encoder
    enc=OrdinalEncoder()
    df_objects=pd.DataFrame(enc.fit_transform(df_objects))
    # add back the column name to the encoded dataframe
    df_objects.columns=col_names
    # concatenate the encoded dataframe and the original numeric dataframe
    df=pd.concat([df,df_objects],axis=1,join="inner")
    # array for columns with null values
    cols_withNa=[]
    # instantiate the SimpleImputer to replace the null values
    impute=SimpleImputer(strategy="median",fill_value="median")
    for col in df.columns:
        # check if a column is null
        if df[col].isna().sum() >1:
            # append the column with null values
            cols_withNa.append(col)
            # impute them
            imputed_cols=pd.DataFrame(impute.fit_transform(df[cols_withNa]))
            imputed_cols.columns=cols_withNa

    # drop columns with null values  
    df.drop(cols_withNa,axis=1,inplace=True)
    # concatenate the imputed columns with the original dataframe
    df=pd.concat([df,imputed_cols],axis=1)
    
    if(type == "train"):
        for kol in df.columns:
            # perform correlation computation to get whether some certain features would really affect the survival rate
            stats,pvalue=sp.stats.pearsonr(df["Survived"],df[kol])
            print(f"{kol} has a pvalue of {pvalue}")
            # drop columns that have a Pvalue greater than 0.1
            if (pvalue >0.1):
                df.drop([kol],axis=1,inplace=True)
    return df

In [162]:
# separate the target and predictor variables
real_df=dfPerform(df=df,type='train')
y=real_df["Survived"]
train_y=y
x_train=real_df.drop(["Survived"],axis=1)

PassengerId has a pvalue of 0.8813657768798073
Survived has a pvalue of 0.0
Pclass has a pvalue of 2.5370473879814938e-25
SibSp has a pvalue of 0.29224392869829086
Parch has a pvalue of 0.014799245374727947
Fare has a pvalue of 6.1201893419246185e-15
Sex has a pvalue of 1.406066130879422e-69
Age has a pvalue of 0.05276068847585567
Embarked has a pvalue of 4.811092694586917e-07


In [163]:
validation_df=dfPerform(df=validate_df,type="test")
for col in list(validation_df.columns):
    if( col not in list(x_train.columns)):
        validation_df.drop([col],axis=1,inplace=True)
validation_df

Unnamed: 0,Pclass,Parch,Fare,Sex,Embarked,Age
0,3,0,7.8292,1.0,1.0,34.5
1,3,0,7.0000,0.0,2.0,47.0
2,2,0,9.6875,1.0,1.0,62.0
3,3,0,8.6625,1.0,2.0,27.0
4,3,1,12.2875,0.0,2.0,22.0
...,...,...,...,...,...,...
413,3,0,8.0500,1.0,2.0,27.0
414,1,0,108.9000,0.0,0.0,39.0
415,3,0,7.2500,1.0,2.0,38.5
416,3,0,8.0500,1.0,2.0,27.0


In [164]:
print(x_train)
train_x=x_train

     Pclass  Parch     Fare  Sex   Age  Embarked
0         3      0   7.2500  1.0  22.0       2.0
1         1      0  71.2833  0.0  38.0       0.0
2         3      0   7.9250  0.0  26.0       2.0
3         1      0  53.1000  0.0  35.0       2.0
4         3      0   8.0500  1.0  35.0       2.0
..      ...    ...      ...  ...   ...       ...
886       2      0  13.0000  1.0  27.0       2.0
887       1      0  30.0000  0.0  19.0       2.0
888       3      2  23.4500  0.0  28.0       2.0
889       1      0  30.0000  1.0  26.0       0.0
890       3      0   7.7500  1.0  32.0       1.0

[891 rows x 6 columns]


In [165]:
# ;ets performs some splits!!
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(x_train,y,train_size=0.8,test_size=0.2,random_state=0)
x_train

Unnamed: 0,Pclass,Parch,Fare,Sex,Age,Embarked
140,3,2,15.2458,0.0,28.0,0.0
439,2,0,10.5000,1.0,31.0,2.0
817,2,1,37.0042,1.0,31.0,0.0
378,3,0,4.0125,1.0,20.0,0.0
491,3,0,7.2500,1.0,21.0,2.0
...,...,...,...,...,...,...
835,1,1,83.1583,0.0,39.0,0.0
192,3,0,7.8542,0.0,19.0,2.0
629,3,0,7.7333,1.0,28.0,1.0
559,3,0,17.4000,0.0,36.0,2.0


In [166]:
y_validation=validation_df["Parch"]
x_test,a,c,d=train_test_split(validation_df,y_validation,train_size=0.43,random_state=0)
x_test


Unnamed: 0,Pclass,Parch,Fare,Sex,Embarked,Age
40,3,1,13.4167,1.0,0.0,39.0
18,3,0,7.9250,0.0,2.0,27.0
299,3,0,7.8542,1.0,2.0,29.0
79,3,0,7.7500,0.0,1.0,24.0
340,2,0,10.5000,1.0,2.0,18.0
...,...,...,...,...,...,...
323,1,0,26.5500,1.0,2.0,33.0
192,3,1,14.5000,1.0,2.0,11.5
117,3,1,16.7000,0.0,2.0,1.0
47,3,0,7.7500,1.0,1.0,27.0


In [167]:
from tensorflow import keras
from keras import layers
input_shape=[len(list(x_train.columns))]
model=keras.Sequential([
    layers.Dense(32,activation="relu",input_shape=input_shape),
    layers.BatchNormalization(),
    layers.Dropout(0.1),
    layers.Dense(64,activation="relu",kernel_initializer="he_normal",use_bias=False),
    layers.Dropout(0.1),
    layers.Dense(128,activation="relu",kernel_initializer="he_normal",use_bias=False),
    layers.Dropout(0.1),
    layers.Dense(128,activation="relu",kernel_initializer="he_normal",use_bias=False),
    layers.Dropout(0.1),
    layers.Dense(64,activation="relu",kernel_initializer="he_normal",use_bias=False),
    layers.Dropout(0.1),
    layers.Dense(32,activation="relu"),
    layers.Dropout(0.15),
    layers.Dense(32,activation="relu"),
    layers.Dense(16,activation="relu"),
    layers.Dense(8,activation="relu",kernel_initializer="he_normal",use_bias=False),
    layers.Dense(1,activation="sigmoid")
    
])

In [168]:
# # lets add ealry stopping rounds to prevent 
# from keras import callbacks
# early_stopping=callbacks.EarlyStopping(
#     min_delta=0.001,
#     patience=20,
#     restore_best_weights=True
# )


In [169]:
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["binary_accuracy"]
)

In [176]:
history=model.fit(
    x=train_x,
    y=train_y,
    batch_size=64,
    epochs=100,
    callbacks=[early_stopping]
)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

In [177]:
y_preds=model.predict(x_test)

y_preds=((y_preds>0.5).astype(int)).ravel()
y_preds



array([0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0,
       1, 1, 0])

In [178]:
from matplotlib import pyplot as plt
history_df=pd.DataFrame(history.history)
history_df


Unnamed: 0,loss,binary_accuracy
0,0.420009,0.809203
1,0.401985,0.820426
2,0.394385,0.827160
3,0.407203,0.822671
4,0.401902,0.818182
...,...,...
95,0.391320,0.828283
96,0.388147,0.840629
97,0.396430,0.828283
98,0.360910,0.841751


In [182]:
from sklearn import metrics

print(f"PRECISION {round(metrics.precision_score(y_test,y_preds)*100,2)}")
print(f"ACCURACY {round(metrics.accuracy_score(y_test,y_preds)*100,2)}")
print(f"RECALL {round(metrics.recall_score(y_test,y_preds)*100,2)}")
print(f"F1 SCORE {round(metrics.f1_score(y_test,y_preds)*100,2)}")
print(f"AUC {round(metrics.roc_auc_score(y_test,y_preds)*100,2)}")


PRECISION 100.0
ACCURACY 100.0
RECALL 100.0
F1 SCORE 100.0
AUC 100.0


## ANOTHER APPROACH TO TRAIN

In [183]:
# # let us use an input function
# import tensorflow as tf
# def make_input_fn(data_df,label_df,num_epochs=10,shuffle=True,batch_size=32):
#     def input_function(): # inner function that will be returned
#         ds=tf.data.Dataset.from_tensor_slices((dict(data_df), label_df))
#         if shuffle:
#             ds=ds.shuffle(1000) #randomise the order of the data
#         ds=ds.batch(batch_size).repeat(num_epochs)
#         return ds #return a batch of the dataset
#     return input_function #return the function object for use

# train_input_fn=make_input_fn(train_x,train_y) #here we will call the input function that was returned
# eval_input_fn=make_input_fn(x_test,y_test,num_epochs=1,shuffle=False)
            

In [184]:
# linear_est=tf.feature_column.LinearClassifier(list(train_x.columns))

# linear_est.train(train_input_fn)
# result=linear_est.evaluate(eval_input_fn)
# print(result["accuracy"])