In [25]:
import pandas as pd
import numpy as np
import time
import sys
sys.path.append('..')
from utilities.utils import Util
from importlib import reload
import torch
import torch.nn as nn
import sklearn
from sklearn.preprocessing import LabelEncoder, minmax_scale

In [26]:
# define our dataframes
df_train = pd.read_csv('./data/train.csv')
df_test = pd.read_csv('./data/test.csv')

In [27]:
# check features we are provided
sample = df_train.iloc[0]
sample

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                               22.0
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [28]:
# lets drop some features we will for sure not need
drop_columns = ['PassengerId', 'Name', 'Ticket']

# drop the features we don't need
df_train.drop(labels=drop_columns,axis=1,inplace=True)
df_test.drop(labels=drop_columns,axis=1,inplace=True)
print(f"Dropped {drop_columns}")

Dropped ['PassengerId', 'Name', 'Ticket']


In [29]:
# Get current number of data points
sample_ct_train = len(df_train)
sample_ct_test = len(df_test)
print(f"Train Samples: {sample_ct_train}")
print(f"Test Samples: {sample_ct_test}")

Train Samples: 891
Test Samples: 418


In [30]:
# for our remaining features, lets determine which features have NaN fields in the data
NaN_features = Util.NaN.get_nan_features_dict(df_train)
for feature in NaN_features:
    print(f"{feature}: {NaN_features[feature]}%")

Age: 0.1987%
Cabin: 0.771%
Embarked: 0.0022%


In [31]:
# remove the NaN datapoints from the dataset
removal_thresh = 0.01
for NaN_feature, percentage in NaN_features.items():
    if percentage > removal_thresh:
        continue
    else:
        print(f"Dropping all samples with feature \"{NaN_feature}\" == NaN from table")
        df_train.dropna(axis='index',subset=NaN_feature,inplace=True)

df_train.head(1)

Dropping all samples with feature "Embarked" == NaN from table


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,0,3,male,22.0,1,0,7.25,,S


In [32]:
# drop the column for cabin room id
print(f"Dropping {'Cabin'} from dataframe")
df_train.drop(axis=1, columns=['Cabin'], inplace=True)
sample = df_train.head(1)
sample

Dropping Cabin from dataframe


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S


In [33]:
# lets encode our features
encoder = LabelEncoder()
df_train['Sex'] = encoder.fit_transform(df_train['Sex'])
df_train['Pclass'] = encoder.fit_transform(df_train['Pclass'])
df_train['Embarked'] = encoder.fit_transform(df_train['Embarked'])
sample = df_train.head(1)
sample

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,2,1,22.0,1,0,7.25,2


In [34]:
# lets normalize our data
normalize_features = ['Age', 'SibSp', 'Parch', 'Fare']
for feature in normalize_features:
    if Util.NaN.contains_NaN(df_train,feature):
        print(f"Not normalizing feature: {feature}, found field containing NaN.")
        continue

    min_val, max_val = df_train[feature].min(), df_train[feature].max()
    normalized_values = minmax_scale(df_train[feature],feature_range=(0,1))
    df_train[feature] = normalized_values

sample = df_train.head(2)
sample

Not normalizing feature: Age, found field containing NaN.


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,2,1,22.0,0.125,0.0,0.014151,2
1,1,0,0,38.0,0.125,0.0,0.139136,0


In [10]:
df_train_dropped_nan_ages = df_train.dropna()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S


In [156]:
class Deep(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(8, 60)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Linear(60, 60)
        self.act2 = nn.ReLU()
        self.layer3 = nn.Linear(60, 60)
        self.act3 = nn.ReLU()
        self.output = nn.Linear(60, 1)
        self.sigmoid = nn.Sigmoid()
 
    def forward(self, x):
        x = self.act1.forward(self.layer1.forward(x))
        x = self.act2.forward(self.layer2.forward(x))
        x = self.act3.forward(self.layer3.forward(x))
        x = self.sigmoid.forward(self.output.forward(x))
        return x

In [157]:
myNet = Deep()
for _ in range(25):
    sample = df_train_dropped_indexes_age.sample().values[0].astype(np.float32)
    tensor = torch.tensor(sample)
    print(myNet.forward(tensor))

tensor([0.4825], grad_fn=<SigmoidBackward0>)
tensor([0.4723], grad_fn=<SigmoidBackward0>)
tensor([0.4796], grad_fn=<SigmoidBackward0>)
tensor([0.4851], grad_fn=<SigmoidBackward0>)
tensor([0.4816], grad_fn=<SigmoidBackward0>)
tensor([0.4750], grad_fn=<SigmoidBackward0>)
tensor([0.4705], grad_fn=<SigmoidBackward0>)
tensor([0.4800], grad_fn=<SigmoidBackward0>)
tensor([0.4791], grad_fn=<SigmoidBackward0>)
tensor([0.4722], grad_fn=<SigmoidBackward0>)
tensor([0.4723], grad_fn=<SigmoidBackward0>)
tensor([0.4780], grad_fn=<SigmoidBackward0>)
tensor([0.4781], grad_fn=<SigmoidBackward0>)
tensor([0.4729], grad_fn=<SigmoidBackward0>)
tensor([0.4780], grad_fn=<SigmoidBackward0>)
tensor([0.4706], grad_fn=<SigmoidBackward0>)
tensor([0.4723], grad_fn=<SigmoidBackward0>)
tensor([0.4725], grad_fn=<SigmoidBackward0>)
tensor([0.4721], grad_fn=<SigmoidBackward0>)
tensor([0.4723], grad_fn=<SigmoidBackward0>)
tensor([0.4800], grad_fn=<SigmoidBackward0>)
tensor([0.4841], grad_fn=<SigmoidBackward0>)
tensor([0.