### Try Model: TensorFlow Neural Network

In [16]:
from startmodskl import *
from startmodtf import *

# Handle rows containing NaN via the project helper; per the original note the
# NaNs are replaced with a mean value (helper not shown here — TODO confirm).
# `train_data` must already exist from an earlier cell.
train_data = StartMod.process_nan_rows(train_data)
# List every column as (name, position, dtype) to see what is left to clean up.
StartMod.idx_columns(train_data)

[('PassengerId', 0, dtype('int64')),
 ('Survived', 1, dtype('int64')),
 ('Pclass', 2, dtype('int64')),
 ('Name', 3, dtype('O')),
 ('Sex', 4, dtype('O')),
 ('Age', 5, dtype('float64')),
 ('SibSp', 6, dtype('int64')),
 ('Parch', 7, dtype('int64')),
 ('Ticket', 8, dtype('O')),
 ('Fare', 9, dtype('float64')),
 ('Cabin', 10, dtype('O')),
 ('Embarked', 11, dtype('O'))]

In [17]:
# Collect the names of the columns that still contain NaN values
# (presumably what `nan_columns` reports — helper not shown here).
nan_cols = StartML.nan_columns(train_data)
nan_cols


['Cabin', 'Embarked']

In [18]:
# Also schedule column 'Ticket' for removal together with the NaN columns.
# The membership check keeps this cell idempotent: re-running it must not
# append a second 'Ticket' entry to the list.
if 'Ticket' not in nan_cols:
    nan_cols.append('Ticket')
nan_cols

['Cabin', 'Embarked', 'Ticket']

In [19]:
# The columns collected in nan_cols are object-typed and not needed for the
# analysis, so remove them and show the remaining column layout.
train_data = train_data.drop(columns=nan_cols)
StartMod.idx_columns(train_data)

[('PassengerId', 0, dtype('int64')),
 ('Survived', 1, dtype('int64')),
 ('Pclass', 2, dtype('int64')),
 ('Name', 3, dtype('O')),
 ('Sex', 4, dtype('O')),
 ('Age', 5, dtype('float64')),
 ('SibSp', 6, dtype('int64')),
 ('Parch', 7, dtype('int64')),
 ('Fare', 8, dtype('float64'))]

In [20]:
# Feature engineering: derive a new column 'Title' from column 'Name' using the
# listed honorifics, then drop column 'Name'.
# The helper's return value is ignored, so it appears to modify train_data in place.
# NOTE(review): in the output shown further down, female passengers are labelled
# 'Mr' and other rows end up as 'Unknown'. Possibly 'Mr' is matched before 'Mrs',
# and 'Miss'/'Master' are absent from the list — confirm against feature_engineering.
StartMod.feature_engineering(train_data, 'Name', 'Title', ['Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sir', 'Captain'])
train_data = train_data.drop('Name', axis=1)

In [21]:
# Re-check for NaN columns now that 'Title' has been added.
StartML.nan_columns(train_data)

['Title']

In [22]:
# Fill the NaN entries of column 'Title'; the displayed result shows them as 'Unknown'.
# The return value is only displayed, not assigned — later cells rely on
# train_data itself having been updated (in-place behaviour assumed, TODO confirm).
StartML.process_nan_simply(train_data)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title
0,1,0,3,male,22.0,1,0,7.2500,Mr
1,2,1,1,female,38.0,1,0,71.2833,Mr
2,3,1,3,female,26.0,0,0,7.9250,Unknown
3,4,1,1,female,35.0,1,0,53.1000,Mr
4,5,0,3,male,35.0,0,0,8.0500,Mr
5,6,0,3,male,44.5,0,0,8.4583,Mr
6,7,0,1,male,54.0,0,0,51.8625,Mr
7,8,0,3,male,2.0,3,1,21.0750,Unknown
8,9,1,3,female,27.0,0,2,11.1333,Mr
9,10,1,2,female,14.0,1,0,30.0708,Mr


In [23]:
# Inspect the remaining columns: 'Sex' and 'Title' are still object dtype
# and have to be encoded before modelling.
StartML.idx_columns(train_data)

[('PassengerId', 0, dtype('int64')),
 ('Survived', 1, dtype('int64')),
 ('Pclass', 2, dtype('int64')),
 ('Sex', 3, dtype('O')),
 ('Age', 4, dtype('float64')),
 ('SibSp', 5, dtype('int64')),
 ('Parch', 6, dtype('int64')),
 ('Fare', 7, dtype('float64')),
 ('Title', 8, dtype('O'))]

In [24]:
# Encode the object columns, starting with 'Sex' (only this column is encoded here;
# the displayed result shows female -> 0 and male -> 1).
# The result is not assigned; train_data shows the encoded values afterwards,
# so the helper appears to work in place — TODO confirm.
StartMod.encode_label_column(train_data, 'Sex')

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title
0,1,0,3,1,22.0,1,0,7.2500,Mr
1,2,1,1,0,38.0,1,0,71.2833,Mr
2,3,1,3,0,26.0,0,0,7.9250,Unknown
3,4,1,1,0,35.0,1,0,53.1000,Mr
4,5,0,3,1,35.0,0,0,8.0500,Mr
5,6,0,3,1,44.5,0,0,8.4583,Mr
6,7,0,1,1,54.0,0,0,51.8625,Mr
7,8,0,3,1,2.0,3,1,21.0750,Unknown
8,9,1,3,0,27.0,0,2,11.1333,Mr
9,10,1,2,0,14.0,1,0,30.0708,Mr


In [25]:
# One-hot encode column 'Title'. The returned values are wrapped in a fresh
# DataFrame, which therefore carries integer column labels instead of names.
# Judging by the new_data.head() and train_data.head() outputs: columns 0-3 hold
# the one-hot Title flags and columns 4-11 are PassengerId, Survived, Pclass,
# Sex, Age, SibSp, Parch, Fare (in that order).
new_values = StartMod.encode_label_column(train_data, 'Title', onehot=True)
new_data = pd.DataFrame(new_values)

In [26]:
# Confirm that the encoded frame contains no NaN columns.
StartML.nan_columns(new_data)


[]

In [27]:
# Preview the first rows of the encoded frame (integer column labels).
new_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.0,1.0,0.0,0.0,1.0,0.0,3.0,1.0,22.0,1.0,0.0,7.25
1,0.0,1.0,0.0,0.0,2.0,1.0,1.0,0.0,38.0,1.0,0.0,71.2833
2,0.0,0.0,0.0,1.0,3.0,1.0,3.0,0.0,26.0,0.0,0.0,7.925
3,0.0,1.0,0.0,0.0,4.0,1.0,1.0,0.0,35.0,1.0,0.0,53.1
4,0.0,1.0,0.0,0.0,5.0,0.0,3.0,1.0,35.0,0.0,0.0,8.05


In [28]:
 # For comparison: train_data keeps its column names; 'Title' now holds integer codes.
 train_data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Title
0,1,0,3,1,22.0,1,0,7.25,1
1,2,1,1,0,38.0,1,0,71.2833,1
2,3,1,3,0,26.0,0,0,7.925,3
3,4,1,1,0,35.0,1,0,53.1,1
4,5,0,3,1,35.0,0,0,8.05,1


In [29]:
# NOTE(review): the result of idx_columns is discarded here — only the last
# expression of the cell (the shape) is displayed.
StartML.idx_columns(new_data)
# Number of rows and columns of the encoded frame.
new_data.shape

(891, 12)

In [30]:
# The data is ready: no NaN columns and no object columns remain.
# Train a neural network via StartModTF.keras_sequential to predict 'Survived',
# which sits at column index 5 of new_data (see the new_data.head() output).
# NOTE(review): column 4 (PassengerId) is still part of new_data — check whether
# the helper feeds it to the network as a feature. No random seed is set in this
# cell, so results may differ between runs (TODO confirm how the helper splits
# the data and initialises the model).
model, y_true, y_pred = StartModTF.keras_sequential(new_data, 5)


Epoch 1/11
Epoch 2/11
Epoch 3/11
Epoch 4/11
Epoch 5/11
Epoch 6/11
Epoch 7/11
Epoch 8/11
Epoch 9/11
Epoch 10/11
Epoch 11/11


In [31]:
# Binarise the network outputs: anything strictly above 0.5 becomes class 1,
# everything else class 0. The result is a flat integer vector.
y_pred = (np.asarray(y_pred).ravel() > 0.5).astype(int)

# Sanity check: true labels and predictions must have the same length
len(y_true), len(y_pred)

(179, 179)

In [32]:
# Confusion matrix of true vs. predicted labels.
# Presumably scikit-learn's confusion_matrix (rows = true class, columns =
# predicted class), brought in by one of the star imports — TODO confirm.
confusion_matrix(y_true, y_pred)

array([[95, 15],
       [19, 50]])

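### Result:
#### Correct predictions: 145 of 179 (95 + 50, the diagonal of the confusion matrix), i.e. about 81% accuracy
#### Wrong predictions: 34 of 179 (15 + 19, the off-diagonal entries)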

### So far, this is the best result we have obtained.