In [1]:
import tensorflow as tf
import pandas as pd

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

import matplotlib.pyplot as plt

# Titanic data

In [2]:
titanic_df = pd.read_csv('~/dropbox/python숙달/data/titanic.csv')

In [3]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


# 나이 평균입력

In [4]:
titanic_df['Age'][pd.isnull(titanic_df['Age'])] = titanic_df.Age.mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  titanic_df['Age'][pd.isnull(titanic_df['Age'])] = titanic_df.Age.mean()


In [5]:
# binary label
titanic_df['Survived'].unique()

array([0, 1], dtype=int64)

In [6]:
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          891 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [7]:
titanic_df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,22.0,0.0,0.0,7.9104
50%,446.0,0.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,1.0,3.0,35.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


In [8]:
X = titanic_df[['Age', 'Sex', 'Fare', 'Pclass']]

In [9]:
y = titanic_df.Survived

In [10]:
X = pd.get_dummies(X, columns=['Sex', 'Pclass'])

In [11]:
## 다중 공선성 문제를 방지하기 위해서 ONHOT ENCORDING한 COLUMN 하나를 제외하자. 
# reference : https://stats.stackexchange.com/questions/231285/dropping-one-of-the-columns-when-using-one-hot-encoding
X = X.iloc[:,:-1]

In [12]:
# 연속형 변수만 scaling
scaler = MinMaxScaler()
x_scaled = X[['Age', 'Fare']]
scaler.fit(x_scaled)
x_scaled = pd.DataFrame(scaler.transform(x_scaled), columns=['Age', 'Fare'])
print(scaler.transform([[25, 30]]))

[[0.30887158 0.0585561 ]]


In [13]:
cate_df = X[['Sex_female', 'Pclass_1', 'Pclass_2']]

In [14]:
df = pd.concat([x_scaled, cate_df], axis = 1)

# train test split

In [15]:
x_train, x_test, y_train, y_test = train_test_split(df, y, test_size=0.3, stratify= y)

# tf version

In [17]:
x_train = tf.constant(x_train, tf.float32)
y_train = tf.constant(y_train, tf.float32)
# change shape because match row
y_train = tf.reshape(y_train, shape=(-1,1))


In [18]:
initX = tf.initializers.GlorotUniform()
w = tf.Variable(initX(shape=[5,1]), tf.float32)
b = tf.Variable(initX(shape = [1]), tf.float32)

In [19]:
def compute_loss():
    z = tf.matmul(x_train, w) + b
    cost_i = tf.nn.sigmoid_cross_entropy_with_logits( labels=y_train , logits= z)
    cost = tf.reduce_mean( cost_i )
    return cost

In [20]:
optimizer = Adam(0.1)
for i in range(1000):
    optimizer.minimize(compute_loss, var_list = [w, b])
    print(i, "cost: ", compute_loss().numpy())

0 cost:  0.86545444
1 cost:  0.8121248
2 cost:  0.7700056
3 cost:  0.73850447
4 cost:  0.71489644
5 cost:  0.6960288
6 cost:  0.67956215
7 cost:  0.664064
8 cost:  0.6488452
9 cost:  0.63368034
10 cost:  0.61858153
11 cost:  0.603673
12 cost:  0.58912915
13 cost:  0.57514256
14 cost:  0.56190085
15 cost:  0.5495695
16 cost:  0.5382762
17 cost:  0.52810085
18 cost:  0.5190695
19 cost:  0.5111554
20 cost:  0.50428635
21 cost:  0.49835584
22 cost:  0.49323672
23 cost:  0.4887958
24 cost:  0.48490584
25 cost:  0.48145476
26 cost:  0.47835177
27 cost:  0.4755291
28 cost:  0.47294194
29 cost:  0.47056475
30 cost:  0.46838707
31 cost:  0.46640837
32 cost:  0.46463248
33 cost:  0.4630636
34 cost:  0.46170273
35 cost:  0.46054465
36 cost:  0.45957816
37 cost:  0.4587856
38 cost:  0.45814443
39 cost:  0.45762897
40 cost:  0.45721343
41 cost:  0.45687318
42 cost:  0.45658755
43 cost:  0.4563403
44 cost:  0.45612058
45 cost:  0.45592204
46 cost:  0.4557425
47 cost:  0.45558262
48 cost:  0.45544395

390 cost:  0.45393664
391 cost:  0.45393664
392 cost:  0.45393664
393 cost:  0.45393664
394 cost:  0.45393664
395 cost:  0.45393664
396 cost:  0.45393667
397 cost:  0.45393667
398 cost:  0.45393673
399 cost:  0.45393667
400 cost:  0.45393667
401 cost:  0.45393667
402 cost:  0.45393667
403 cost:  0.45393667
404 cost:  0.45393667
405 cost:  0.45393673
406 cost:  0.45393667
407 cost:  0.45393667
408 cost:  0.45393664
409 cost:  0.45393664
410 cost:  0.45393667
411 cost:  0.45393667
412 cost:  0.45393664
413 cost:  0.45393667
414 cost:  0.45393667
415 cost:  0.45393667
416 cost:  0.45393667
417 cost:  0.45393667
418 cost:  0.45393667
419 cost:  0.45393664
420 cost:  0.45393667
421 cost:  0.45393667
422 cost:  0.45393667
423 cost:  0.45393667
424 cost:  0.45393667
425 cost:  0.45393667
426 cost:  0.45393667
427 cost:  0.45393667
428 cost:  0.45393667
429 cost:  0.45393664
430 cost:  0.45393667
431 cost:  0.45393667
432 cost:  0.45393667
433 cost:  0.45393667
434 cost:  0.45393667
435 cost: 

771 cost:  0.45393667
772 cost:  0.45393667
773 cost:  0.45393667
774 cost:  0.45393667
775 cost:  0.45393667
776 cost:  0.45393667
777 cost:  0.45393667
778 cost:  0.45393667
779 cost:  0.45393667
780 cost:  0.45393667
781 cost:  0.45393667
782 cost:  0.45393667
783 cost:  0.45393667
784 cost:  0.45393667
785 cost:  0.45393667
786 cost:  0.45393667
787 cost:  0.45393667
788 cost:  0.45393667
789 cost:  0.45393667
790 cost:  0.45393667
791 cost:  0.45393667
792 cost:  0.45393667
793 cost:  0.45393667
794 cost:  0.45393667
795 cost:  0.45393667
796 cost:  0.45393667
797 cost:  0.45393667
798 cost:  0.45393667
799 cost:  0.45393667
800 cost:  0.45393667
801 cost:  0.45393667
802 cost:  0.45393667
803 cost:  0.45393667
804 cost:  0.45393667
805 cost:  0.45393667
806 cost:  0.45393667
807 cost:  0.45393667
808 cost:  0.45393667
809 cost:  0.45393667
810 cost:  0.45393667
811 cost:  0.45393667
812 cost:  0.45393667
813 cost:  0.45393667
814 cost:  0.45393667
815 cost:  0.45393667
816 cost: 

In [21]:
def hxfn( xdata):
    xd = tf.constant( xdata, tf.float32)
    z = tf.matmul(xd, w) + b
    hx = tf.nn.sigmoid(z)
    return (hx.numpy() > 0.5) + 0

In [22]:
df.columns

Index(['Age', 'Fare', 'Sex_female', 'Pclass_1', 'Pclass_2'], dtype='object')

In [23]:
hxfn([[0.30887158, 0.0585561, 1, 1, 0]])

array([[1]])

In [24]:
pred = hxfn(x_test)

In [25]:
accuracy_score(y_test, pred), f1_score(y_test, pred)

(0.7761194029850746, 0.6969696969696969)

# Keras version

In [26]:
initX = tf.initializers.GlorotUniform()
w = tf.Variable(initX(shape=[5,1]))
b = tf.Variable(initX(shape = [1]))

In [33]:
x_test, x_valid, y_test, y_valid = train_test_split(x_test, y_test, test_size=0.2, stratify= y_test)

In [35]:
dense = Dense( units = 1, input_dim = 5, activation = 'sigmoid')
model = Sequential([dense])
model.compile(loss = 'binary_crossentropy', optimizer = Adam(0.1), metrics = ['acc'])
earlyStop = EarlyStopping(monitor = 'loss', min_delta = 0.01, patience = 100)
h = model.fit( x_train, y_train, epochs = 1000, validation_data= (x_valid, y_valid), callbacks= [earlyStop])

Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000


Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000
Epoch 74/1000
Epoch 75/1000
Epoch 76/1000
Epoch 77/1000
Epoch 78/1000
Epoch 79/1000
Epoch 80/1000
Epoch 81/1000
Epoch 82/1000
Epoch 83/1000
Epoch 84/1000
Epoch 85/1000
Epoch 86/1000
Epoch 87/1000
Epoch 88/1000
Epoch 89/1000
Epoch 90/1000
Epoch 91/1000
Epoch 92/1000
Epoch 93/1000
Epoch 94/1000
Epoch 95/1000
Epoch 96/1000
Epoch 97/1000
Epoch 98/1000
Epoch 99/1000
Epoch 100/1000
Epoch 101/1000
Epoch 102/1000
Epoch 103/1000
Epoch 104/1000
Epoch 105/1000
Epoch 106/1000
Epoch 107/1000
Epoch 108/1000
Epoch 109/1000
Epoch 110/1000
Epoch 111/1000
Epoch 112/1000
Epoch 113/1000
Epoch 114/1000
Epoch 115/1000
Epoch 116/1000
Epoch 117/1000
Epoch 118/1000
Epoch 119/1000
Epoch 120/1000
Epoch 121/1000
Epoch 122/1000
Epoch 123/1000
Epoch 124/1000
Epoch 125/1000


Epoch 126/1000
Epoch 127/1000
Epoch 128/1000
Epoch 129/1000
Epoch 130/1000
Epoch 131/1000
Epoch 132/1000
Epoch 133/1000
Epoch 134/1000
Epoch 135/1000
Epoch 136/1000
Epoch 137/1000
Epoch 138/1000


In [36]:
w, b = dense.get_weights()

In [37]:
(model.predict([[0.30887158, 0.0585561, 1, 1, 0]])> 0.5) + 0

array([[1]])

In [39]:
model.evaluate(x_train, y_train)



[0.4475724697113037, 0.7943925261497498]

In [40]:
pred_k = (model.predict(x_test) > 0.5) + 0


In [42]:
accuracy_score(y_test, pred_k), f1_score(y_test, pred_k)

(0.794392523364486, 0.7179487179487181)