# Human Protein Atlas Image Classification

to Jan 11, 2019

class number : 28

In [1]:
import pandas     as pd
import numpy      as np
import tensorflow as tf
import cv2
import math
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import random

## utility function

In [2]:
def cmyk_array_unify(ary_c, ary_m, ary_y, ary_k):
    
    # 一次元化して配列が同じか。
    len_c = len(ary_c.reshape(-1,))
    len_m = len(ary_m.reshape(-1,))
    len_y = len(ary_y.reshape(-1,))
    len_k = len(ary_k.reshape(-1,))

    
    if( len_c - len_m + len_y - len_k ) == 0 :
       cmyk = []
       
       d_y_2 = len(ary_c)
       d_x_2 = len(ary_c[0])
       
       for i in range(d_y_2):
           d_x_3 = []
           for j in range(d_x_2):
               d_z_3 = []
               d_z_3.append(ary_c[i][j])
               d_z_3.append(ary_m[i][j])
               d_z_3.append(ary_y[i][j])
               d_z_3.append(ary_k[i][j])
               d_x_3.append(d_z_3)
           cmyk.append(d_x_3)
       return np.array(cmyk)
       
    else:
       print("配列の長さが違う。")
       return 0

In [3]:
def hpaic_image_loader(img_name, folder="./data/train/", resize=(128,128), dim_1 = True):
    """
    image loading and unifying as 3-level tensor
    """
    img_blue   = cv2.resize(cv2.imread( folder + img_name + "_blue"   + ".png", 0), dsize = resize)
    img_green  = cv2.resize(cv2.imread( folder + img_name + "_green"  + ".png", 0), dsize = resize)
    img_red    = cv2.resize(cv2.imread( folder + img_name + "_red"    + ".png", 0), dsize = resize)
    img_yellow = cv2.resize(cv2.imread( folder + img_name + "_yellow" + ".png", 0), dsize = resize)
    
    if dim_1:
        return cmyk_array_unify(img_blue, img_green, img_red, img_yellow).reshape(-1,)
        
    return cmyk_array_unify(img_blue, img_green, img_red, img_yellow)

In [4]:
def o_train_test_split(x_array, y_array, ratio=0.7):
    """
    rand
    """
    x_train_len         = int( len(x_array) * ratio )
    x_train_index_array = random.sample(range(0, len(x_array), 1), k = x_train_len)
    x_test_index_array  = list(set(range(0, len(x_array), 1)) - set(x_train_index_array))
    x_train_array       = [x_array[i] for i in x_train_index_array]
    x_test_array        = [x_array[i] for i in x_test_index_array]
    y_train_array       = [y_array[i] for i in x_train_index_array]
    y_test_array        = [y_array[i] for i in x_test_index_array]
    
    return (x_train_array, x_test_array, y_train_array, y_test_array)

## Tensorflow utility function

In [5]:
def weight_variable(shape):
  initial = tf.truncated_normal(shape, stddev=0.1)
  return tf.Variable(initial)
 
def bias_variable(shape):
  initial = tf.constant(0.1, shape=shape)
  return tf.Variable(initial)

def conv2d(x, W):
  return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
   
def max_pool_2x2(x):
  return tf.nn.max_pool(x, ksize=[1, 4,4, 1],
                        strides=[1,4,4, 1], padding='SAME')

In [6]:
def f_val(y_step_2, t_step_2):
    length = len(y_step_2)
    f_array = []
    for index in range(length):
        TP = sum([1 if (a_step == 1 and b_step == 1) else 0 for a_step, b_step in zip(y_step_2[index], t_step_2[index])])
        FP = sum([1 if (a_step == 1 and b_step == 0) else 0 for a_step, b_step in zip(y_step_2[index], t_step_2[index])])
        FN = sum([1 if (a_step == 0 and b_step == 1) else 0 for a_step, b_step in zip(y_step_2[index], t_step_2[index])])
        f_array.append( TP / ( TP + ( FP + FN ) / 2 ))
    return f_array

## Code

### Data Setting

In [7]:
# yの読み出し（最初の9個）
train_df = pd.read_csv("./data/train.csv")

In [8]:
# image_id
image_name = np.array(train_df["Id"][:])

# target_vector
t_data = np.array(train_df.iloc[:,1:])

In [9]:
X_train_name_array, X_test_name_array, y_train_array, y_test_array = o_train_test_split(image_name, t_data)

In [10]:
batch_size       = 1000
x_train_name_len = len(X_train_name_array)
batch_count      = int(x_train_name_len / batch_size) # 21750 / 50 = 435
batch_count_mod  = x_train_name_len % batch_size      # 21750 % 50 = 72

# randomにindexの値がとられる。[29800, 23100, 26200, 4800, 28800, 13800, 27100, 11200,]
b_top_index_array = random.sample(range(0, x_train_name_len, batch_size), k = batch_count-1)

In [11]:
print(b_top_index_array)
print((batch_count) * batch_size)

[11000, 15000, 13000, 21000, 7000, 9000, 16000, 2000, 4000, 19000, 10000, 0, 20000, 14000, 17000, 18000, 1000, 5000, 12000, 6000]
21000


### Model

#### 他クラス分類問題

In [12]:
# sample_y = np.array([[0.2, 0.9, 0.2, 0.9, 0],[0.2, 0.9, 0.2, 0.9, 0],[0.2, 0.9, 0.2, 0.9, 0.8]])
# sample_t = np.array([[0.1, 0.2, 0.8, 0.9, 0.6],[0.1, 0.2, 0.8, 0.9, 0.6],[0.5, 0.2, 0.8, 0.9, 0.3]])


# 出てくる配列 y = 0 ~ -1 [[0,0,10.8,],[],[],[],[]]

# F値を求めて、tf.reduce_mean する。この段階で、a = [0~1]×入力値数（バッチ数）
#accuracy = tf.reduce_mean(tf.cast(a, "float"))


# まずは誤差関数がちゃんと動くか確認
# tensor →rリストとりだし
# リスト計算→tensor
# tensorflow でf値を求めるやつがあるか？

#### マルチクラス分類問題

In [13]:
# dropout
keep_prob = tf.placeholder("float")

# 入力層
x = tf.placeholder("float", [None, 65536])

# 形状変更
x_image = tf.reshape(x, [-1, 128, 128, 4])

# 第2層 (畳み込み層)
W_conv1 = weight_variable([5, 5, 4, 32])
b_conv1 = bias_variable([32])
y_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)

# 第3層 (プーリング層)
y_pool1      = max_pool_2x2(y_conv1) #32
y_pool1_drop = tf.nn.dropout(y_pool1, keep_prob)
 
# 第4層 (畳み込み層)
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
y_conv2 = tf.nn.relu(conv2d(y_pool1_drop, W_conv2) + b_conv2)
 
# 第5層 (プーリング層)
y_pool2      = max_pool_2x2(y_conv2) #8
y_pool2_drop = tf.nn.dropout(y_pool2, keep_prob)

# 形状変更
y_pool2_flat = tf.reshape(y_pool2_drop, [-1, 4096])
 
# 第6層 (全結合層)
W_fc1 = weight_variable([4096, 1024])
b_fc1 = bias_variable([1024])
y_fc1 = tf.nn.relu(tf.matmul(y_pool2_flat, W_fc1) + b_fc1)
 
# 第7層 (全結合層)
W_fc2 = weight_variable([1024, 28])
b_fc2 = bias_variable([28])
y = tf.sigmoid(tf.matmul(y_fc1, W_fc2) + b_fc2) #<= シグモイド関数
 
# 目標出力の次元
t = tf.placeholder("float", [None, 28])

# 損失関数を計算グラフを作成する
# cross_entropy = -tf.reduce_sum(t * tf.log(y))
# tf.nn.sigmoid_cross_entropy_with_logits(_sentinel=None, labels=None,logits=None,name=None)

cross_entropy = -tf.reduce_sum( t * tf.log(y + 1e-9)) + ((1-t) * tf.log(1 - y + 1e-9)  )

# 次の(1)、(2)を行うための計算グラフを作成する。
# (1) 損失関数に対するネットワークを構成するすべての変数の勾配を計算する。
# (2) 勾配方向に学習率分移動して、すべての変数を更新する。
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
 
# 初期化を行うための計算グラフを作成する。
init_g = tf.global_variables_initializer()
init_l = tf.local_variables_initializer()
 
# テストデータに対する正答率を計算するための計算グラフを作成する。

# correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(t, 1))
# f1_score, update_op = tf.contrib.metrics.f1_score(t, y)

TP = tf.math.count_nonzero(y * t)
TN = tf.math.count_nonzero((y - 1) * (t - 1))
FP = tf.math.count_nonzero(y * (t - 1))
FN = tf.math.count_nonzero((y - 1) * t)


precision = TP / (TP + FP)
recall = TP / (TP + FN)
f1 = 2 * precision * recall / (precision + recall)

accuracy = tf.reduce_mean(tf.cast(f1, "float"))

NameError: name 'precision' is not defined

In [None]:
# セッションを作成して、計算グラフを実行する。
with tf.Session() as sess:
   
    # 初期化を実行する。
    sess.run(init_g)
    sess.run(init_l)
    
    
   
    # 学習を実行する。（エポック）
    for i in tqdm(range(10)):
        train_acc = 0
        #test_acc  = 0
        # ミニパッチ
        for p_index in tqdm(b_top_index_array):
            
            # 最後の配列だった場合
            if p_index == batch_count * batch_size:
                index_array = list(range(p_index, p_index + batch_count_mod )) #　-１いらない？(とった)
            else:
                index_array = list(range(p_index, p_index + batch_size))
            
            # ここで読み出し。
            print(index_array)
            patch_x = [hpaic_image_loader(X_train_name_array[i]) for i in index_array]
            patch_t = [y_train_array[i] for i in index_array]
            
            sess.run(train_step, feed_dict={x: patch_x, t: patch_t, keep_prob: 0.5})
            
            # trainデータの訓練精度を追加、表示
            train_acc += sess.run(accuracy, feed_dict = {x: patch_x, t: patch_t, keep_prob: 1.0})
            
        print('epoch:{} \n \
        tr_acc:{} \n '.format(i, train_acc/len(b_top_index_array)))

            
    saver = tf.train.Saver()
    saver.save(sess, "C:/Users/talla/Dropbox/Computer/Repositry/Self/human-protein-atlas/model/hpaic_model")



  0%|                                                                                           | 0/10 [00:00<?, ?it/s]
  0%|                                                                                           | 0/20 [00:00<?, ?it/s]

[11000, 11001, 11002, 11003, 11004, 11005, 11006, 11007, 11008, 11009, 11010, 11011, 11012, 11013, 11014, 11015, 11016, 11017, 11018, 11019, 11020, 11021, 11022, 11023, 11024, 11025, 11026, 11027, 11028, 11029, 11030, 11031, 11032, 11033, 11034, 11035, 11036, 11037, 11038, 11039, 11040, 11041, 11042, 11043, 11044, 11045, 11046, 11047, 11048, 11049, 11050, 11051, 11052, 11053, 11054, 11055, 11056, 11057, 11058, 11059, 11060, 11061, 11062, 11063, 11064, 11065, 11066, 11067, 11068, 11069, 11070, 11071, 11072, 11073, 11074, 11075, 11076, 11077, 11078, 11079, 11080, 11081, 11082, 11083, 11084, 11085, 11086, 11087, 11088, 11089, 11090, 11091, 11092, 11093, 11094, 11095, 11096, 11097, 11098, 11099, 11100, 11101, 11102, 11103, 11104, 11105, 11106, 11107, 11108, 11109, 11110, 11111, 11112, 11113, 11114, 11115, 11116, 11117, 11118, 11119, 11120, 11121, 11122, 11123, 11124, 11125, 11126, 11127, 11128, 11129, 11130, 11131, 11132, 11133, 11134, 11135, 11136, 11137, 11138, 11139, 11140, 11141, 11142

## Model Restore and Prediction

In [41]:
sess = tf.Session()
sess.run(init)
saver = tf.train.import_meta_graph('./model/hpaic_model.meta')
saver.restore(sess,  tf.train.latest_checkpoint('./model/'))
print('Restored a model')

INFO:tensorflow:Restoring parameters from C:/Users/talla/Dropbox/Computer/Repositry/Self/human-protein-atlas/model/hpaic_model
Restored a model


In [42]:
# yの読み出し（最初の9個）
sub_df = pd.read_csv("./data/sample_submission.csv")

In [43]:
# 名前の読み出し。
sub_image_name  = np.array(sub_df["Id"][:10])
print(sub_image_name)

# 画像の一次元化
sub_image_array = np.array([hpaic_image_loader(i, "./data/test/" ) for i in sub_image_name])

['00008af0-bad0-11e8-b2b8-ac1f6b6435d0'
 '0000a892-bacf-11e8-b2b8-ac1f6b6435d0'
 '0006faa6-bac7-11e8-b2b7-ac1f6b6435d0'
 '0008baca-bad7-11e8-b2b9-ac1f6b6435d0'
 '000cce7e-bad4-11e8-b2b8-ac1f6b6435d0'
 '00109f6a-bac8-11e8-b2b7-ac1f6b6435d0'
 '001765de-bacd-11e8-b2b8-ac1f6b6435d0'
 '0018641a-bac9-11e8-b2b8-ac1f6b6435d0'
 '00200f22-bad7-11e8-b2b9-ac1f6b6435d0'
 '0026f154-bac6-11e8-b2b7-ac1f6b6435d0']


In [44]:
pred_test = sess.run(y, feed_dict={x: sub_image_array, keep_prob: 1.0})

In [45]:
print(pred_test)

[[0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 1.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 1.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00]
 [0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00 0.0000000e+00
  0.0000000e+00 0.0000000e+00 0.0000000e+00 0.

In [47]:
s = tf.argmax([1,2,3,4,5], 1)

In [48]:
print(s)

Tensor("ArgMax_20:0", shape=(), dtype=int64)


In [None]:
# tensorflowのgpu動作確認●
# テストデータの評価
# 答えの出力（方法、形式、確認）

In [6]:
# test_dataとtrain_dataで訓練
X_train, X_test,y_train, y_test = train_test_split(x_data, t_data, test_size=0.3, random_state=0)

In [5]:
# yをもとにxを読み出し。(_blue, _green, _red, _yellow)でテンソル３階層する。
cmyk = ["_blue","_green","_red","_yellow"]
x_list = []

for img in tqdm(image_name):
    img_blue   = cv2.imread( "./data/train/" + img + "_blue"   + ".png", 0)
    img_green  = cv2.imread( "./data/train/" + img + "_green"  + ".png", 0)
    img_red    = cv2.imread( "./data/train/" + img + "_red"    + ".png", 0)
    img_yellow = cv2.imread( "./data/train/" + img + "_yellow" + ".png", 0)
    
    cmyk       = cmyk_array_unify(img_blue, img_green, img_red, img_yellow)
    cmyk_plane = cmyk.reshape(-1,)
    x_list.append(cmyk_plane)
    
x_data = np.array(x_list)

100%|██████████████████████████████████████████████████████████████████████████| 31072/31072 [4:32:11<00:00,  1.98it/s]


In [46]:
image_r = tf.read_file("/data/train/00ab10d6-bba4-11e8-b2b9-ac1f6b6435d0_blue.png")
image = tf.image.decode_png(image_r, channels=3)
image_float = tf.to_float(image)
print(tf.shape(image))
print(image.get_shape)
print(image_float)

Tensor("Shape_17:0", shape=(3,), dtype=int32)
<bound method Tensor.get_shape of <tf.Tensor 'DecodePng_17:0' shape=(?, ?, 3) dtype=uint8>>
Tensor("ToFloat_4:0", shape=(?, ?, 3), dtype=float32)


In [140]:
"""
sessionの作成

"""
#　訓練データの読み込み
np.set_printoptions(threshold=10)
img = cv2.imread("./data/train/00ab10d6-bba4-11e8-b2b9-ac1f6b6435d0_blue.png",0)
print(img)
print(type(img))

[[41 45 34 ...  0  0  0]
 [43 50 44 ...  0  0  0]
 [42 46 51 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]
<class 'numpy.ndarray'>


In [132]:
z_c = np.array([["a","b","c"],["d","e","f"]])
z_m = np.array([["g","h","i"],["j","k","l"]])
z_y = np.array([["m","n","o"],["p","q","r"]])
z_k = np.array([["s","t","u"],["v","w","x"]])

In [133]:
s = cmyk_array_unify(z_c,z_m,z_y,z_k)
print(s)

[[['a' 'g' 'm' 's']
  ['b' 'h' 'n' 't']
  ['c' 'i' 'o' 'u']]

 [['d' 'j' 'p' 'v']
  ['e' 'k' 'q' 'w']
  ['f' 'l' 'r' 'x']]]


In [134]:
x_image = s.reshape(-1,)
print(x_image)

['a' 'g' 'm' ... 'l' 'r' 'x']


In [None]:
#コードの作成

In [20]:
# gpu動作確認コード
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 3786672106358165647, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 6674410373
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 2611446402891614919
 physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1"]

In [79]:
print(r)

Tensor("Reshape:0", shape=(262144,), dtype=uint8)


In [None]:
# target_dataの読み込み

train_labels = pd.read_csv("data/train_original.csv")

In [None]:
# w = tf.Variable(tf.zeros([2,1])) #入力は２次元
#     b = tf.Variable(tf.zeros([1]))   #入力は２次元

#     # def y(x):
#     #     return sigmoid(np.dot(w,x)+b)
#     # def sigmoid(x):
#     #     return 1 / (1 + np.exp(-x))

x = tf.placeholder(tf.float32, shape=[None, 512]) #入力
t = tf.placeholder(tf.float32, shape=[None, 1]) #正解出力
y = tf.nn.sigmoid(tf.matmul(x,w)+b)

In [10]:
from tensorflow.examples.tutorials.mnist import input_data

In [11]:
mnist = input_data.read_data_sets("data/", one_hot=True)
x_batch, t_batch = mnist.train.next_batch(100)

Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.
Instructions for updating:
Please write your own downloading logic.
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting data/train-images-idx3-ubyte.gz
Instructions for updating:
Please use tf.data to implement this functionality.
Extracting data/train-labels-idx1-ubyte.gz
Instructions for updating:
Please use tf.one_hot on tensors.
Extracting data/t10k-images-idx3-ubyte.gz
Extracting data/t10k-labels-idx1-ubyte.gz
Instructions for updating:
Please use alternatives such as official/mnist/dataset.py from tensorflow/models.


In [13]:
print(x_batch[0])
print(t_batch[0])
print(type(x_batch[0]))
print(type(t_batch[0]))

#print(t_batch)

[0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         

In [None]:
with tf.Session() as sess:
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    img = sess.run(image)
    Image.fromarray(np.uint8(img)).show()

Instructions for updating:
To construct input pipelines, use the `tf.data` module.


In [2]:
data = pd.read_csv("data/train.csv")

In [9]:
col = np.array(data.columns[1:])

In [14]:
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 2150555571929651621, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 6871947673
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 3101695577045397688
 physical_device_desc: "device: 0, name: GeForce GTX 1080, pci bus id: 0000:01:00.0, compute capability: 6.1"]