In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import skimage
import os
import seaborn as sns
import pandas as pd
import gc
import math
from skimage import transform, io
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Conv2D, MaxPool2D, BatchNormalization, Dropout,Activation,Flatten

In [2]:
# Dataset locations, relative to the notebook's working directory.
# Img_dir is the root folder that holds the per-image sub-folders.
Img_dir = '../input/dian-set/images/'
# Despite the `_dir` suffix the next three are files, not folders: the label
# file is opened with open() and the word list is read with pd.read_csv below.
train_label_dir = '../input/dian-set/labels/training_label'
# NOTE(review): test_label_dir is not used in the cells visible here.
test_label_dir ='../input/dian-set/labels/test_label'
words_dir = '../input/dian-set/labels/words'

# Generate data

In [5]:
def generatedata(label_dir, imgs_dir):
    """Load every image listed in a label file into one array.

    Each non-blank line of the label file starts with an image id; the
    remaining tokens on the line are ignored here. The image for an id is
    read from ``<imgs_dir>/<id without its last two characters>00/<id>.jpeg``,
    resized to 192x192 and cropped to the central 176x176 region (an 8-pixel
    border is removed on every side).

    Args:
        label_dir: path of the label file (one image per line).
        imgs_dir: root folder of the images; a trailing slash is optional.

    Returns:
        np.ndarray built with ``np.array`` from the cropped images, in the
        order the ids appear in the label file.
    """
    with open(label_dir, 'r') as f:
        contents = f.readlines()

    X = []
    for content in contents:
        vals = content.split()
        if not vals:
            # Blank line (for example a trailing newline at the end of the
            # file): there is no image id to load, so skip it instead of
            # failing on vals[0].
            continue
        img_id = vals[0]
        # os.path.join keeps the path valid whether or not imgs_dir ends
        # with a separator.
        img_path = os.path.join(imgs_dir, img_id[:-2] + '00', img_id + '.jpeg')
        img = transform.resize(io.imread(img_path), (192, 192))
        X.append(img[8:184, 8:184, :])
    return np.array(X)

In [None]:
# Decode every training image once and cache the resulting array on disk;
# the "LOAD DATA" cell near the end reads ./train_x.npy back.
X = generatedata(train_label_dir,Img_dir)

np.save('./train_x.npy',X)
print('training data generated')

In [None]:
# The word list has no header row; give its single column a name.
words = pd.read_csv(words_dir, header=None)
words.columns = ['features']
# Width of the label vectors; must match the width produced by parse_content.
label_num = 374
# Raw label lines, one per training image: "<image id> <label> <label> ...".
with open(train_label_dir, 'r') as f:
    contents = f.readlines()

In [None]:
def parse_content(contents, num_labels=374):
    """Turn label-file lines into a multi-hot label matrix.

    Every non-blank line has the form ``<image id> <label> <label> ...``
    where labels are 1-based integers; label ``k`` sets column ``k - 1``.
    A line holding only an image id yields an all-zero row. Blank lines are
    skipped so they do not create rows that belong to no image.

    Args:
        contents: iterable of label lines (strings).
        num_labels: width of the label vectors (defaults to 374, the size of
            the word list used by this notebook).

    Returns:
        float ndarray of shape ``(number of non-blank lines, num_labels)``
        containing 0/1 entries; an empty input gives shape ``(0, num_labels)``.

    Raises:
        IndexError: if a label is outside ``1..num_labels``. Label 0 is
            rejected explicitly because ``0 - 1`` would otherwise wrap around
            and silently mark the last column.
        ValueError: if a label token is not an integer.
    """
    token_rows = [line.split() for line in contents]
    token_rows = [tokens for tokens in token_rows if tokens]

    y_ = np.zeros((len(token_rows), num_labels))
    for row, tokens in enumerate(token_rows):
        # tokens[0] is the image id; everything after it is a label.
        for token in tokens[1:]:
            label = int(token)
            if not 1 <= label <= num_labels:
                raise IndexError(
                    'label {} of image {} is outside 1..{}'.format(label, tokens[0], num_labels)
                )
            y_[row, label - 1] = 1
    return y_

In [None]:
# Multi-hot label matrix for the training lines read above: one row per line,
# one column per label.
labels = parse_content(contents)
print(labels.shape)
print(words)

## Label frequencies in the training set

In [None]:
# Count how many training images carry each label. np.nonzero(labels)[1]
# lists the column index of every set entry and bincount tallies them.
# minlength pins the result to label_num entries so labels that never occur
# still get a 0 count; without it the array stops at the highest label seen
# and no longer lines up with `v` or with the label columns used later.
freq = np.nonzero(labels)[1]
freq = np.bincount(freq, minlength=label_num)
print(freq)
v = np.arange(0, label_num)
sns.barplot(x = v, y = freq)

## Labels ranked from most to least frequent

In [None]:
# Sorting the negated counts in ascending order ranks the label columns from
# most frequent to least frequent; the values are 0-based column indices.
most_probable_index = np.argsort(-freq)
print(most_probable_index)

## Cumulative proportion covered by the most frequent labels

In [None]:
# Walk the labels from most to least frequent and report, after each one,
# which share of all label occurrences the labels seen so far account for.
total_labels = np.sum(freq)
report = 'Counting the first {} labels, occupying {}, appear for {} times'
count_label = 0
for i in range(label_num):
    times = freq[most_probable_index[i]]  # occurrences of the i-th ranked label
    count_label += times
    print(report.format(i + 1, count_label / total_labels, times))

## The whole dataset

In [None]:
# One row per training image, one column per label (1 = the image carries
# that label). The row count is taken from `labels` itself rather than a
# fixed 4500, so the table always has exactly one row per parsed label line:
# no phantom all-zero rows for smaller sets, no IndexError for larger ones.
data = (labels != 0).astype(int)
Data_collection = pd.DataFrame(data = data, dtype = int, columns=range(label_num))
print(Data_collection)
print(np.sum(Data_collection[4])) # check if it is correct

## Drop feature and get new_words

**new_words** structure : 

**cur_index**|pre_index|features
-|-|-
0|0|city
...|...|...
343|373|hawaii

In [None]:
# Number of rarest labels to discard.
rid_num = 30
# most_probable_index already holds 0-based column indices (it is an argsort
# over the per-column counts), so its tail names the rarest columns directly.
# Subtracting 1 here would drop each rare column's left-hand neighbour instead
# and raise a KeyError (-1) whenever column 0 is among the rarest.
least_appear = most_probable_index[-rid_num:]
new_Data = Data_collection.drop(least_appear, axis=1).columns # labels that remain
# Map every kept label to its word: 'index' is the original column index,
# 'features' the word at that row of the word list.
new_words = pd.DataFrame({'index':new_Data, 'features':words['features'][new_Data]})
# Renumber the rows 0..label_num-rid_num-1 so row k describes new column k.
new_words.index = range(label_num - rid_num)
new_words.to_csv('./new_words.csv')
print(new_words)

## get new labels

In [None]:
# Drop the same rare-label columns from the label table and renumber the
# remaining columns 0..label_num-rid_num-1, matching the row numbering given
# to new_words in the previous cell.
new_train_data = Data_collection.drop(least_appear, axis=1)
new_train_data.columns = range(label_num - rid_num)
new_train_data.to_csv('./new_labels.csv')
print(new_train_data)

In [None]:
# Cache the reduced label matrix; np.save appends ".npy", which is why the
# "LOAD DATA" cell reads ./train_y.npy.
np.save('./train_y',new_train_data.values)

In [None]:
"""
    input : train_dir : dir of Images
    labels : a list of labels with the following format:
        [
            [Image_index1 label1 label2....]
            [Image_index2 label1 label2....]
            ...
        ]
    return a batch of data
"""
def getdata(train_dir, labels):
    """Load the images named by a batch of label lines.

    Args:
        train_dir: root folder of the images; a trailing slash is optional.
        labels: iterable of label lines of the form
            ``<image id> <label> <label> ...``; only the image id is used.

    Returns:
        np.ndarray built with ``np.array`` from the images, in the order of
        ``labels``. Each image is read from
        ``<train_dir>/<id without its last two characters>00/<id>.jpeg``,
        resized to 192x192 and cropped to the central 176x176 region.
    """
    X = []
    for label in labels:
        args = label.split()
        img_id = args[0]
        # Build the path from the train_dir argument (not from the global
        # Img_dir) so the function honours what the caller passes in.
        img_path = os.path.join(train_dir, img_id[:-2] + '00', img_id + '.jpeg')
        img = transform.resize(io.imread(img_path), (192, 192))
        X.append(img[8:184, 8:184, :])
    return np.array(X)

## Model

In [None]:
class myModel(Model):
    """Small CNN for multi-label image tagging: conv -> flatten -> dense.

    The output layer uses a sigmoid so every label gets its own independent
    probability. The targets are multi-hot vectors (an image can carry
    several labels), which a softmax cannot represent because it forces the
    outputs to sum to 1.

    Args:
        num_labels: number of output units, one per label. The default of
            344 is the number of labels kept after dropping the rare ones
            (374 - 30) in this notebook.
    """

    def __init__(self, num_labels=344):
        super().__init__()
        self.c1 = Conv2D(filters=64, kernel_size=5, padding='valid')  # convolution layer 1
        self.flatten = Flatten()
        self.f3 = Dense(num_labels, activation='sigmoid')

    @tf.function
    def call(self, x):
        """Return per-label probabilities for a batch of images ``x``."""
        x = self.c1(x)
        x = self.flatten(x)
        y = self.f3(x)
        return y

## Loss function

In [None]:
@tf.function
def macro_soft_f1(y_true, y_pred):
    """Macro soft-F1 cost: the mean over labels of ``1 - soft F1``.

    "Soft" means the predicted probabilities are used directly in place of
    thresholded 0/1 predictions.

    Args:
        y_true: targets of shape (BATCH_SIZE, N_LABELS); cast to float32.
        y_pred: predicted probabilities of shape (BATCH_SIZE, N_LABELS);
            cast to float32.

    Returns:
        Scalar tensor with the cost for the batch.
    """
    targets = tf.cast(y_true, tf.float32)
    probs = tf.cast(y_pred, tf.float32)

    # Soft confusion counts, one value per label (summed over the batch axis).
    tp = tf.reduce_sum(probs * targets, axis=0)
    fp = tf.reduce_sum(probs * (1 - targets), axis=0)
    fn = tf.reduce_sum((1 - probs) * targets, axis=0)

    # The small constant keeps the division defined when a label has no
    # positives and no predicted mass.
    denominator = 2*tp + fn + fp + 1e-16
    per_label_cost = 1 - 2*tp / denominator  # minimising this maximises soft-F1
    return tf.reduce_mean(per_label_cost)

In [None]:
@tf.function
def myLoss(y_true, y_pred):
    """Element-wise binary cross-entropy for multi-label targets.

    ``y_pred`` is treated as probabilities, not logits: the model's output
    layer already applies an activation and the metric helpers threshold
    ``y_pred`` at 0.5. Feeding those probabilities to
    ``tf.nn.sigmoid_cross_entropy_with_logits`` would squash them through a
    second sigmoid and distort both the loss and its gradients.

    Args:
        y_true: multi-hot targets of shape (BATCH_SIZE, N_LABELS).
        y_pred: predicted probabilities with the same shape.

    Returns:
        float32 tensor with the same shape as the inputs holding the
        per-element loss; the caller reduces it (e.g. ``tf.reduce_mean``).
    """
    y_true = tf.cast(y_true, dtype = tf.float32)
    y_pred = tf.cast(y_pred, tf.float32)
    # Keep probabilities away from exactly 0 and 1 so the logs stay finite.
    eps = 1e-7
    y_pred = tf.clip_by_value(y_pred, eps, 1.0 - eps)
    return -(y_true * tf.math.log(y_pred) + (1.0 - y_true) * tf.math.log(1.0 - y_pred))

## Metrics (F1, precision, recall)

In [None]:
@tf.function
def get_f1(y_true, y_pred, thresh=0.5):
    """F1-score of thresholded predictions on a batch.

    True positives, false positives and false negatives are counted over
    every (sample, label) entry of the batch at once, so this is a single
    pooled (micro-averaged) F1 rather than an average of per-label scores.

    Args:
        y_true: labels array of shape (BATCH_SIZE, N_LABELS).
        y_pred: probability matrix of shape (BATCH_SIZE, N_LABELS).
        thresh: probability value above which we predict positive.

    Returns:
        Scalar float32 tensor with the F1 value for the batch.
    """
    def _count(mask):
        # Number of non-zero entries, as float32 so it can be divided.
        return tf.cast(tf.math.count_nonzero(mask), tf.float32)

    truth = tf.cast(y_true, dtype=tf.float32)
    predicted = tf.cast(tf.greater(y_pred, thresh), tf.float32)

    tp = _count(predicted * truth)
    fp = _count(predicted * (1 - truth))
    fn = _count((1 - predicted) * truth)

    # The small constant avoids 0/0 when there are no positives at all.
    return 2*tp / (2*tp + fn + fp + 1e-16)

In [None]:
@tf.function
def get_precision(y_true, y_pred, thresh=0.5):
    """Precision of thresholded predictions, pooled over the whole batch.

    Args:
        y_true: labels array of shape (BATCH_SIZE, N_LABELS).
        y_pred: probability matrix of shape (BATCH_SIZE, N_LABELS).
        thresh: probability value above which we predict positive.

    Returns:
        Scalar float32 tensor ``tp / (tp + fp)``; 0 when nothing is
        predicted positive.
    """
    y_true = tf.cast(y_true, dtype=tf.float32)
    y_pred = tf.cast(tf.greater(y_pred, thresh), tf.float32)
    tp = tf.cast(tf.math.count_nonzero(y_pred * y_true), tf.float32)
    fp = tf.cast(tf.math.count_nonzero(y_pred * (1 - y_true)), tf.float32)
    # Same guard as get_f1: without it a batch with no predicted positives
    # gives 0/0 = NaN, which then poisons the logged history.
    precision = tp / (tp + fp + 1e-16)
    return precision

In [None]:
@tf.function
def get_recall(y_true, y_pred, thresh=0.5):
    """Recall of thresholded predictions, pooled over the whole batch.

    Args:
        y_true: labels array of shape (BATCH_SIZE, N_LABELS).
        y_pred: probability matrix of shape (BATCH_SIZE, N_LABELS).
        thresh: probability value above which we predict positive.

    Returns:
        Scalar float32 tensor ``tp / (tp + fn)``; 0 when the batch contains
        no positive labels.
    """
    y_true = tf.cast(y_true, dtype=tf.float32)
    y_pred = tf.cast(tf.greater(y_pred, thresh), tf.float32)
    tp = tf.cast(tf.math.count_nonzero(y_pred * y_true), tf.float32)
    fn = tf.cast(tf.math.count_nonzero((1 - y_pred) * y_true), tf.float32)
    # Same guard as get_f1: avoids 0/0 = NaN when there are no positives.
    recall = tp / (tp + fn + 1e-16)
    return recall

In [None]:
"""
    data_batch: 
        img <- getdata(Img_dir, content)
        onehot label <- new_train_data.iloc[from:to].values
"""

In [None]:
def train(train_x, train_y, epoch = 1):
    """Train a fresh ``myModel`` with a manual loop and log batch metrics.

    The data is shuffled, batched and repeated ``epoch`` times. For every
    batch the loss from ``myLoss`` is averaged and minimised with Adam, and
    F1 / precision / recall of the batch predictions are recorded. A
    checkpoint is written every 100 batches (including batch 0). The metrics
    of this run are appended to ``./history.csv``.

    Args:
        train_x: array of input images, first axis = samples.
        train_y: array of multi-hot labels aligned with ``train_x``.
        epoch: number of passes over the data.

    Returns:
        pandas.DataFrame with columns ``f1``, ``precision`` and ``recall``,
        one row per processed batch of this run.
    """
    batch_size = 64  # batch size the input pipeline actually uses
    # Same buffer size the pipeline used before; the seed is now passed
    # separately so the shuffle order is reproducible.
    shuffle_buffer = 1234
    seed = 1234
    log_dir = 'checkpoint'
    history_path = './history.csv'
    metric_names = ['f1', 'precision', 'recall']

    optimizer = tf.keras.optimizers.Adam()
    model = myModel()
    # Kept for its side effect of creating the log directory / event file;
    # nothing is written to it yet.
    summary_writer = tf.summary.create_file_writer(log_dir)
    checkpoint = tf.train.Checkpoint(myAwesome = model)
    train_db = (
        tf.data.Dataset.from_tensor_slices((train_x, train_y))
        .shuffle(buffer_size=shuffle_buffer, seed=seed)
        .batch(batch_size)
        .repeat(epoch)
    )

    records = []
    for i, (x_batch, y_batch) in enumerate(train_db):
        # Only the forward pass and the loss need to be recorded on the tape.
        with tf.GradientTape() as tape:
            y_pred = model(x_batch)
            loss = tf.reduce_mean(myLoss(y_true = y_batch, y_pred = y_pred))
        # Differentiate with respect to trainable weights only; non-trainable
        # variables would yield None gradients.
        grads = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(grads_and_vars = zip(grads, model.trainable_variables))

        # Store plain floats (not tensors) so the log serialises cleanly.
        records.append({
            'f1': float(get_f1(y_true = y_batch, y_pred = y_pred)),
            'precision': float(get_precision(y_true = y_batch, y_pred = y_pred)),
            'recall': float(get_recall(y_true = y_batch, y_pred = y_pred)),
        })
        print('batch finished')

        del grads
        gc.collect()
        if 0 == i % 100:
            path = checkpoint.save('./checkpoint/12.ckpt')
            print('model saved to %s' % path)

        # TODO: validation set (holding out 900 samples was planned)

    # Build the frame once from the collected rows instead of appending
    # row by row.
    cur_training_log = pd.DataFrame(records, columns = metric_names)

    if os.path.exists(history_path):
        # Read only the metric columns so index columns written by earlier
        # runs do not pile up in the file.
        previous = pd.read_csv(history_path, usecols = lambda name: name in metric_names)
        history = pd.concat([previous, cur_training_log], ignore_index = True)
    else:
        history = cur_training_log
    history.to_csv(history_path, index = False)

    return cur_training_log
            

# LOAD DATA

In [None]:
# Reload the arrays cached by the preprocessing cells above.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects from
# the file. Both files are written by this notebook with np.save, so this is
# presumably unnecessary; consider allow_pickle=False after confirming the
# arrays are plain numeric.
y_train = np.load('./train_y.npy', allow_pickle = True)
x_train = np.load('./train_x.npy', allow_pickle = True)

# np.random.seed(1234)
# np.random.shuffle(y_train)
# print(y_train)

# RUN

In [None]:
# Train with the default single epoch; log_data holds the per-batch metrics
# returned by train().
log_data = train(x_train, y_train)