## 模型验证

In [1]:
import pandas as pd
df = pd.read_table('../VOCdevkit/VOC2012/ImageSets/Main/car_train.txt', delim_whitespace=True, names=('file_name', 'label'))

In [2]:
#解析pascal voc数据
import glob
import pandas as pd


label_files_path = '../VOCdevkit/VOC2012/ImageSets/Main/*.txt' 
files = glob.glob(label_files_path)
image_files_path = '../VOCdevkit/VOC2012/JPEGImages/' 

#使用字典存储每个类的文件路径

#data_train key为label，value为文件名称，考虑到类别平衡问题
data_train = {}
#data_val key为文件名称，value为label，方便处理
data_val = {}
val_files = []#用于存储所有的文件

#获取label，并初始化字典
for file in files:
    if '_train.txt' in file:#
        label = file.split('/')[-1].split('.txt')[0].split('_')[0]
        data_train[label] = []

keys = data_train.keys()
data_label = {}
i = 0
for key in keys:
    data_label[key] = i
    i+=1

#获取每个类对应的数据文件
for file in files:
    if '_train.txt' in file:
        label = file.split('/')[-1].split('.txt')[0].split('_')[0]
        df = pd.read_table(file, delim_whitespace=True, names=('file_name', 'label'))
        #获取df中label==1的file_name
        df_positive = df[df['label']==1]#获取对应的正类文件
        for index, row in df_positive.iterrows():
            data_train[label].append(image_files_path + row['file_name'] + '.jpg')
    if '_val.txt' in file:
        label = file.split('/')[-1].split('.txt')[0].split('_')[0]#value
        df = pd.read_table(file, delim_whitespace=True, names=('file_name', 'label'))
        #获取df中label==1的file_name
        df_positive = df[df['label']==1]#获取对应的正类文件, 对应的文件为key
        for index, row in df_positive.iterrows():
            val_file_path = image_files_path + row['file_name'] + '.jpg'
            val_files.append(val_file_path)
            #直接存储label值
            data_val[val_file_path] = data_label[label]

In [3]:
#构建迭代器，并考虑每个batch每类数据的平衡性
import random
import cv2
import numpy as np

def preprocess(img, target_size = 224):
    '''
    数据处理，包含两部分：
    S1：resize操作
    S2: normal操作
    '''
    img = cv2.resize(img, (target_size, target_size))
    
    #normal
    img = img / 127.5
    img = img - 1.
    
    return img

def load_image(batch_files):
    '''
    图像数据加载
    batch_files: 待加载的数据列表
    '''
    batch_data = []
    
    for file in batch_files:
        img = cv2.imread(file)
        img = preprocess(img, target_size=224)
        batch_data.append(img)
        
    batch_data = np.array(batch_data)
    return batch_data

def train_generator(data, data_label, batch_size=32, steps=20):
    '''
    每次从数据样本中抽取4类数据进行训练
    :param data: dict, 训练数据
    :param batch_size: integer， batch大小，最小为4
    :param steps: integer， 单次epoch运行次数
    '''
    while True:
        keys = data.keys()
        while True:
            classes = random.sample(keys, 4)#随机选取其中的四类
            
            #求取每类样本的数目
            batch_num = int(batch_size / 4)
            batch_files = []
            batch_label = []#保留label值
            for cls in classes:
                files = data[cls]#获取当前类别的所有文件
                random.shuffle(files)#随机打乱
                batch_files = batch_files + files[0:batch_num]#获取每一类数据的batch，组成batch_files
                batch_label.append(data_label[cls])#保存所取数据的类别，最终生成的类别list需要再次进行repeat 4操作          
            
            batch_data = load_image(batch_files)
            batch_label = np.array(batch_label)
            batch_label = batch_label.repeat(batch_num)#标签重复四次，获取真实label
            
            #shuffle，打乱类别顺序
            index = list(range(batch_size))
            batch_data = batch_data[index]
            batch_label = batch_label[index]
            batch_label = np.expand_dims(batch_label, -1)
            
            yield batch_data, batch_label
            
            
def train_generator2(data, data_label, batch_size=32):
    '''
    每次从数据样本中抽取4类数据进行训练
    :param data: dict, 训练数据
    :param batch_size: integer， batch大小，最小为4
    :param steps: integer， 单次epoch运行次数
    '''
    i = 0
    while True:        
        keys = data.keys()
        #while True:
        classes = random.sample(keys, 4)#随机选取其中的四类

        #求取每类样本的数目
        batch_num = int(batch_size / 4)
        batch_files = []
        batch_label = []#保留label值
        for cls in classes:
            files = data[cls]#获取当前类别的所有文件
            random.shuffle(files)#随机打乱
            batch_files = batch_files + files[0:batch_num]#获取每一类数据的batch，组成batch_files
            batch_label.append(data_label[cls])#保存所取数据的类别，最终生成的类别list需要再次进行repeat 4操作          

        batch_data = load_image(batch_files)
        batch_label = np.array(batch_label)
        batch_label = batch_label.repeat(batch_num)#标签重复四次，获取真实label

        #shuffle，打乱类别顺序
        index = list(range(batch_size))
        batch_data = batch_data[index]
        batch_label = batch_label[index]
        batch_label = np.expand_dims(batch_label, -1)

        yield batch_data, batch_label

In [4]:
#check iterator
gen = train_generator2(data_train, data_label, batch_size=4)

for i in range(1):
    batch_data, batch_label = next(gen)
    print(batch_data.shape, batch_label)

(4, 224, 224, 3) [[ 4]
 [11]
 [ 0]
 [18]]


In [5]:
#val_gen每次全部进行，不需要进行shuffle
#虽然可以直接传入字典，但是从字典中获取片段相比list有些耗时
def val_generator(val_files, val_label, batch_size=32):
    '''
    每次从数据样本中抽取4类数据进行训练
    :param data: dict, 训练数据
    :param data_label: dict，用于保存数据标签
    :param batch_size: integer， batch大小
    :param steps: integer， 单次epoch运行次数
    '''
    while True:
        steps = int(len(val_files)//batch_size)
        for i in range(steps):               
            #求取每类样本的数目
            batch_files = val_files[i*batch_size:(i+1)*batch_size]#获取文件切片
            #数据加载，并load标签
            batch_data = []
            batch_label = []
            
            for file in batch_files:
                img = cv2.imread(file)
                img =preprocess(img, target_size=224)
                batch_data.append(img)
                batch_label.append(val_label[file])
            
            batch_data = np.array(batch_data)
            batch_label = np.array(batch_label)
            batch_label = np.expand_dims(batch_label, -1)
            
            yield batch_data, batch_label

In [6]:
#check val iterator
val_gen = val_generator(val_files, data_val, 8)
for i in range(1):
    batch_data, batch_label = next(gen)
    print(batch_data.shape, batch_label)

(4, 224, 224, 3) [[19]
 [ 4]
 [16]
 [ 2]]


In [7]:
print(data_label)

{'diningtable': 0, 'pottedplant': 1, 'horse': 2, 'sheep': 3, 'chair': 4, 'cat': 5, 'dog': 6, 'car': 7, 'motorbike': 8, 'bird': 9, 'bicycle': 10, 'boat': 11, 'bus': 12, 'train': 13, 'cow': 14, 'person': 15, 'aeroplane': 16, 'bottle': 17, 'sofa': 18, 'tvmonitor': 19}


In [8]:
import sys
sys.path.append('./models/resnext50.py')
from models.resnext50 import ResNext50
import keras

Using TensorFlow backend.


In [9]:
def initial_weighted(labels):
    '''
    根据label进行权重初始化
    '''
    weighted = {}#用list好处理
    for label in labels:
        weighted[label]=1.0#默认权重都是1
    return weighted

def weighted_categoritical_loss(weighted):
    print('check33333333333')
    def weighted_categoritical_loss_inline(y_true, y_pred):
        print("check111111111111", y_true.shape)
        print('check22222222222222', y_pred.shape)
        result2 = -tf.reduce_sum(y_true*tf.log(y_pred),1)
        return result2
    return weighted_categoritical_loss_inline

In [10]:
#设置weigithed
labels = list(range(20))
weighted = initial_weighted(labels)
weighted[2]=3

In [11]:
#开始check模型
import tensorflow as tf

train_gen = train_generator2(data_train, data_label, batch_size=4)#当iterator设置100时，会报错，为什么？难道是我之前理解有误？
val_gen = val_generator(val_files, data_val, 4)

print(len(data_label))
#model设置
#model = Inception_Resnet_v2(input_shape=(299,299,3), classes = len(data_label))
#model = ResNet50(input_shape=(224,224,3), classes = len(data_label))
model = ResNext50(input_shape=(224,224,3), classes = len(data_label))
model.compile(optimizer=keras.optimizers.SGD(), loss=keras.losses.sparse_categorical_crossentropy, metrics=['acc'])
#model.compile(optimizer=keras.optimizers.SGD(), loss=weighted_categoritical_loss(weighted), metrics=['acc'])

20


In [12]:
model.fit_generator(generator=train_gen,
                      steps_per_epoch=1000,
                      epochs=10,
                      verbose = 1,
                      validation_data=val_gen,
                      validation_steps = len(val_files)/4)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ffa9a6fedd8>