# 训练SVM

In [1]:
import os
import cv2
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import selectivesearch
import matplotlib.patches as mpatches
import itertools
from tensorflow.keras import Sequential, layers, Model, models
from tensorflow.keras import regularizers
import sklearn.svm as svm

## 1. 把fine-tuning好的模型加载进来

In [2]:
# Directory that holds the fine-tuned AlexNet model.
model_dir = "./AlexNet_fine_tuning_pascal"
# Restore the model through the `models` namespace imported from tensorflow.keras.
alex_net = models.load_model(model_dir)

In [3]:
alex_net.summary()  # author's note: this saved model does not keep the `trainable` information

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_1 (Conv2D)              multiple                  34944     
_________________________________________________________________
pool_1 (MaxPooling2D)        multiple                  0         
_________________________________________________________________
bn_1 (BatchNormalization)    multiple                  384       
_________________________________________________________________
conv_2 (Conv2D)              multiple                  614656    
_________________________________________________________________
pool_2 (MaxPooling2D)        multiple                  0         
_________________________________________________________________
bn_2 (BatchNormalization)    multiple                  1024      
_________________________________________________________________
conv_3 (Conv2D)              multiple                  8

In [4]:
# Number of trailing layers to strip from the loaded model (only the last one).
NUM_TRAILING_LAYERS_TO_DROP = 1
# NOTE: this mutates `alex_net` in place, so re-running the cell pops yet another layer.
for layer_idx in range(NUM_TRAILING_LAYERS_TO_DROP):
    alex_net.pop()

In [5]:
"""
原论文中最后一层flatten之后输出的feature特征应该是6 × 6 × 256 = 9216维，而我这里为了加快训练速度使用的图片输入是32*32,
所以最后的特征是256维度
"""
alex_net.summary()  

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_1 (Conv2D)              multiple                  34944     
_________________________________________________________________
pool_1 (MaxPooling2D)        multiple                  0         
_________________________________________________________________
bn_1 (BatchNormalization)    multiple                  384       
_________________________________________________________________
conv_2 (Conv2D)              multiple                  614656    
_________________________________________________________________
pool_2 (MaxPooling2D)        multiple                  0         
_________________________________________________________________
bn_2 (BatchNormalization)    multiple                  1024      
_________________________________________________________________
conv_3 (Conv2D)              multiple                  8

## 2. 加载训练集

In [6]:
# Feature spec describing how each serialized Example is parsed.
image_feature_description = dict(
    image_raw=tf.io.FixedLenFeature([], tf.string),
    # Boxes are stored as float: per the author this makes later normalization
    # easier (normalizing before writing the records would also work).
    bboxes=tf.io.FixedLenSequenceFeature([4], tf.float32, allow_missing=True),
    labels=tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
    labels_text=tf.io.FixedLenSequenceFeature([], tf.string, allow_missing=True),
    image_name=tf.io.FixedLenFeature([], tf.string),
    regions=tf.io.FixedLenSequenceFeature([4], tf.float32, allow_missing=True),
    regions_label=tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
)
def parse_image_function(example_proto):
    """Parse one serialized Example into a dict, using `image_feature_description`."""
    parsed_example = tf.io.parse_single_example(
        serialized=example_proto,
        features=image_feature_description,
    )
    return parsed_example

In [7]:
# Path of the TFRecord file with the training regions. Built with os.path.join so
# it resolves on POSIX as well as on Windows (where it is still ".\data\...").
out_dir = os.path.join(".", "data", "PASCAL_VOC_2007_OBJ_car_train_regions.tfrecords")
# Read the raw records, then parse each one into a dict of tensors.
raw_records = tf.data.TFRecordDataset(out_dir)
ds = raw_records.map(parse_image_function)

In [8]:
def preprocess(image_raw, regions, regions_label):
    """
    Crop every region out of one image and warp each crop to 32x32.

    Args:
        image_raw: JPEG-encoded bytes of the full image.
        regions: iterable of 4-element boxes; each is cast to int64 and unpacked
            as (xmin, ymin, xmax, ymax) pixel coordinates.
        regions_label: one label per region (0 or 1 according to the original
            author's note); returned unchanged as a tensor.

    Returns:
        A pair (crops, labels): the float32 crops, each resized to [32, 32],
        converted to one tensor, and the labels converted to a tensor.
    """
    # Force three channels: without `channels=3` a grayscale JPEG decodes to a
    # single channel and the crops would fail the later ensure_shape([32, 32, 3]).
    img = tf.image.decode_jpeg(image_raw, channels=3)

    region_datas = []
    for region in regions:
        # Slice indices must be integers, so cast the stored float box first.
        xmin, ymin, xmax, ymax = tf.cast(region, tf.int64)
        region_data = tf.cast(img[ymin: ymax, xmin: xmax, :], tf.float32)
        # Warp the crop to the fixed network input size.
        region_datas.append(tf.image.resize(region_data, [32, 32]))
    return tf.convert_to_tensor(region_datas), tf.convert_to_tensor(regions_label)

In [9]:
def _ensure_shape(region_datas, regions_label):
    """Pin the static shapes: [32, 32, 3] for the crop and scalar for the label."""
    shaped_region = tf.ensure_shape(region_datas, [32, 32, 3])
    shaped_label = tf.ensure_shape(regions_label, [])
    return shaped_region, shaped_label

In [10]:
def _select_training_fields(record):
    """Keep only the fields needed to build region crops and their labels."""
    return (record["image_raw"], record["regions"], record["regions_label"])


def _crop_regions(image_raw, regions, regions_label):
    """Run the eager `preprocess` helper from inside the tf.data pipeline."""
    return tf.py_function(func=preprocess,
                          inp=[image_raw, regions, regions_label],
                          Tout=[tf.float32, tf.int64])


# One element per image: (all region crops of the image, all region labels).
regions_per_image = ds.map(_select_training_fields).map(_crop_regions)
# Flatten to one element per region, restore the static shapes, then batch by 10.
ds_train = regions_per_image.unbatch().map(_ensure_shape).batch(10)

In [11]:
def _etract_features(regions_raw, regions_label):
    """
    Turn a batch of region crops into feature vectors with the truncated AlexNet.

    Args:
        regions_raw: batch of region images fed to `alex_net`.
        regions_label: labels of the batch; passed through untouched.

    Returns:
        Tuple (features, regions_label), where features is the output of `alex_net`.
    """
    # The network contains BatchNormalization layers; training=False makes it
    # explicit that they use their moving statistics (inference mode) while
    # extracting features, rather than relying on the implicit default.
    features = alex_net(regions_raw, training=False)
    return (features, regions_label)

In [12]:
def _featurize_batch(regions_raw, regions_label):
    """Call the eager feature extractor from inside the tf.data pipeline."""
    return tf.py_function(func=_etract_features,
                          inp=[regions_raw, regions_label],
                          Tout=[tf.float32, tf.int64])


# Dataset of (feature batch, label batch) pairs used to train the SVM.
ds_train_svm = ds_train.map(_featurize_batch)

In [13]:
# The SVM needs the whole training set in memory, so materialize the dataset
# into NumPy arrays, one row per region.
feature_rows, label_rows = [], []
for feature_vec, label in ds_train_svm.unbatch():
    feature_rows.append(feature_vec.numpy())
    label_rows.append(label.numpy())
X = np.array(feature_rows)
y = np.array(label_rows)

In [14]:
# Show the shape of the feature matrix and of the label vector.
for arr in (X, y):
    print(arr.shape)

(17822, 256)
(17822,)


In [15]:
# 由于负样本过多, 这里采用正:负 = 1:3的方式进行降采样, Hints: 如果采用SVC的平衡模式则可以不用这样
# X_positive = X[np.where(y == 1)]
# y_positive = y[np.where(y == 1)]
# num_positive = len(y_positive)
# sample_idx = np.random.choice(np.where(y != 1)[0], 3*num_positive, replace=False)
# X_negative = X[sample_idx]
# y_negative = y[sample_idx]

# X_train = np.concatenate([X_positive, X_negative], axis = 0)
# y_train = np.concatenate([y_positive, y_negative])
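# 注意: 不能对X_train和y_train分别调用np.random.shuffle, 否则样本与标签的对应关系会被打乱, 应使用同一个随机排列
# perm = np.random.permutation(len(y_train))
# X_train = X_train[perm]
# y_train = y_train[perm]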

In [16]:
# "balanced" class weights are used instead of down-sampling the negatives.
svc_params = {"class_weight": "balanced"}
clf = svm.SVC(**svc_params)
clf.fit(X, y)

SVC(class_weight='balanced')

In [18]:
# Scored on the same (X, y) the classifier was fitted on, so this is a
# training-set score rather than a held-out estimate.
clf.score(X, y)

0.7810571204129727

In [19]:
# Compare labels and SVM predictions for the first 10 batches.
for batch_features, batch_labels in ds_train_svm.take(10):
    print("===========================")
    truth = batch_labels.numpy()
    print("ground truth:", truth)
    predicted = clf.predict(batch_features.numpy())
    print("pred: ", predicted)

ground truth: [0 0 0 0 0 0 0 0 1 1]
pred:  [0 1 0 0 0 0 0 0 1 1]
ground truth: [0 0 0 1 1 0 0 0 0 0]
pred:  [0 0 0 1 0 0 0 0 1 0]
ground truth: [1 0 0 0 0 1 1 1 0 0]
pred:  [1 0 1 0 0 1 1 1 0 0]
ground truth: [1 0 0 0 0 0 1 1 0 0]
pred:  [1 0 1 0 0 0 1 1 0 0]
ground truth: [0 0 0 0 0 0 0 0 0 0]
pred:  [0 0 1 0 0 0 0 0 0 0]
ground truth: [1 0 0 0 0 0 0 0 0 0]
pred:  [1 0 0 0 0 1 1 0 1 0]
ground truth: [0 0 1 0 0 0 1 0 0 0]
pred:  [0 0 1 0 0 0 1 0 0 0]
ground truth: [0 0 0 0 0 1 0 0 0 0]
pred:  [1 0 1 0 0 1 0 0 0 0]
ground truth: [0 0 0 0 0 0 0 0 0 0]
pred:  [0 0 0 0 0 0 1 0 1 0]
ground truth: [0 0 0 0 0 0 0 0 0 0]
pred:  [1 0 1 0 0 0 0 0 0 0]


**说明：** R-CNN在这里训练SVM还用到了Hard-negative mining，我这里并没有用这个

In [16]:
# Save the model
import joblib
# NOTE(review): this rebinds `model_dir`, which earlier held the AlexNet model directory.
model_dir = "./svm_pascal.pkl"

In [23]:
# Write the fitted classifier `clf` to the path stored in `model_dir`.
joblib.dump(clf, model_dir) 

['./svm_pascal.pkl']

In [17]:
# Reload the persisted classifier. joblib files are pickle-based, so only load
# files that come from a trusted source.
s = joblib.load(model_dir)

In [29]:
# Raw SVM decision values for the first 10 feature rows of X.
s.decision_function(X[0:10, :])

array([-4.20508054,  1.33794755, -2.16558059, -2.48448922, -1.66504911,
       -0.96871932, -2.22794364, -1.56203662,  1.27573284,  1.0018129 ])

In [30]:
from scipy.special import expit
# Squash the decision values of the first 10 rows into (0, 1) with the logistic
# sigmoid. NOTE: this is only a monotone rescaling of the SVM margin, not a
# calibrated probability.
decision_scores = s.decision_function(X[:10])
expit(decision_scores)

array([0.01470026, 0.79215221, 0.10288423, 0.07695272, 0.15908538,
       0.27513584, 0.09726906, 0.1733546 , 0.78172254, 0.73141487])