图像分类基础实例
=============

这一部分我们学习简单的数据处理操作。这里我们以 [notMNIST](http://yaroslavvb.blogspot.com/2011/09/notmnist-dataset.html) 数据集作为例子。

In [2]:
# 这些是本例子所需的python库。
# 例如scipy,可能需要单独安装。
# pip install -U scikit-learn
# pip install Pillow
import matplotlib.pyplot as plt  # plotting
import numpy as np  # numerical computing
import os
import sys
import tarfile
from IPython.display import display, Image
from scipy import ndimage  # image processing
from sklearn.linear_model import LogisticRegression
from six.moves.urllib.request import urlretrieve  # network download
from six.moves import cPickle as pickle

将数据集下载到本机。
---------

这个数据集包含不同字体的英文字母（下采样到28x28像素的图像）。这些图像分为A到J共10个类。训练集包含约500k个样本，测试集包含约19k个样本。

In [3]:
# Base URL the notMNIST archives are downloaded from.
url = 'http://yaroslavvb.com/upload/notMNIST/'


def maybe_download(filename, expected_bytes):
  """Download a file if not present, and make sure it's the right size.

  Args:
    filename: Archive name. It is appended to `url` to build the download
      address and is also used as the local path.
    expected_bytes: Exact size in bytes the local file must have.

  Returns:
    The local path of the verified file.

  Raises:
    Exception: If the size of the local file differs from `expected_bytes`.
  """
  if not os.path.exists(filename):
    filename, _ = urlretrieve(url + filename, filename)
  actual_bytes = os.stat(filename).st_size
  if actual_bytes != expected_bytes:
    # A size mismatch usually means a truncated or failed download.
    raise Exception(
      'Failed to verify ' + filename + '. Can you get to it with a browser?')
  print('Found and verified', filename)
  return filename

# Fetch (or reuse) both archives; the second argument is the expected file
# size in bytes that maybe_download() checks against.
train_filename = maybe_download('notMNIST_large.tar.gz', 247336696)
test_filename = maybe_download('notMNIST_small.tar.gz', 8458043)

('Found and verified', 'notMNIST_large.tar.gz')
('Found and verified', 'notMNIST_small.tar.gz')


解压.tar.gz 文件到相应的目录。
---------

In [7]:
num_classes = 10  # Number of class folders every archive must contain.


def extract(filename):
  """Unpack a .tar.gz archive and return its per-class folders.

  The archive is expected to contain a single top-level directory named
  after the archive (without the `.tar.gz` suffix) holding one sub-folder
  per class.

  Args:
    filename: Path of the `.tar.gz` archive.

  Returns:
    Sorted list of the class folder paths below the extracted root.

  Raises:
    Exception: If the number of class folders differs from `num_classes`.
  """
  root = os.path.splitext(os.path.splitext(filename)[0])[0]  # strip .tar.gz
  # Unpack next to the archive so that `root` really is where the files end
  # up, even when `filename` carries a directory component.
  target_dir = os.path.dirname(root) or os.curdir
  print('解压 %s.' % root)
  sys.stdout.flush()
  # NOTE: extractall() trusts the member paths stored in the archive; only
  # use it on archives from a trusted source.
  # The with-block closes the archive even if extraction fails.
  with tarfile.open(filename) as tar:
    tar.extractall(target_dir)
  # Keep directories only, so stray files (.DS_Store, READMEs, ...) are not
  # mistaken for classes.
  data_folders = [
    os.path.join(root, d) for d in sorted(os.listdir(root))
    if os.path.isdir(os.path.join(root, d))]
  if len(data_folders) != num_classes:
    raise Exception(
      'Expected %d folders, one per class. Found %d instead.' % (
        num_classes, len(data_folders)))
  print(data_folders)
  return data_folders
  
# Unpack both archives and keep the list of class folders of each one.
train_folders = extract(train_filename)
test_folders = extract(test_filename)

解压 notMNIST_large.
['notMNIST_large/A', 'notMNIST_large/B', 'notMNIST_large/C', 'notMNIST_large/D', 'notMNIST_large/E', 'notMNIST_large/F', 'notMNIST_large/G', 'notMNIST_large/H', 'notMNIST_large/I', 'notMNIST_large/J']
解压 notMNIST_small.
['notMNIST_small/A', 'notMNIST_small/B', 'notMNIST_small/C', 'notMNIST_small/D', 'notMNIST_small/E', 'notMNIST_small/F', 'notMNIST_small/G', 'notMNIST_small/H', 'notMNIST_small/I', 'notMNIST_small/J']


利用IPython.display包显示图像数据
---------

这里将图像文件转换为三维数组，并做简单的归一化处理（这里称为“白化”：使均值接近零、像素值落在 [-0.5, 0.5] 区间内），以利于之后的模型训练。标注被存储为一个取值范围是 [0, 9] 的整数向量。

In [41]:
image_size = 28  # Image width and height in pixels.
pixel_depth = 255.0  # Grey-level range of each pixel.


def load(data_folders, max_num_images, read_image=None):
  """Load images from per-class folders into normalized arrays.

  Every pixel value v is mapped to (v - pixel_depth / 2) / pixel_depth.
  The label of an image is the index of its folder in `data_folders`.

  Args:
    data_folders: List of folders, one per class.
    max_num_images: Upper bound on the total number of images loaded; the
      budget is split evenly across the folders.
    read_image: Optional callable mapping a file path to a 2-D array.
      Defaults to `ndimage.imread`.

  Returns:
    A tuple `(dataset, labels, files)`: a float32 array of shape
    (num_images, image_size, image_size), an int32 label vector and the
    list of the source file paths, all in the same order.

  Raises:
    Exception: If an image does not have shape (image_size, image_size).
  """
  if read_image is None:
    # NOTE: scipy.ndimage.imread was removed in SciPy 1.2; on newer SciPy
    # versions pass a replacement reader through `read_image`.
    read_image = ndimage.imread
  # Split the budget over the folders actually given instead of assuming
  # there are exactly 10 of them; this also guarantees the buffers below can
  # never overflow.
  per_class = max_num_images // len(data_folders) if data_folders else 0
  dataset = np.ndarray(
    shape=(max_num_images, image_size, image_size), dtype=np.float32)
  labels = np.ndarray(shape=(max_num_images,), dtype=np.int32)
  files = []
  image_index = 0  # Next free slot in `dataset` / `labels`.
  for image_label, folder in enumerate(data_folders):
    print(folder)
    for image in os.listdir(folder)[:per_class]:
      image_file = os.path.join(folder, image)
      try:
        image_data = (
          read_image(image_file).astype(float) - pixel_depth / 2) / pixel_depth
        if image_data.shape != (image_size, image_size):
          raise Exception('图像形状异常: %s' % str(image_data.shape))
        dataset[image_index, :, :] = image_data
        labels[image_index] = image_label
        files.append(image_file)
        image_index += 1
      except IOError as e:
        # Unreadable files are reported and skipped; they still count
        # against the per-class budget.
        print(e)
  # Drop the unused tail of the pre-allocated buffers.
  num_images = image_index
  dataset = dataset[0:num_images, :, :]
  labels = labels[0:num_images]
  print('data shape:', dataset.shape)
  print('mean:', np.mean(dataset))
  print('stddev:', np.std(dataset))  # np.std is the standard deviation.
  print('labels:', labels.shape)
  return dataset, labels, files

# Load a small subset: at most 5000 training and 200 test images in total
# (load() caps how many files are taken from each class folder).
train_dataset, train_labels, train_files = load(train_folders, 5000)
test_dataset, test_labels, test_files = load(test_folders, 200)

notMNIST_large/A
notMNIST_large/B
notMNIST_large/C
notMNIST_large/D
notMNIST_large/E
notMNIST_large/F
notMNIST_large/G
notMNIST_large/H
notMNIST_large/I
notMNIST_large/J
('data shape:', (5000, 28, 28))
('mean:', -0.027473768)
('variance:', 0.45789579)
('labels:', (5000,))
notMNIST_small/A
notMNIST_small/B
notMNIST_small/C
notMNIST_small/D
notMNIST_small/E
notMNIST_small/F
notMNIST_small/G
notMNIST_small/H
notMNIST_small/I
notMNIST_small/J
('data shape:', (200, 28, 28))
('mean:', -0.14173354)
('variance:', 0.43553993)
('labels:', (200,))


In [22]:
# Re-read the first loaded training image from disk and display it.
# NOTE(review): scipy.ndimage.imread is not available in newer SciPy
# releases -- confirm the SciPy version this notebook is run with.
image = ndimage.imread(train_files[0])
plt.imshow(image)
plt.show()

打乱图像的顺序
---------

这一步对于取得好的训练效果非常重要

In [42]:
np.random.seed(133)  # Fixed seed so that the shuffles below are repeatable.


def randomize(dataset, labels):
  """Shuffle images and labels with one shared random order.

  Args:
    dataset: 3-D array whose first axis indexes the images.
    labels: 1-D array with one label per image.

  Returns:
    A tuple `(shuffled_dataset, shuffled_labels)`; image i of the first
    still belongs to label i of the second.
  """
  order = np.random.permutation(labels.shape[0])
  return dataset[order, :, :], labels[order]

# Shuffle each split; images and labels are permuted together so they stay
# aligned.
train_dataset, train_labels = randomize(train_dataset, train_labels)
test_dataset, test_labels = randomize(test_dataset, test_labels)

建立验证数据集
---------

深度模型一般有很多超参数需要调整。如果直接依据测试数据集来调整参数可能会造成过拟合，因此我们从训练集中划出一部分样本作为验证集，专门用于调整超参数。

In [43]:
valid_size = 1000  # Number of samples held out for validation.

# The first `valid_size` samples become the validation set; everything after
# them stays in the training set.
valid_dataset, train_dataset = (
  train_dataset[:valid_size, :, :], train_dataset[valid_size:, :, :])
valid_labels, train_labels = (
  train_labels[:valid_size], train_labels[valid_size:])
print('Training', train_dataset.shape, train_labels.shape)
print('Validation', valid_dataset.shape, valid_labels.shape)

('Training', (4000, 28, 28), (4000,))
('Validation', (1000, 28, 28), (1000,))


将数据集存储为文件
---------

In [44]:
pickle_file = 'notMNIST.pickle'

# Bundle all three splits into a single dictionary so they can be restored
# with one pickle load.
save = {
  'train_dataset': train_dataset,
  'train_labels': train_labels,
  'valid_dataset': valid_dataset,
  'valid_labels': valid_labels,
  'test_dataset': test_dataset,
  'test_labels': test_labels,
  }
try:
  # The with-block closes the file even when pickle.dump() raises.
  with open(pickle_file, 'wb') as f:
    pickle.dump(save, f, pickle.HIGHEST_PROTOCOL)
except Exception as e:
  # Report which file could not be written, then let the error propagate.
  print('Unable to save data to', pickle_file, ':', e)
  raise

In [45]:
# Report the on-disk size (in bytes) of the pickle file.
# NOTE(review): the message says "Compressed", but no compression step is
# visible in this notebook -- confirm the wording.
statinfo = os.stat(pickle_file)
print('Compressed pickle size:', statinfo.st_size)

('Compressed pickle size:', 16328435)


尝试训练并评估 sklearn.linear_model.LogisticRegression模型
---------


可以尝试不同训练集大小得到的准确度

In [53]:
# Flatten every 28x28 training image into one 784-element feature row.
train_examples = train_dataset.reshape((train_dataset.shape[0], 28 * 28))

In [54]:
# Fit a logistic-regression classifier on the flattened training images.
# In scikit-learn, C is the inverse of the regularization strength, so
# C=1e5 applies only a very weak penalty.
logreg = LogisticRegression(C=1e5)
logreg.fit(train_examples, train_labels)

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [55]:
# Flatten every 28x28 validation image into one 784-element feature row.
valid_examples = valid_dataset.reshape((valid_dataset.shape[0], 28 * 28))

In [61]:
# Accuracy = fraction of validation samples predicted correctly. np.mean
# takes the denominator from the data itself, so the result cannot drift if
# valid_size and the actual number of validation samples ever disagree.
float(np.mean(logreg.predict(valid_examples) == valid_labels))

0.483

In [64]:
# Refit the same model on only the first 1000 training examples to see how
# the training-set size affects accuracy.
logreg.fit(train_examples[:1000], train_labels[:1000])

LogisticRegression(C=100000.0, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [65]:
# Accuracy of the model refit on the smaller training subset. np.mean takes
# the denominator from the data itself instead of relying on valid_size.
float(np.mean(logreg.predict(valid_examples) == valid_labels))

0.515