# 采用预训练模型剔除异常数据

> [应用：数据预处理-异常值识别](https://www.jianshu.com/p/ac6418ee8e3f) - 简书

> [有哪些比较好的做异常值检测的方法？](https://www.zhihu.com/question/38066650) - 知乎

> [毕业设计 Dogs vs Cats For Udacity P7 (异常值检验)](https://zhuanlan.zhihu.com/p/34068451) - 知乎

In [None]:
import csv
import matplotlib.pyplot as plt

%matplotlib inline  

# Matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# later releases dropped the old names. Try the original name first and fall
# back to the renamed one so the notebook runs on old and new Matplotlib.
try:
    plt.style.use('seaborn-white')
except OSError:
    plt.style.use('seaborn-v0_8-white')

# target_size handed to image.load_img when images are prepared for the model.
img_size = (299, 299)

def get_imageNet_class(file_path, encoding=None):
    """Collect the class IDs of the CSV rows labelled as dog or cat.

    The first column of a row is taken as the class ID and the second column
    as its category label; rows whose label is '狗' (dog) or '猫' (cat) are
    selected.

    :param file_path: path of the CSV file to read
    :param encoding: text encoding used to open the file. ``None`` (the
        default) keeps the platform default, exactly as before; pass
        ``'utf-8'`` when the platform default cannot decode the labels.
    :return: list with the first-column value of every matching row, in
        file order
    """
    wanted_labels = ('狗', '猫')
    # newline='' is what the csv module asks for, so that it can interpret
    # line endings (including newlines inside quoted fields) itself.
    with open(file_path, 'r', encoding=encoding, newline='') as f:
        return [
            row[0]
            for row in csv.reader(f)
            # Blank lines are returned as [] and short rows have no label
            # column; skip them instead of failing with IndexError.
            if len(row) > 1 and row[1] in wanted_labels
        ]

# Class IDs from 'ImageNetClasses.csv' whose label is '狗' (dog) or '猫' (cat).
# get_result_list reads this module-level list to decide whether a prediction
# counts as a dog/cat class.
imageNet_class = get_imageNet_class('ImageNetClasses.csv')

In [None]:
import os

# Work from the 'image' sub-directory so that the relative 'train/' and
# 'outlier/' paths used in later cells resolve. The guard keeps a re-run of
# this cell from trying to descend into 'image/image'.
if os.path.basename(os.getcwd()) != 'image':
    os.chdir("{}/image".format(os.getcwd()))

In [None]:
from keras.applications.inception_v3 import InceptionV3
from keras.preprocessing import image
from keras.applications.inception_v3 import preprocess_input
from keras.applications.inception_v3 import decode_predictions
import numpy as np
import random
from math import ceil

def get_outlier_predictions(train_path, model, img_size, top=10):
    """Run ``model`` on every file in ``train_path`` and keep its top predictions.

    :param train_path: directory whose entries are loaded as images
    :param model: object whose ``predict`` output ``decode_predictions`` can
        decode
    :param img_size: ``target_size`` passed to ``image.load_img``
    :param top: number of predictions kept per image
    :return: dict mapping each file name to the first element of
        ``decode_predictions(preds, top=top)`` for that file
    """
    outlier_predictions = {}
    for name in os.listdir(train_path):
        # os.path.join works whether or not train_path ends with a separator;
        # plain concatenation built a wrong path for e.g. 'train'.
        img_path = os.path.join(train_path, name)
        img = image.load_img(img_path, target_size=img_size)

        # Add a leading axis so the single image forms a batch of one.
        batch = np.expand_dims(image.img_to_array(img), axis=0)
        batch = preprocess_input(batch)

        preds = model.predict(batch)
        outlier_predictions[name] = decode_predictions(preds, top=top)[0]

    return outlier_predictions

In [None]:
# InceptionV3 instantiated with its 'imagenet' weights; passed to
# get_outlier_predictions as the classifier.
model = InceptionV3(weights='imagenet')

In [None]:
# Top-10 decoded predictions for every file under 'train/'.
# NOTE: despite its name, outlier_list is a dict keyed by file name.
outlier_list = get_outlier_predictions('train/', model, img_size, 10)

In [None]:
import csv

def writer_csv(csv_path, data):
    """Write ``data`` to ``csv_path`` as two-column CSV rows: key, value.

    Values that are not strings are converted to text by the csv writer, so
    a list value is stored as its ``str()`` form.

    :type csv_path: str
    :type data: dict
    :rtype: None
    """
    # newline='' hands line-ending control to the csv module. Without it the
    # writer's '\r\n' is translated again on Windows and every record is
    # followed by an empty row.
    with open(csv_path, 'w', newline='') as csv_file:
        csv.writer(csv_file).writerows(data.items())
            
def readr_csv(csv_path):
    """Load a two-column CSV file into a dict: first column -> second column.

    Empty rows are ignored. Both keys and values are returned as strings.

    :type csv_path: str
    :rtype: dict
    :raises ValueError: if a non-empty row does not have exactly two fields
    """
    # newline='' lets the csv module interpret line endings itself.
    with open(csv_path, 'r', newline='') as csv_file:
        loaded = {}
        for row in csv.reader(csv_file):
            # A file written without newline='' on Windows has an empty row
            # after every record; dict(reader) would choke on those.
            if not row:
                continue
            key, value = row
            loaded[key] = value
        return loaded

In [None]:
# Save the per-file predictions to disk; the next cell reads them back from
# this file.
csv_path = 'train_decode_predictions.csv'
writer_csv(csv_path, outlier_list)

In [None]:
# Reload the saved predictions. Every value now comes back as a string (the
# CSV text of the prediction list), which get_result_list parses again.
outlier_list = readr_csv(csv_path)

In [None]:
def get_result_list(pred, target_classes=None):
    """Return the keys of ``pred`` whose predictions contain no target class.

    :param pred: dict mapping a file name to its predictions, given either as
        a sequence of entries or as the string form of such a sequence (as
        read back from the CSV file). Only the first item of each entry is
        compared against the target classes.
    :param target_classes: iterable of class IDs that count as a hit;
        defaults to the module-level ``imageNet_class``
    :return: list of the keys for which no entry matches a target class
    """
    if target_classes is None:
        target_classes = imageNet_class
    # Build the lookup once: set membership is constant-time, whereas the
    # list was scanned again for every single prediction.
    wanted = set(target_classes)

    result = []
    for key, value in pred.items():
        if isinstance(value, str):
            # NOTE(security): eval executes arbitrary code. It is tolerable
            # here only because the CSV is written by this notebook itself;
            # never feed this function a file from an untrusted source.
            value = eval(value)
        if not any(entry[0] in wanted for entry in value):
            result.append(key)

    return result

In [None]:
# File names for which none of the stored predictions is in imageNet_class.
outlier_result = get_result_list(outlier_list)

In [None]:
import cv2

def plt_outlier_img_1(outlier_list, img_dir='train/'):
    """Plot the given images in a 5-column grid, each titled with its file name.

    :param outlier_list: sequence of image file names
    :param img_dir: prefix prepended to every file name before it is read
        (default ``'train/'``)
    :rtype: None
    """
    columns = 5
    plt.figure(figsize=(16, 68), dpi=80)
    subplot_row = ceil(len(outlier_list) / columns)
    for position, file_name in enumerate(outlier_list, start=1):
        plt.subplot(subplot_row, columns, position)
        img = cv2.imread(img_dir + file_name)
        plt.title(file_name)
        plt.axis('off')
        if img is None:
            # cv2.imread reports a missing or unreadable file by returning
            # None; leave the cell empty instead of crashing in cv2.resize.
            continue
        img = cv2.resize(img, (224, 224))
        # OpenCV delivers the channels in BGR order; reversing the last axis
        # gives the RGB order that imshow expects.
        plt.imshow(img[:, :, ::-1])

In [None]:
# Plot every image returned by get_result_list.
plt_outlier_img_1(outlier_result)

In [None]:
import shutil

# Files to take out of the training set. The list is hard-coded; presumably it
# was picked by hand after reviewing the plot above -- TODO confirm.
pick_out_outlier_list = ['dog.3889.jpg', 'cat.8100.jpg', 'dog.9188.jpg', 'cat.10700.jpg', 'dog.12376.jpg',
                        'dog.1043.jpg', 'dog.5490.jpg', 'dog.1895.jpg', 'dog.1308.jpg', 'dog.1194.jpg',
                        'cat.7564.jpg', 'cat.3216.jpg', 'dog.4218.jpg', 'cat.4338.jpg', 'dog.5604.jpg',
                        'cat.10712.jpg', 'dog.4367.jpg', 'dog.10237.jpg', 'cat.9171.jpg', 'dog.8736.jpg',
                        'cat.4688.jpg', 'dog.11299.jpg', 'cat.10029.jpg', 'cat.7968.jpg', 'cat.8470.jpg',
                        'cat.3868.jpg', 'dog.2614.jpg', 'cat.5418.jpg', 'cat.7377.jpg', 'cat.12272.jpg',
                        'dog.10161.jpg', 'dog.1259.jpg', 'dog.1773.jpg', 'dog.6475.jpg', 'dog.11186.jpg',
                        'cat.11184.jpg', 'cat.2939.jpg', 'dog.10747.jpg', 'dog.9517.jpg', 'dog.10190.jpg',
                        'cat.8456.jpg', 'dog.10801.jpg', 'dog.8898.jpg']

print("Len outlier_list: {}".format(len(pick_out_outlier_list)))

# shutil.move does not create the destination directory, so make sure it
# exists before the first file is moved (os is imported in an earlier cell).
os.makedirs('outlier', exist_ok=True)

plt.figure(figsize=(12, 20))
subplot_row = ceil(len(pick_out_outlier_list) / 5)
for position, file_name in enumerate(pick_out_outlier_list, start=1):
    source = 'train/' + file_name
    target = 'outlier/' + file_name
    # When this cell is re-run the file already sits in 'outlier/'; display
    # it from there and do not try to move it a second time.
    already_moved = os.path.exists(target) and not os.path.exists(source)

    plt.subplot(subplot_row, 5, position)
    img = image.load_img(target if already_moved else source)
    plt.title(file_name)
    plt.axis('off')
    plt.imshow(img)
    if not already_moved:
        shutil.move(source, target)