Stworzenie zbioru z nazwami zdjęć i dopasowaniem typu

In [1]:
import pandas as pd
import numpy as np

df_test = pd.read_csv('test_split.txt',
                 delimiter = " ",
                index_col=False)
df_test.columns = ["patient", "filename", "label_str", "data"]

df_train = pd.read_csv('train_split.txt',
                 delimiter = " ",
                index_col=False)
df_train.columns = ["patient", "filename", "label_str", "data"]

df_test = df_test.assign(label=pd.Series(np.zeros(len(df_test['label_str']))).values)
df_train = df_train.assign(label=pd.Series(np.zeros(len(df_train['label_str']))).values)

df_test.loc[df_test['label_str']== "normal", 'label'] = 0
df_test.loc[df_test['label_str']== "COVID-19", 'label'] = 1
df_test.loc[df_test['label_str']== "pneumonia", 'label'] = 2

df_train.loc[df_train['label_str']== "normal", 'label'] = 0
df_train.loc[df_train['label_str']== "COVID-19", 'label'] = 1
df_train.loc[df_train['label_str']== "pneumonia", 'label'] = 2

In [2]:
test = df_test[['filename', 'label', 'label_str']]
train = df_train[['filename', 'label', 'label_str']]

Oversampling

In [3]:
from imblearn.over_sampling import RandomOverSampler

In [4]:
def random_minority_oversample(train_set):
    '''
    Oversample the minority class using the specified algorithm
    :param train_set: Training set image file names and labels
    :return: A new training set containing oversampled examples
    '''
    X_train = train_set[[x for x in train_set.columns if x != 'label']].to_numpy()
    if X_train.shape[1] == 1:
        X_train = np.expand_dims(X_train, axis=-1)
    Y_train = train_set['label'].to_numpy()
    sampler = RandomOverSampler(random_state=np.random.randint(0, high=1000))
    X_resampled, Y_resampled = sampler.fit_resample(X_train, Y_train)
    filenames = X_resampled[:, 0]     # Filename is in first column
    label_strs = X_resampled[:, 1]    # Class name is in second column
    print("Train set shape before oversampling: ", X_train.shape, 
          " Train set shape after resampling: ", X_resampled.shape)
    train_set_resampled = pd.DataFrame({'filename': filenames, 'label': Y_resampled, 'label_str': label_strs})
    return train_set_resampled

In [5]:
np.random.seed(2137)
train_o = random_minority_oversample(train)

Train set shape before oversampling:  (15110, 2)  Train set shape after resampling:  (23898, 2)


Undersampling

In [6]:
from imblearn.under_sampling import RandomUnderSampler

In [7]:
def random_majority_undersample(train_set):
    '''
    Undersample the majority class using the specified algorithm
    :param train_set: Training set image file names and labels
    :return: A new training set containing undersampled examples
    '''
    X_train = train_set[[x for x in train_set.columns if x != 'label']].to_numpy()
    if X_train.shape[1] == 1:
        X_train = np.expand_dims(X_train, axis=-1)
    Y_train = train_set['label'].to_numpy()
    sampler = RandomUnderSampler(random_state=np.random.randint(0, high=1000))
    X_resampled, Y_resampled = sampler.fit_resample(X_train, Y_train)
    filenames = X_resampled[:, 0]     # Filename is in first column
    label_strs = X_resampled[:, 1]    # Class name is in second column
    print("Train set shape before oversampling: ", X_train.shape, 
          " Train set shape after resampling: ", X_resampled.shape)
    train_set_resampled = pd.DataFrame({'filename': filenames, 'label': Y_resampled, 'label_str': label_strs})
    return train_set_resampled

In [8]:
np.random.seed(2137)
train_u = random_majority_undersample(train)

Train set shape before oversampling:  (15110, 2)  Train set shape after resampling:  (5010, 2)


Ładowanie zdjęć

In [9]:
train_o.columns

Index(['filename', 'label', 'label_str'], dtype='object')

In [10]:
train_u.columns

Index(['filename', 'label', 'label_str'], dtype='object')

In [11]:
import os
from PIL import Image

def load_images2(data_split_table, img_path):
    n_images = len(data_split_table)
    images = np.zeros((n_images, 224, 224, 3))
    labels = np.empty(n_images, dtype='object')
    for i in range(n_images):
        name = data_split_table.filename[i]
        image = Image.open(os.path.join(img_path, name))
        image = image.convert('RGB').resize((224,224))
        image = np.asarray(image)
        images[i] = image
        labels[i] = dict_train[name]
    return images, labels

In [12]:
# tworzenie datasetu po over/under samplingu

In [13]:
train_path = os.path.join("data2", "train")
test_path = os.path.join("data2", "test")

In [14]:
train_o.shape

(23898, 3)

In [15]:
train_u.shape

(5010, 3)

In [16]:
#x_train, y_train = load_images2(train_o, train_path)
#x_test, y_test = load_images2(train_u, test_path)