In [1]:
import os
import gc
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from  matplotlib import pyplot as plt

import tensorflow as tf

In [2]:
mvsa_single_data_path = '../input/mvsasingle/MVSA_Single/data'
mvsa_single_label_path = '../input/mvsasingle/MVSA_Single/labelResultAll.txt'
mvsa_multiple_data_path = '../input/mvsamultiple/MVSA/data'
mvsa_multiple_label_path = '../input/mvsamultiple/MVSA/labelResultAll.txt'

IMAGE_SIZE = (224, 224)
NUM_CHANNELS = 3

In [3]:
def read_text_file(path, multi_line=False):
    if multi_line == True:
        lines = open(path, 'r', encoding='latin-1').readlines()
        lines = [line.rstrip('\n') for line in lines]
        return lines
    return open(path, 'r', encoding='latin-1').read()

def read_image_file(path):
    try:
        image = cv2.imread(path, cv2.COLOR_BGR2RGB)
        image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
        invalid_ID = -1
    except:
        image = np.zeros((IMAGE_SIZE[0], IMAGE_SIZE[1], NUM_CHANNELS))
        invalid_ID = os.path.split(path)[1].split('.')[0]
    return image, invalid_ID

def read_labels_file(path):
    dataframe = pd.read_csv(path, sep="\s+|,", engine="python")
    return dataframe

In [4]:
# since there are 3 annotators defining each data labels in the MVSA-Multiple dataset
# we take the labels pair that 2 out 3 annotators agree on as the valid labels pair, remove otherwise
def merge_multi_label(dataframe):
    anno_1 = list(dataframe.iloc[:, 1:3].itertuples(index=False, name=None))
    anno_2 = list(dataframe.iloc[:, 3:5].itertuples(index=False, name=None))
    anno_3 = list(dataframe.iloc[:, 5:7].itertuples(index=False, name=None))
    IDs = list(dataframe.iloc[:, 0])
    
    valid_pairs = []
    
    for i in range(len(anno_1)):
        pairs = [anno_1[i], anno_2[i], anno_3[i]]
        ID = IDs[i]
        
        valid_pair = tuple([pair for pair in pairs if pairs.count(pair) > 1])
        
        if len(valid_pair) == 0:
            valid_pair = (ID, 'invalid', 'invalid')
        else:
            valid_pair = (ID, valid_pair[0][0], valid_pair[0][1])
        valid_pairs.append(valid_pair)
        
    valid_dataframe = pd.DataFrame(valid_pairs, columns=['ID', 'text', 'image'])
    return valid_dataframe

def multimodal_label(text_label, image_label):
    if text_label == image_label:
        label = text_label
    elif (text_label == 'positive' and image_label == 'negative') or (text_label == 'negative' and image_label == 'positive'):
        label = 'invalid'
    elif (text_label == 'neutral' and image_label != 'neutral') or (text_label != 'neutral' or image_label == 'neutral'):
        label = image_label if text_label == 'neutral' else text_label
    return label

In [5]:
def get_data_paths(path, extension):
    paths = os.listdir(path)
    paths.sort(key = lambda x : int(x.split('.')[0]))
    paths = list(filter(lambda x: x.endswith(extension), paths))
    paths = [os.path.join(path, x) for x in paths]
    return paths

def get_image_with_id(path):
    filename = os.path.split(path)[1]
    ID = int(filename.split('.')[0])
    image = read_image_file(path)
    return (ID, image)

In [6]:
def create_labels(path, multiple=False, mappings=False):
    dataframe = read_labels_file(path)
    
    if multiple == True:
        dataframe = merge_multi_label(dataframe)

    labels = []
    for label_pair in dataframe.iloc[:, 1:].values:
        label = multimodal_label(label_pair[0], label_pair[1])
        labels.append(label)
        
    if mappings == True:
        label_map = {}
        for i in range(len(labels)):
            ID = dataframe.iloc[i, 0]
            label_map[ID] = labels[i]            
        return label_map
    
    return np.array(labels, dtype='object')

def create_text_data(path):
    texts = []
    text_paths = get_data_paths(path, '.txt')
    
    print('Read text data')
    for text_path in tqdm(text_paths):
        text = read_text_file(text_path).rstrip('\n')
        texts.append(text)
        
    return np.array(texts)

def create_image_data(path):
#     images = []
    images = np.array([])

    invalid_indices = []
    image_paths = get_data_paths(path, '.jpg')

    print('Read image data')
    for image_path in tqdm(image_paths):
        image, invalid_ID = read_image_file(image_path)
        if images.shape[0] == 0:
            images = np.array([image])
        else:
            images = np.concatenate((images, [image]))
#         images.append(image)

        if invalid_ID != -1:
            invalid_indices.append(invalid_ID)
            
    return images, invalid_indices

In [7]:
a = np.array([])

In [8]:
# a = np.array([np.zeros((5, 5, 3))])
# b = np.zeros((5, 5, 3))
# c = np.concatenate((a, [b]))
# d = np.concatenate((c, [b]))
# d.shape

In [9]:
# if extracted_data.shape[0] == 0:
#     extracted_data = np.concatenate(([extracted_data], [features]), 1)
# else:
#     extracted_data = np.concatenate((extracted_data, [features]), 0)

In [10]:
def invalid_indices(labels):
    invalid_indices = [i for i in range(labels.shape[0]) if labels[i] == 'invalid']
    return indices

def remove_invalid(data, indices):
#     indices = invalid_indices(labels)
    new_data =  np.delete(data, indices)
    return new_data

In [11]:
def save_text_file(filename, lines):
    with open(filename, 'w', encoding='latin1') as f:
        f.write('\n'.join(lines))

In [12]:
# mvsa_single_texts = create_text_data(mvsa_single_data_path)
mvsa_single_images, mvsa_single_images_invalid_indices = create_image_data(mvsa_single_data_path)
# mvsa_single_labels = create_labels(mvsa_single_label_path)

Read image data


100%|██████████| 4869/4869 [18:39<00:00,  4.35it/s]


In [9]:
# mvsa_multiple_texts = create_text_data(mvsa_multiple_data_path)
mvsa_multiple_images, mvsa_multiple_images_invalid_indices = create_image_data(mvsa_multiple_data_path)
# mvsa_multiple_labels = create_labels(mvsa_multiple_label_path, multiple=True)

Read image data


 62%|██████▏   | 12238/19600 [02:32<01:44, 70.18it/s]Premature end of JPEG file
100%|██████████| 19600/19600 [04:11<00:00, 77.87it/s]


# Remove invalid data

In [None]:
mvsa_single_labels_invalid_indices = [i for i in range(mvsa_single_labels.shape[0]) if mvsa_single_labels[i] == 'invalid']

mvsa_single_invalid_indices = []
mvsa_single_invalid_indices.extend(mvsa_single_labels_invalid_indices)
mvsa_single_invalid_indices.extend(mvsa_single_images_invalid_indices)
mvsa_single_invalid_indices = list(set(mvsa_single_invalid_indices))

mvsa_single_texts_valid = remove_invalid(mvsa_single_texts, mvsa_single_invalid_indices)
mvsa_single_images_valid = remove_invalid(mvsa_single_images, mvsa_single_invalid_indices)
mvsa_single_labels_valid = remove_invalid(mvsa_single_labels, nmvsa_single_invalid_indicesvalid_indices)

In [None]:
mvsa_multiple_labels_invalid_indices = [i for i in range(mvsa_multiple_labels.shape[0]) if mvsa_multiple_labels[i] == 'invalid']

mvsa_multiple_invalid_indices = []
mvsa_multiple_invalid_indices.extend(mvsa_multiple_labels_invalid_indices)
mvsa_multiple_invalid_indices.extend(mvsa_multiple_images_invalid_indices)
mvsa_multiple_invalid_indices = list(set(mvsa_multiple_invalid_indices))

mvsa_multiple_texts_valid = remove_invalid(mvsa_multiple_texts, mvsa_multiple_invalid_indices)
mvsa_multiple_images_valid = remove_invalid(mvsa_multiple_images, mvsa_multiple_invalid_indices)
mvsa_multiple_labels_valid = remove_invalid(mvsa_multiple_labels, mvsa_multiple_invalid_indices)

In [None]:
save_text_file('./mvsa-single-texts.txt', mvsa_single_texts_valid)
np.save('./mvsa-single-images.npy', mvsa_single_images_valid)
save_text_file('./mvsa-single-labels.txt', mvsa_single_labels_valid)

save_text_file('./mvsa-multiple-texts.txt', mvsa_multiple_texts_valid)
np.save('./mvsa-multiple-images.npy', mvsa_multiple_images_valid)
save_text_file('./mvsa-multiple-labels.txt', mvsa_multiple_labels_valid)

In [None]:
mvsa_single_texts_loaded = read_text_file('./mvsa-single-texts.txt', multi_line=True)
mvsa_single_images_loaded = np.load('./mvsa-single-images.npy')
mvsa_single_labels_loaded = read_text_file('./mvsa-single-labels.txt', multi_line=True)

print((mvsa_single_texts_valid == mvsa_single_texts_loaded).all())
print((mvsa_single_images_loaded == mvsa_single_images_valid).all())
print((mvsa_single_labels_valid == mvsa_single_labels_loaded).all())

In [None]:
mvsa_multiple_texts_loaded = read_text_file('./mvsa-multiple-texts.txt', multi_line=True)
mvsa_multiple_images_loaded = np.load('./mvsa-multiple-images.npy')
mvsa_multiple_labels_loaded = read_text_file('./mvsa-multiple-labels.txt', multi_line=True)

print((mvsa_multiple_texts_valid == mvsa_multiple_texts_loaded).all())
print((mvsa_multiple_images_loaded == mvsa_multiple_images_valid).all())
print((mvsa_multiple_labels_valid == mvsa_multiple_labels_loaded).all())

In [None]:
############
# df_labels = read_labels_file('../input/mvsasingle/MVSA_Single/labelResultAll.txt')
# df_labels['text'].count
# label_text_count = df_labels.value_counts('text')
# label_image_count = df_labels.value_counts('image')
# label_text_count

In [None]:
# text_ids = []
# image_ids = []
# for i in range(len(texts)):
#     if texts[i][0] != images[i][0] != read_labels_file(label_path)['ID'][i]:
#         print('here')

In [None]:
# u, c, i = np.unique(texts, return_index=True, return_counts=True, axis=1)
# dup = u[c > 1]
# u[:10]

In [None]:
# def display_sample(text, image):
#     plt.imshow(image)
#     print('Text:', text)
#     plt.show()
# display_sample(texts_with_id[2][1], images_with_id[2][1])

In [None]:
# shapes = []
# for image in images_with_id:
#     shapes.append(image.shape)
# shapes

In [None]:
# raw_texts[raw_texts.duplicated('Text')]

In [None]:
# len_split = lambda x: len(x.split())
# print()
# print('Min number of words in text:', raw_texts['Text'].apply(len_split).min())
# print('Max number of words in text:', raw_texts['Text'].apply(len_split).max())
# print('Average number of words in text:', round(raw_texts['Text'].apply(len_split).mean()))

# Drafts

In [None]:
# # mvsa_single_images.tofile('./mvsa-single-images.npy')
# np.save('./mvsa-single-images.npy', mvsa_single_images)
# mvsa_single_images_loaded = np.load('./mvsa-single-images.npy')
# (mvsa_single_images_loaded == mvsa_single_images).all()

In [None]:
# a = read_labels_file(mvsa_single_label_path).iloc[:, 1:].values
# for i in a:
#     if i[0] == 'negative' and i[1] == 'positive':
#         print('here')
#     elif i[0] == 'positive' and i[1] == 'negative':
#         print('hereee')

In [None]:
# label_mappings = create_labels(mvsa_single_label_path, mappings=True)

# image_dataset = make_dataset([os.path.join(mvsa_single_label_path, str(ID) + '.jpg') for ID in label_mappings.keys()],
#                              list(label_mappings.values()))

In [None]:
# a = get_image_with_id(os.path.join(mvsa_single_data_path, '1.jpg'))
# tf.keras.utils.array_to_img(a[1])

In [None]:
# label_mappings = create_labels(mvsa_single_label_path, mappings=True)

# image_dataset = make_dataset([os.path.join(mvsa_single_label_path, str(ID) + '.jpg') for ID in label_mappings.keys()],
#                              list(label_mappings.values()))

In [None]:
# a = os.listdir(mvsa_single_data_path)
# a.sort(key = lambda x: int(x.split('.')[0]))
# list(filter(lambda x: x.endswith('.jpg'), a))
# a = [os.path.join(mvsa_single_data_path, i) for i in a]
# a

In [None]:
# # a = create_labels(mvsa_multiple_label_path, multiple=True, mappings=True)
# def process_image(path, label):
# #     filename = os.path.split(path)[1]
# #     ID = int(filename.split('.')[0])
# #     return read_image_file(path), label_map[ID]
#     return read_image_file(path), label

# def make_dataset(images, labels):
#     dataset = tf.data.Dataset.from_tensor_slices((images, labels))
# #     dataset = dataset.shuffle(len(images))
#     dataset = dataset.map(process_image)#, num_parallel_calls=AUTOTUNE)
# #     dataset = dataset.batch(100).prefetch(tf.data.AUTOTUNE)
#     return dataset

In [None]:
# IMAGE_SIZE = (224, 224)
# def read_image_file(path):
#     image = cv2.imread(path, cv2.COLOR_BGR2RGB)
# #     image = np.array(image)#.astype('float32')
# #     image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
# #     image /= 255
# #     print(image.shape)
# #     ax = plt.subplot(1,2,1)
# #     plt.imshow(image)
# #     image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
# #     ax = plt.subplot(1,2,2)
# #     plt.imshow(image)
# #     print(image.shape)
#     return image
# read_image_file('../input/mvsasingle/MVSA_Single/data/10.jpg')      

In [None]:
# def read_text_file(path, multi_line=False):
# #     with open(path, 'r', encoding='latin-1') as f:
# #         if multi_line == True:
# #             lines = f.readlines()
# #             lines = [line.rstrip('\n') for line in lines]
# #             return lines
# #         return f.read()
    
#     if multi_line == True:
#         lines = open(path, 'r', encoding='latin-1').readlines()
#         lines = [line.rstrip('\n') for line in lines]
#         return lines
#     return open(path, 'r', encoding='latin-1').read()

# def read_image_file(path):
#     try:
#         image = cv2.imread(path, cv2.COLOR_BGR2RGB)
#         image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
#     except:
#         image = np.zeros((IMAGE_SIZE[0], IMAGE_SIZE[1], NUM_CHANNELS))
#         ID = os.path.split(path)[1].split('.')[0]
#         invalid_indices.append(ID)
# #     image = tf.io.read_file(path)
# #     image = tf.image.decode_jpeg(image, channels=NUM_CHANNELS)
# #     image = tf.image.resize(image, IMAGE_SIZE)
#     return image

# def read_labels_file(path):
#     dataframe = pd.read_csv(path, sep="\s+|,", engine="python")
#     return dataframe

In [None]:
# def create_labels(path, multiple=False, mappings=False):
#     dataframe = read_labels_file(path)
    
#     if multiple == True:
#         dataframe = merge_multi_label(dataframe)
        
#     labels = []
#     for _, row in dataframe.iterrows():
#         label = multimodal_label(row['text'], row['image'])
#         labels.append(label)
        
#     if mappings == True:
#         label_map = {}
#         for i in range(len(labels)):
#             ID = dataframe.iloc[i, 0]
#             label_map[ID] = labels[i]            
#         return label_map
#     return np.array(labels, dtype='object')

# def create_text_data(path):
#     texts = []
    
#     print('Read text data')
#     # read data along with its filename as ID
#     for filename in tqdm(os.listdir(path)):
#         ID = int(filename.split('.')[0])
#         file_path = os.path.join(path, filename)
#         if filename.endswith('txt'):
#             text = read_text_file(file_path)
#             texts.append((ID, text))

#     # Sort data by its ID
#     get_ID = lambda x : x[0]
#     texts.sort(key=get_ID)
    
#     # return data without ID
#     texts = np.array([text[1].rstrip('\n') for text in texts])

#     return texts

# def create_image_data(path):
#     images = []
     
#     image_paths = os.listdir(path)
#     image_paths.sort(key = lambda x : int(x.split('.')[0]))
#     image_paths = list(filter(lambda x: x.endswith('.jpg'), image_paths))
#     image_paths = [os.path.join(path, x) for x in image_paths]
# #     print(image_path)
    
#     print('Read image data')
#     # read data along with its filename as ID
#     for image_path in tqdm(image_paths):
# #         ID = int(filename.split('.')[0])
# #         file_path = os.path.join(path, filename)
# #         gc.collect()
# #         if filename.endswith('jpg'):
# #         image_with_id = get_image_with_id(image_path)
# #         images.append(image_with_id)
#         image = read_image_file(image_path)
#         images.append(image)
# #         del image
# #         gc.collect()
        
    
# #     # Sort data by its ID
# #     get_ID = lambda x : x[0]
# #     images.sort(key=get_ID)
    
#     # return data without ID
# #     images = np.array([image[1] for image in images], dtype='object')

#     return images

# def get_image_with_id(path):
#     filename = os.path.split(path)[1]
#     ID = int(filename.split('.')[0])
#     image = read_image_file(path)
#     return (ID, image)