In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from  matplotlib import pyplot as plt

In [2]:
# Input locations for the two MVSA datasets used in this notebook.
# The *_data_path directories are listed by create_text_data (files ending in
# 'txt') and create_image_data (files ending in 'jpg'); the *_label_path files
# are parsed by read_labels_file via create_labels.
mvsa_single_data_path = '../input/mvsasingle/MVSA_Single/data'
mvsa_single_label_path = '../input/mvsasingle/MVSA_Single/labelResultAll.txt'
mvsa_multiple_data_path = '../input/mvsamultiple/MVSA/data'
mvsa_multiple_label_path = '../input/mvsamultiple/MVSA/labelResultAll.txt'

In [3]:
def read_text_file(path, multi_line=False):
    """Read a latin-1 encoded text file.

    Args:
        path: Location of the file to read.
        multi_line: When truthy, return the content as a list of lines with
            the trailing newline removed from each line. Otherwise return the
            whole content as one string.

    Returns:
        A list of str when ``multi_line`` is truthy, else a single str.
    """
    with open(path, 'r', encoding='latin-1') as f:
        if multi_line:
            # Stream the handle line by line; no need to build the
            # readlines() list only to loop over it a second time.
            return [line.rstrip('\n') for line in f]
        return f.read()
    
def read_image_file(path):
    """Load an image from disk as a 3-channel RGB array.

    The second argument of ``cv2.imread`` is a read-mode flag, not a colour
    conversion code, so the file is decoded with ``cv2.IMREAD_COLOR`` (OpenCV
    returns BGR channel order) and then converted explicitly to RGB.

    Args:
        path: Location of the image file.

    Returns:
        The decoded image in RGB channel order, or ``None`` when OpenCV could
        not read the file (``cv2.imread`` signals failure by returning None).
    """
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    if image is None:
        # Keep imread's "None on failure" contract instead of letting
        # cvtColor raise on a missing/corrupt file.
        return None
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

def read_labels_file(path):
    """Parse a label file into a DataFrame.

    Fields are separated by either a run of whitespace or a comma, so one
    regular expression is used as the delimiter (which requires pandas'
    python parsing engine).

    Args:
        path: Location of the label file.

    Returns:
        pandas.DataFrame with one row per line of the file; the first line is
        used as the header.
    """
    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    dataframe = pd.read_csv(path, sep=r"\s+|,", engine="python")
    return dataframe

In [4]:
# IMAGE_SIZE = (224, 224)
# def read_image_file(path):
#     image = cv2.imread(path, cv2.COLOR_BGR2RGB)
# #     image = np.array(image)#.astype('float32')
# #     image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
# #     image /= 255
# #     print(image.shape)
# #     ax = plt.subplot(1,2,1)
# #     plt.imshow(image)
# #     image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
# #     ax = plt.subplot(1,2,2)
# #     plt.imshow(image)
# #     print(image.shape)
#     return image
# read_image_file('../input/mvsasingle/MVSA_Single/data/10.jpg')      

In [5]:
# Each sample in the MVSA-Multiple dataset is labelled by 3 annotators.
# We keep the (text, image) label pair that at least 2 of the 3 annotators agree on; samples without such a majority are marked 'invalid' so they can be removed later.
def valid_label(dataframe):
    """Collapse three annotators' (text, image) label pairs into one per sample.

    Column 0 of ``dataframe`` is read as the sample ID and columns 1-2, 3-4
    and 5-6 as the label pairs given by the three annotators. A pair is kept
    when it occurs more than once among the three; otherwise both labels of
    the sample are set to 'invalid'.

    Args:
        dataframe: Frame laid out positionally as described above.

    Returns:
        pandas.DataFrame with columns ['ID', 'text', 'image'], one row per
        input row, in the same order.
    """
    sample_ids = list(dataframe.iloc[:, 0])
    votes_per_annotator = [
        list(dataframe.iloc[:, first:first + 2].itertuples(index=False, name=None))
        for first in (1, 3, 5)
    ]

    resolved = []
    for row, first_vote in enumerate(votes_per_annotator[0]):
        ballot = [first_vote, votes_per_annotator[1][row], votes_per_annotator[2][row]]
        # With three votes, at most one distinct pair can occur more than once.
        agreed = [vote for vote in ballot if ballot.count(vote) > 1]

        if agreed:
            text_label, image_label = agreed[0][0], agreed[0][1]
        else:
            text_label = image_label = 'invalid'
        resolved.append((sample_ids[row], text_label, image_label))

    return pd.DataFrame(resolved, columns=['ID', 'text', 'image'])

def ground_truth_label(text_label, image_label):
    """Combine a text label and an image label into one sample-level label.

    Rules:
        * both labels equal and one of positive / negative / neutral
          -> that label;
        * exactly one label is 'neutral' -> the other label wins;
        * anything else (e.g. positive vs. negative, or a pair already marked
          'invalid') -> 'invalid'.

    Args:
        text_label: Sentiment label assigned to the text.
        image_label: Sentiment label assigned to the image.

    Returns:
        The combined label as a str.
    """
    if text_label == image_label and text_label in ('positive', 'negative', 'neutral'):
        return text_label
    # From here on the two labels cannot both be 'neutral', so a neutral side
    # simply defers to the other one. (The original condition joined its
    # second clause with `or`, which made it always true and let conflicting
    # positive/negative pairs through with the text label.)
    if text_label == 'neutral':
        return image_label
    if image_label == 'neutral':
        return text_label
    return 'invalid'

In [6]:
def create_labels(path, multiple=False):
    """Build the array of sample-level labels from a label file.

    Args:
        path: Location of the label file, parsed with read_labels_file.
        multiple: Set to True for a file holding three annotators' label
            pairs per row; the rows are first reduced to a single pair with
            valid_label.

    Returns:
        1-D numpy object array with one combined label (see
        ground_truth_label) per row of the label file, in file order.
    """
    dataframe = read_labels_file(path)

    if multiple:
        dataframe = valid_label(dataframe)

    # Walk the two label columns in lockstep; no per-row Series is needed.
    labels = [
        ground_truth_label(text_label, image_label)
        for text_label, image_label in zip(dataframe['text'], dataframe['image'])
    ]
    return np.array(labels, dtype='object')

def create_text_data(path):
    """Read every text file in a directory, ordered by numeric file ID.

    Files whose name ends in 'txt' are read with read_text_file; the part of
    the file name before the first '.' is parsed as an integer ID and used
    only for sorting. All other directory entries are skipped.

    Args:
        path: Directory containing the files.

    Returns:
        numpy array of the file contents (trailing newlines removed), sorted
        by ascending ID.

    Raises:
        ValueError: If a file ending in 'txt' does not have an integer ID
            before its first '.'.
    """
    texts = []

    print('Read text data')
    # read data along with its filename as ID
    for filename in tqdm(os.listdir(path)):
        # Check the extension *before* parsing the ID so that unrelated
        # entries with non-numeric names cannot crash the whole load.
        if not filename.endswith('txt'):
            continue
        ID = int(filename.split('.')[0])
        file_path = os.path.join(path, filename)
        texts.append((ID, read_text_file(file_path)))

    # Sort data by its ID
    texts.sort(key=lambda item: item[0])

    # return data without ID
    return np.array([text.rstrip('\n') for _, text in texts])

def create_image_data(path):
    """Read every image file in a directory, ordered by numeric file ID.

    Files whose name ends in 'jpg' are read with read_image_file; the part of
    the file name before the first '.' is parsed as an integer ID and used
    only for sorting. All other directory entries are skipped.

    Args:
        path: Directory containing the files.

    Returns:
        1-D numpy object array holding one entry per image (whatever
        read_image_file returned for it), sorted by ascending ID.

    Raises:
        ValueError: If a file ending in 'jpg' does not have an integer ID
            before its first '.'.
    """
    images = []

    print('Read image data')
    # read data along with its filename as ID
    for filename in tqdm(os.listdir(path)):
        # Check the extension *before* parsing the ID so that unrelated
        # entries with non-numeric names cannot crash the whole load.
        if not filename.endswith('jpg'):
            continue
        ID = int(filename.split('.')[0])
        file_path = os.path.join(path, filename)
        images.append((ID, read_image_file(file_path)))

    # Sort data by its ID
    images.sort(key=lambda item: item[0])

    # return data without ID
    # Fill a pre-allocated 1-D object array element by element. Passing the
    # list straight to np.array(..., dtype='object') makes numpy try to stack
    # the images: equal shapes give an N-D array instead of one entry per
    # image, and partially matching shapes raise a broadcast error.
    result = np.empty(len(images), dtype='object')
    for position, (_, image) in enumerate(images):
        result[position] = image

    return result

In [7]:
def save_text_file(filename, lines):
    """Write a sequence of strings to a latin1-encoded file, one per line.

    The entries are joined with a newline character; no newline is added
    after the last entry. An existing file is overwritten.

    Args:
        filename: Destination path.
        lines: Iterable of str to write.
    """
    with open(filename, 'w', encoding='latin1') as out:
        payload = '\n'.join(lines)
        out.write(payload)

In [8]:
# Load the MVSA-Single texts (ordered by numeric file ID) and build one
# combined label per row of the label file (in file order).
# NOTE(review): texts and labels are paired purely by position further down;
# this assumes the label file lists the same IDs in ascending order — confirm.
mvsa_single_texts = create_text_data(mvsa_single_data_path)
# mvsa_single_images = create_image_data(mvsa_single_data_path)
mvsa_single_labels = create_labels(mvsa_single_label_path)

Read text data


100%|██████████| 9738/9738 [00:30<00:00, 319.84it/s]


In [9]:
# Load the MVSA-Multiple texts (ordered by numeric file ID) and build one
# combined label per row of the label file; multiple=True first reduces the
# three annotators' label pairs to a single pair via valid_label.
# NOTE(review): texts and labels are paired purely by position further down;
# this assumes the label file lists the same IDs in ascending order — confirm.
mvsa_multiple_texts = create_text_data(mvsa_multiple_data_path)
# mvsa_multiple_images = create_image_data(mvsa_multiple_data_path)
mvsa_multiple_labels = create_labels(mvsa_multiple_label_path, multiple=True)

Read text data


100%|██████████| 39200/39200 [02:00<00:00, 326.62it/s]


In [10]:
def invalid_indices(labels):
    """Return the positions at which ``labels`` holds the value 'invalid'.

    Args:
        labels: Array of labels; its first dimension is scanned.

    Returns:
        List of int positions, in ascending order.
    """
    flagged = []
    for position in range(labels.shape[0]):
        if labels[position] == 'invalid':
            flagged.append(position)
    return flagged

def remove_invalid(data, labels):
    """Drop the entries of ``data`` whose label is 'invalid'.

    The positions to drop are those reported by invalid_indices(labels), so
    ``data`` and ``labels`` are expected to be aligned by position.

    Args:
        data: Array to filter.
        labels: Array of labels aligned with ``data``.

    Returns:
        A new array without the entries at the invalid positions
        (np.delete is called without an axis).
    """
    positions_to_drop = invalid_indices(labels)
    return np.delete(data, positions_to_drop)

In [11]:
# Drop every sample whose combined label is 'invalid'. The same label array
# drives the filtering of both the texts and the labels themselves, so the
# two resulting arrays stay aligned by position.
mvsa_single_texts_valid = remove_invalid(mvsa_single_texts, mvsa_single_labels)
mvsa_multiple_texts_valid = remove_invalid(mvsa_multiple_texts, mvsa_multiple_labels)
mvsa_single_labels_valid = remove_invalid(mvsa_single_labels, mvsa_single_labels)
mvsa_multiple_labels_valid = remove_invalid(mvsa_multiple_labels, mvsa_multiple_labels)

In [12]:
# Persist the filtered texts and labels to the working directory, one entry
# per line (save_text_file joins the entries with '\n').
save_text_file('./mvsa-single-texts.txt', mvsa_single_texts_valid)
save_text_file('./mvsa-multiple-texts.txt', mvsa_multiple_texts_valid)
save_text_file('./mvsa-single-labels.txt', mvsa_single_labels_valid)
save_text_file('./mvsa-multiple-labels.txt', mvsa_multiple_labels_valid)

In [13]:
# Round-trip check: reload the four exported files and confirm that each one
# matches the in-memory array it was written from.
mvsa_single_texts_loaded = read_text_file('./mvsa-single-texts.txt', multi_line=True)
mvsa_multiple_texts_loaded = read_text_file('./mvsa-multiple-texts.txt', multi_line=True)

mvsa_single_labels_loaded = read_text_file('./mvsa-single-labels.txt', multi_line=True)
mvsa_multiple_labels_loaded = read_text_file('./mvsa-multiple-labels.txt', multi_line=True)

# np.array_equal also compares the shapes, so a file with a different number
# of lines prints False here instead of failing inside an element-wise `==`.
print(np.array_equal(mvsa_single_texts_valid, mvsa_single_texts_loaded))
print(np.array_equal(mvsa_multiple_texts_valid, mvsa_multiple_texts_loaded))

print(np.array_equal(mvsa_single_labels_valid, mvsa_single_labels_loaded))
print(np.array_equal(mvsa_multiple_labels_valid, mvsa_multiple_labels_loaded))

True
True
True
True


In [14]:
############
# df_labels = read_labels_file('../input/mvsasingle/MVSA_Single/labelResultAll.txt')
# df_labels['text'].count
# label_text_count = df_labels.value_counts('text')
# label_image_count = df_labels.value_counts('image')
# label_text_count

In [15]:
# text_ids = []
# image_ids = []
# for i in range(len(texts)):
#     if texts[i][0] != images[i][0] != read_labels_file(label_path)['ID'][i]:
#         print('here')

In [16]:
# u, c, i = np.unique(texts, return_index=True, return_counts=True, axis=1)
# dup = u[c > 1]
# u[:10]

In [17]:
# def display_sample(text, image):
#     plt.imshow(image)
#     print('Text:', text)
#     plt.show()
# display_sample(texts_with_id[2][1], images_with_id[2][1])

In [18]:
# shapes = []
# for image in images_with_id:
#     shapes.append(image.shape)
# shapes

In [19]:
# raw_texts[raw_texts.duplicated('Text')]

In [20]:
# len_split = lambda x: len(x.split())
# print()
# print('Min number of words in text:', raw_texts['Text'].apply(len_split).min())
# print('Max number of words in text:', raw_texts['Text'].apply(len_split).max())
# print('Average number of words in text:', round(raw_texts['Text'].apply(len_split).mean()))