In [1]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm
from  matplotlib import pyplot as plt

In [2]:
# Input locations for the two MVSA datasets: for each one, the directory that
# is listed for data files and the path of its labelResultAll.txt label file.
mvsa_single_data_path = '../input/mvsasingle/MVSA_Single/data'
mvsa_single_label_path = '../input/mvsasingle/MVSA_Single/labelResultAll.txt'
mvsa_multiple_data_path = '../input/mvsamultiple/MVSA/data'
mvsa_multiple_label_path = '../input/mvsamultiple/MVSA/labelResultAll.txt'

In [28]:
def read_text_file(path, multi_line=False):
    """Read a latin-1 encoded text file.

    Parameters
    ----------
    path : path of the file to open.
    multi_line : when it compares equal to ``True`` the content is returned as
        a list of lines, each with its trailing ``'\\n'`` removed; otherwise the
        whole content is returned as a single string.
    """
    want_lines = (multi_line == True)
    with open(path, 'r', encoding='latin-1') as handle:
        if not want_lines:
            return handle.read()
        # Iterating the handle yields the same lines as readlines().
        return [raw_line.rstrip('\n') for raw_line in handle]
    
def read_image_file(path):
    """Load an image from disk as a 3-channel RGB array.

    ``cv2.imread`` expects an ``IMREAD_*`` flag; ``cv2.COLOR_BGR2RGB`` is a
    ``cv2.cvtColor`` conversion code and does not reorder channels when passed
    to ``imread``. The image is therefore decoded in OpenCV's BGR order first
    and converted to RGB explicitly.

    Parameters
    ----------
    path : path of the image file.

    Returns
    -------
    The RGB image array, or ``None`` when OpenCV cannot read the file
    (``cv2.imread`` signals failure by returning ``None``).
    """
    image = cv2.imread(path, cv2.IMREAD_COLOR)
    if image is None:
        # Keep the "None on failure" contract instead of crashing in cvtColor.
        return None
    return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

def read_labels_file(path):
    """Read a label file into a DataFrame.

    Fields are separated by any run of whitespace or by a comma, so a line such
    as ``1<TAB>neutral,positive`` becomes three columns.

    Parameters
    ----------
    path : path of the label file.

    Returns
    -------
    pandas.DataFrame parsed with the file's first line as header.
    """
    # Raw string: "\s" in a normal string literal is an invalid escape sequence.
    # A regex separator requires the python parsing engine.
    return pd.read_csv(path, sep=r"\s+|,", engine="python")

In [4]:
# IMAGE_SIZE = (224, 224)
# def read_image_file(path):
#     image = cv2.imread(path, cv2.COLOR_BGR2RGB)
# #     image = np.array(image)#.astype('float32')
# #     image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
# #     image /= 255
# #     print(image.shape)
# #     ax = plt.subplot(1,2,1)
# #     plt.imshow(image)
# #     image = cv2.resize(image, IMAGE_SIZE, interpolation = cv2.INTER_AREA)
# #     ax = plt.subplot(1,2,2)
# #     plt.imshow(image)
# #     print(image.shape)
#     return image
# read_image_file('../input/mvsasingle/MVSA_Single/data/10.jpg')      

In [5]:
# Each sample in the MVSA-Multiple dataset is labelled by 3 annotators.
# A (text, image) label pair is accepted only when at least 2 of the 3 annotators gave the same pair; otherwise the sample gets no valid pair.
def valid_label(dataframe):
    """Reduce three annotators' (text, image) label pairs to one pair per row.

    Column 0 is read as the sample ID; columns 1-2, 3-4 and 5-6 are read as the
    (text, image) pairs of annotators 1, 2 and 3. A pair is accepted when at
    least two annotators gave exactly the same pair.

    Parameters
    ----------
    dataframe : DataFrame with at least the seven columns described above.

    Returns
    -------
    DataFrame with columns ``ID``, ``text`` and ``image``, one row per input
    row. Rows without an agreed pair carry ``None`` in both label columns.
    """
    sample_ids = dataframe.iloc[:, 0].tolist()
    annotator_pairs = zip(
        dataframe.iloc[:, 1:3].itertuples(index=False, name=None),
        dataframe.iloc[:, 3:5].itertuples(index=False, name=None),
        dataframe.iloc[:, 5:7].itertuples(index=False, name=None),
    )

    rows = []
    for sample_id, pairs in zip(sample_ids, annotator_pairs):
        # First pair that occurs more than once among the three annotators.
        agreed = next((pair for pair in pairs if pairs.count(pair) > 1), None)
        if agreed is None:
            # Emit all three fields: a shorter tuple does not match the three
            # column names when no row in the frame has an agreed pair.
            rows.append((sample_id, None, None))
        else:
            rows.append((sample_id, agreed[0], agreed[1]))

    return pd.DataFrame(rows, columns=['ID', 'text', 'image'])

def ground_truth_label(text_label, image_label):
    """Combine a text label and an image label into one sample label.

    Rules:
    * both labels are the same sentiment ('positive', 'negative' or
      'neutral') -> that sentiment;
    * exactly one label is 'neutral' -> the other label is returned as-is;
    * anything else (e.g. 'positive' vs 'negative', or missing labels)
      -> ``None``.

    Parameters
    ----------
    text_label : label assigned to the text.
    image_label : label assigned to the image.

    Returns
    -------
    The combined label, or ``None`` when the two labels conflict.
    """
    if text_label == image_label:
        if text_label in ('positive', 'negative', 'neutral'):
            return text_label
        return None
    # From here on the two labels differ, so at most one of them is 'neutral'.
    if text_label == 'neutral':
        return image_label
    if image_label == 'neutral':
        return text_label
    # Neither label is neutral and they disagree: conflicting sentiments.
    return None

In [12]:
def create_labels(path, multiple=False):
    """Build the array of per-sample labels from a label file.

    Parameters
    ----------
    path : path of the label file, read with ``read_labels_file``.
    multiple : when true, the frame is first reduced with ``valid_label``
        (several annotators per sample) before labels are combined.

    Returns
    -------
    1-D numpy object array with one ``ground_truth_label`` result per row of
    the (possibly reduced) label frame, in row order.
    """
    dataframe = read_labels_file(path)

    if multiple:
        dataframe = valid_label(dataframe)

    # Iterate the two label columns directly instead of building a Series per
    # row with iterrows().
    labels = [
        ground_truth_label(text_label, image_label)
        for text_label, image_label in zip(dataframe['text'], dataframe['image'])
    ]
    return np.array(labels, dtype='object')

def create_text_data(path):
    """Read every ``<ID>.txt`` file in a directory, ordered by numeric ID.

    Parameters
    ----------
    path : directory to list.

    Returns
    -------
    numpy array of the file contents (trailing newlines stripped), sorted by
    the integer ID taken from each filename.

    Raises
    ------
    ValueError : if a ``.txt`` filename does not start with an integer ID.
    """
    texts = []

    print('Read text data')
    # read data along with its filename as ID
    for filename in tqdm(os.listdir(path)):
        # Check the extension before parsing the ID so other files in the
        # directory (images, hidden files, ...) are skipped instead of
        # crashing int().
        if not filename.endswith('.txt'):
            continue
        ID = int(filename.split('.')[0])
        text = read_text_file(os.path.join(path, filename))
        texts.append((ID, text))

    # Sort data by its ID
    texts.sort(key=lambda item: item[0])

    # return data without ID
    return np.array([text.rstrip('\n') for _, text in texts])

def create_image_data(path):
    """Read every ``<ID>.jpg`` file in a directory, ordered by numeric ID.

    Parameters
    ----------
    path : directory to list.

    Returns
    -------
    1-D numpy object array whose elements are the values returned by
    ``read_image_file``, sorted by the integer ID taken from each filename.

    Raises
    ------
    ValueError : if a ``.jpg`` filename does not start with an integer ID.
    """
    images = []

    print('Read image data')
    # read data along with its filename as ID
    for filename in tqdm(os.listdir(path)):
        # Check the extension before parsing the ID so other files in the
        # directory are skipped instead of crashing int().
        if not filename.endswith('.jpg'):
            continue
        ID = int(filename.split('.')[0])
        image = read_image_file(os.path.join(path, filename))
        images.append((ID, image))

    # Sort data by its ID
    images.sort(key=lambda item: item[0])

    # return data without ID.
    # Fill a pre-allocated object array element by element:
    # np.array(list_of_arrays, dtype=object) tries to merge matching
    # dimensions, which yields an N-D array for equally sized images and can
    # raise a broadcast error when only the leading dimension matches.
    result = np.empty(len(images), dtype='object')
    for position, (_, image) in enumerate(images):
        result[position] = image

    return result

In [7]:
def save_text_file(filename, lines, newline=False):
    """Write strings to a latin1 encoded text file, replacing its content.

    Parameters
    ----------
    filename : path of the file to write.
    lines : iterable of strings.
    newline : when it compares equal to ``True`` the strings are joined with
        ``'\\n'`` (no trailing newline); otherwise they are written back to
        back exactly as given.
    """
    join_with_newline = (newline == True)
    with open(filename, 'w', encoding='latin1') as out:
        if not join_with_newline:
            out.writelines(lines)
            return
        out.write('\n'.join(lines))

In [13]:
# Load the MVSA-Single texts (sorted by file ID) and build one label per row
# of the label file. Image loading is currently disabled.
mvsa_single_texts = create_text_data(mvsa_single_data_path)
# mvsa_single_images = create_image_data(mvsa_single_data_path)
mvsa_single_labels = create_labels(mvsa_single_label_path)

Read text data


100%|██████████| 9738/9738 [00:02<00:00, 4157.58it/s]


In [15]:
# Load the MVSA-Multiple texts (sorted by file ID). multiple=True makes
# create_labels reduce the annotators' label pairs with valid_label before
# combining text and image labels. Image loading is currently disabled.
mvsa_multiple_texts = create_text_data(mvsa_multiple_data_path)
# mvsa_multiple_images = create_image_data(mvsa_multiple_data_path)
mvsa_multiple_labels = create_labels(mvsa_multiple_label_path, multiple=True)

Read text data


100%|██████████| 39200/39200 [00:09<00:00, 4145.16it/s]


In [14]:
# Preview the loaded MVSA-Single texts.
mvsa_single_texts

array(['How I feel today #legday #jelly #aching #gym ',
       'grattis min griskulting!!!???? va bara tvungen oki s? sch ? @ingenkommeratttrodig #pig #happybday #wow #lovely #cut¡\xad ',
       'RT @polynminion: The moment I found my favourite tV character. #PROFOUNDLOVE ',
       ...,
       'RT @bookmyshow: #Disney\'s lies - "Every day is a good hair day". Yeah, right!!: http://t.co/351AQVV7gA http://t.co/fFH59wGwbs',
       'RT @MUBMI: TWO MORE DAYS to submit your apps #mizzou19 http://t.co/aqhTljfgIG http://t.co/cftTb6ok5b',
       "RT @khununeos: khun was literally me whenever I gotta see junho's sinful butt- lol (crtto) http://t.co/y3IvvUTRYT"],
      dtype='<U257')

In [16]:
# Preview the loaded MVSA-Multiple texts.
mvsa_multiple_texts

array(['Knocked doors with the venerable #TeamTrudeau #lpc candidate @kylejpeterson this aft in my hometown, Aurora! #elxn42',
       'Canvassing for @ElectKellyYEG #yegfed #elxn42',
       "An NPD gov't would institutionalize mediocrity #elxn42 #polqc",
       ...,
       '@saddlehillsab deadline extended for County #scholarships http://t.co/NF8IPAAOhP',
       'im dead',
       'I usually spend a lot of time at Improv Detroit just staring at my chalkboard, but these faux-Russians are damn funny'],
      dtype='<U144')

In [17]:
# Persist the MVSA-Single texts; newline=True joins them with '\n' (one per line).
save_text_file('./mvsa-single-texts.txt', mvsa_single_texts, newline=True)

In [29]:
# Reload the saved file as a list of lines and check the round trip.
mvsa_single_texts_loaded = read_text_file('./mvsa-single-texts.txt', multi_line=True)
# np.array_equal also handles a length mismatch (returns False), whereas
# `(a == b).all()` relies on `==` producing an element-wise array.
np.array_equal(mvsa_single_texts, mvsa_single_texts_loaded)

True

In [26]:
# Number of lines read back from the saved MVSA-Single text file.
len(mvsa_single_texts_loaded)

4871

In [18]:
# Persist the MVSA-Multiple texts; newline=True joins them with '\n' (one per line).
save_text_file('./mvsa-multiple-texts.txt', mvsa_multiple_texts, newline=True)

In [30]:
# Reload the saved file as a list of lines and check the round trip.
mvsa_multiple_texts_loaded = read_text_file('./mvsa-multiple-texts.txt', multi_line=True)

# np.array_equal also handles a length mismatch (returns False), whereas
# `(a == b).all()` relies on `==` producing an element-wise array.
np.array_equal(mvsa_multiple_texts, mvsa_multiple_texts_loaded)

True

In [27]:
# Element-wise comparison of the reloaded lines against the in-memory texts.
mvsa_multiple_texts_loaded == mvsa_multiple_texts

array([False, False, False, ..., False, False,  True])

In [29]:
# Inspect the lines read back from the saved MVSA-Multiple text file.
mvsa_multiple_texts_loaded

['Knocked doors with the venerable #TeamTrudeau #lpc candidate @kylejpeterson this aft in my hometown, Aurora! #elxn42\n',
 'Canvassing for @ElectKellyYEG #yegfed #elxn42\n',
 "An NPD gov't would institutionalize mediocrity #elxn42 #polqc\n",
 '""I think it\'s time for change"" - Ana Commit to Vote: #GenerationTrudeau #SFU #LPC #elxn42 http://t.co/hv2oIUdXIb\n',
 'The Past and Future of the Refugee Crisis - Thomas Sowell #elxn42 #polqc http://t.co/2KRP2MrMmP\n',
 'Rdy to watch @ThomasMulcair rock it tnight in the @globeandmail debate at @WinnipegNews CafÃ© #NDP #cdnpoli #elxn42\n',
 "Can't wait to vote for @kenthehr and @JustinTrudeau on October 19. #cdnpoli #elxn42 #RealChange Go @TeamHehr\n",
 "Vote for NDP is vote for another Harper. #elxn42 Andrew Thomson, Mulcair's financial adviser http://t.co/e6qTJi5vVH\n",
 'The end of the road and destined for the scrap heap #HarperBus #Canada #cdnpoli #elxn42\n',
 '?? Who is who and does what in the #elxn42 Conservative War Room? #CPC #CDNpol

In [None]:
############
# df_labels = read_labels_file('../input/mvsasingle/MVSA_Single/labelResultAll.txt')
# df_labels['text'].count
# label_text_count = df_labels.value_counts('text')
# label_image_count = df_labels.value_counts('image')
# label_text_count

In [None]:
# text_ids = []
# image_ids = []
# for i in range(len(texts)):
#     if texts[i][0] != images[i][0] != read_labels_file(label_path)['ID'][i]:
#         print('here')

In [None]:
# np.unique returns (unique values, first-occurrence indices, counts) in that
# order, so the counts must be bound to the third name for `c > 1` to select
# values that occur more than once.
# NOTE(review): `texts` is not defined in the cells shown here, and axis=1
# requires a 2-D input — confirm the intended array and axis before running.
u, i, c = np.unique(texts, return_index=True, return_counts=True, axis=1)
dup = u[c > 1]
u[:10]

In [None]:
# Positions whose label is None (labels that conflict between image and text).
remove_indices = [position for position, label in enumerate(labels) if label is None]
# np.delete......

In [None]:
def display_sample(text, image):
    """Show ``image`` with matplotlib and print ``text`` prefixed by 'Text:'."""
    plt.imshow(image)
    print('Text:', text)
    plt.show()
# Element [1] of each entry is passed as the text / image.
# NOTE(review): texts_with_id and images_with_id are not defined in the cells
# shown here — presumably lists of (ID, value) pairs; confirm.
display_sample(texts_with_id[2][1], images_with_id[2][1])

In [None]:
# The display_sample call in the previous cell passes images_with_id[2][1] as
# the image, i.e. each entry is an (ID, image) pair; unpack the pair before
# reading .shape (the pair itself has no shape attribute).
# NOTE(review): images_with_id is not defined in the cells shown here — confirm.
shapes = [image.shape for _, image in images_with_id]
shapes

In [None]:
# Rows whose 'Text' value already occurred in an earlier row.
# NOTE(review): raw_texts is not defined in the cells shown here — confirm its source.
raw_texts[raw_texts.duplicated('Text')]

In [None]:
def len_split(x):
    """Return the number of whitespace-separated tokens in ``x``."""
    return len(x.split())


# Count the words of every text once and reuse the result for all statistics.
# NOTE(review): raw_texts is not defined in the cells shown here — confirm its source.
word_counts = raw_texts['Text'].apply(len_split)
print()
print('Min number of words in text:', word_counts.min())
print('Max number of words in text:', word_counts.max())
print('Average number of words in text:', round(word_counts.mean()))