In [34]:
import json 
import os
import collections
import tensorflow as tf 
import re
import h5py
import argparse
import sys 
import numpy as np 
import pandas as pd
import pickle

# Preprocessing for Q/A module

In [28]:
# Module-level placeholder; nothing in the visible cells assigns it another value.
# NOTE(review): presumably meant to hold parsed argparse flags - confirm or remove.
FLAGS = None

# Reserved vocabulary entries. A token's position in this list is its integer
# id, so the order must not change once data has been encoded.
BUFFER_TOKENS = [
    '<NULL>',
    '<START>',
    '<END>',
    '<UNK>',
]

In [29]:
def _parse_sentence(s):
    s = s.replace('.', '')
    s = s.replace(',', '')
    s = s.replace('"', '')
    s = s.replace("'", '')
    s = s.replace("?", '')
    s = s.lower()
    s = re.sub("\s\s+", " ", s)
    s = s.split(' ')
    return s

In [30]:
def _create_init_dic(filename):
    """Load a question/answer CSV and attach tokenized text columns.

    The file is read without a header row; its first four columns are kept
    and named SerialNo, ImageNo, Question and Answer. Only rows whose
    question ends with '?' are retained.

    Args:
        filename: path of the CSV file to read.

    Returns:
        A DataFrame with the four columns above plus Q_parsed and A_parsed,
        holding the _parse_sentence() token list of each question and
        answer. Row labels are the ones assigned when the file was read;
        they are not renumbered after filtering.
    """
    df = pd.read_csv(filename, header=None)
    df = df[[0, 1, 2, 3]]
    df.columns = ['SerialNo', 'ImageNo', 'Question', 'Answer']
    # Vectorized suffix test. na=False treats a missing question as "does not
    # end with '?'", and an empty question is simply False, where indexing the
    # last character by hand would raise.
    is_question = df['Question'].str.endswith('?', na=False)
    # Take an explicit copy so the column assignments below write to an
    # independent frame instead of triggering SettingWithCopyWarning.
    df = df[is_question].copy()
    df['Q_parsed'] = [_parse_sentence(s) for s in df['Question']]
    df['A_parsed'] = [_parse_sentence(s) for s in df['Answer']]
    return df
# Training split: question/answer pairs from the VQAMed2018Train CSV.
train_qa_csv = "/home/it/8sem/AMP/vqa assignment/VQAMed2018Train/VQAMed2018Train-QA.csv"
dic_df = _create_init_dic(train_qa_csv)

# Validation split (VQAMed2018Valid CSV).
# NOTE(review): later cells save this frame under a "test" file name.
valid_qa_csv = "/home/it/8sem/AMP/vqa assignment/VQAMed2018Valid/VQAMed2018Valid-QA.csv"
dic_v_df = _create_init_dic(valid_qa_csv)

In [31]:
# Flatten every training row into one word list: the row's answer tokens
# followed by its question tokens, in row order.
# extend() appends in place; rebuilding the list with `+` on every iteration
# copies everything gathered so far and is quadratic in the corpus size.
list_of_all_words = []
for answer_tokens, question_tokens in zip(dic_df['A_parsed'], dic_df['Q_parsed']):
    list_of_all_words.extend(answer_tokens)
    list_of_all_words.extend(question_tokens)

In [32]:
# Tally how often each word occurs in the training questions and answers.
counter = collections.Counter()
counter.update(list_of_all_words)

# Number of distinct words seen.
TOTAL_VOCAB = len(counter)

# Every (word, count) pair, most frequent first.
vocab = counter.most_common(TOTAL_VOCAB)

In [33]:
## create word_to_idx, and idx_to_word
# Take the words straight from `counter` rather than rewriting `vocab` from
# itself: `vocab = [i[0] for i in vocab]` turns the word list into a list of
# first characters if this cell is ever run a second time.
vocab = [word for word, _count in counter.most_common(TOTAL_VOCAB)]

# Ids 0..len(BUFFER_TOKENS)-1 are the reserved tokens; corpus words follow in
# descending frequency order.
word_to_idx = {}
idx_to_word = {}
for idx, token in enumerate(BUFFER_TOKENS + vocab):
    word_to_idx[token] = idx
    idx_to_word[idx] = token


In [None]:
# Persist both lookup tables in the current working directory. The context
# managers guarantee each file is flushed and closed, even if dump() raises.
with open('word_to_idx.pkl', 'wb') as word_to_idx_file:
    pickle.dump(word_to_idx, word_to_idx_file)
with open('idx_to_word.pkl', 'wb') as idx_to_word_file:
    pickle.dump(idx_to_word, idx_to_word_file)

In [35]:
# Encoded sequences are padded up to PADDING_LEN - 1 entries (see below).
PADDING_LEN = 17


def _convert_sentence_to_numbers(s):
    """Encode a tokenized sentence as a list of vocabulary ids.

    The ids are looked up in word_to_idx (unknown words map to the <UNK>
    id), wrapped in <START> ... <END>, and right-padded with the <NULL> id
    until the list holds PADDING_LEN - 1 entries.

    Args:
        s: the sentence as a list of word strings.

    Returns:
        A list of ints. A sentence whose framed form is already
        PADDING_LEN - 1 entries or longer comes back unpadded and
        untruncated, so callers must check the length themselves.
    """
    unk_id = BUFFER_TOKENS.index('<UNK>')
    null_id = BUFFER_TOKENS.index('<NULL>')
    end_id = BUFFER_TOKENS.index('<END>')
    start_id = BUFFER_TOKENS.index('<START>')

    framed = [start_id]
    framed.extend(word_to_idx.get(word, unk_id) for word in s)
    framed.append(end_id)

    # A negative count multiplies out to an empty list, i.e. no padding.
    missing = PADDING_LEN - 1 - len(framed)
    return framed + [null_id] * missing

In [36]:
# Encode the training answers and questions, keeping only rows whose encoded
# answer AND encoded question are exactly PADDING_LEN - 1 ids long (longer
# sentences come back unpadded from _convert_sentence_to_numbers).
encoded_len = PADDING_LEN - 1

all_answers = [_convert_sentence_to_numbers(s) for s in dic_df['A_parsed']]  # list of numbers
# Positions (not labels) within dic_df of the rows that pass the answer filter.
valid_rows1 = [pos for pos, row in enumerate(all_answers) if len(row) == encoded_len]
# .copy() yields an independent frame, so adding a column does not raise
# SettingWithCopyWarning and dic_df itself is never touched.
df_final = dic_df.iloc[valid_rows1, :].copy()
df_final['A_Encoded'] = [all_answers[pos] for pos in valid_rows1]

all_questions = [_convert_sentence_to_numbers(s) for s in df_final['Q_parsed']]
# Positions within the answer-filtered frame, i.e. applied after valid_rows1.
valid_rows2 = [pos for pos, row in enumerate(all_questions) if len(row) == encoded_len]
df_final = df_final.iloc[valid_rows2, :].copy()
df_final['Q_Encoded'] = [all_questions[pos] for pos in valid_rows2]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


In [40]:
# Preview the first five rows of the encoded training frame.
df_final.head(n=5)

Unnamed: 0,SerialNo,ImageNo,Question,Answer,Q_parsed,A_parsed,A_Encoded,Q_Encoded
0,1,rjv03401,what does mri show?,lesion at tail of pancreas,"[what, does, mri, show]","[lesion, at, tail, of, pancreas]","[1, 26, 35, 272, 6, 129, 2, 0, 0, 0, 0, 0, 0, ...","[1, 5, 7, 15, 9, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
1,2,AIAN-14-313-g002,where does axial section mri abdomen show hypo...,in distal pancreas,"[where, does, axial, section, mri, abdomen, sh...","[in, distal, pancreas]","[1, 10, 290, 129, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[1, 25, 7, 23, 54, 15, 31, 9, 886, 19, 2, 0, 0..."
2,3,wjem-11-76f3,what do the arrows denote in the noncontrast c...,complex fluid collection with layering consist...,"[what, do, the, arrows, denote, in, the, nonco...","[complex, fluid, collection, with, layering, c...","[1, 492, 76, 93, 16, 3051, 343, 16, 109, 2, 0,...","[1, 5, 84, 4, 354, 3052, 10, 4, 387, 8, 14, 20..."
3,4,ccr30002-0045-f3,what was normal?,blood supply to the brain,"[what, was, normal]","[blood, supply, to, the, brain]","[1, 668, 1658, 33, 4, 28, 2, 0, 0, 0, 0, 0, 0,...","[1, 5, 152, 130, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
4,5,rjt01904,what shows evidence of a contained rupture?,repeat ct scan of the abdomen,"[what, shows, evidence, of, a, contained, rupt...","[repeat, ct, scan, of, the, abdomen]","[1, 887, 8, 14, 6, 4, 31, 2, 0, 0, 0, 0, 0, 0,...","[1, 5, 12, 181, 6, 13, 1659, 440, 2, 0, 0, 0, ..."


In [38]:
# Encode the validation answers and questions with the training vocabulary,
# keeping only rows whose encoded answer AND encoded question are exactly
# PADDING_LEN - 1 ids long.
encoded_len = PADDING_LEN - 1

all_answers = [_convert_sentence_to_numbers(s) for s in dic_v_df['A_parsed']]  # list of numbers
# Positions (not labels) within dic_v_df of the rows that pass the answer filter.
valid_rows3 = [pos for pos, row in enumerate(all_answers) if len(row) == encoded_len]
# .copy() yields an independent frame, so adding a column does not raise
# SettingWithCopyWarning and dic_v_df itself is never touched.
df_v_final = dic_v_df.iloc[valid_rows3, :].copy()
df_v_final['A_Encoded'] = [all_answers[pos] for pos in valid_rows3]

all_questions = [_convert_sentence_to_numbers(s) for s in df_v_final['Q_parsed']]
# Positions within the answer-filtered frame, i.e. applied after valid_rows3.
valid_rows4 = [pos for pos, row in enumerate(all_questions) if len(row) == encoded_len]
df_v_final = df_v_final.iloc[valid_rows4, :].copy()
df_v_final['Q_Encoded'] = [all_questions[pos] for pos in valid_rows4]

In [39]:
# Preview the first five rows of the encoded validation frame.
df_v_final.head(n=5)

Unnamed: 0,SerialNo,ImageNo,Question,Answer,Q_parsed,A_parsed,A_Encoded,Q_Encoded
0,1,ATM-02-80-g002,what does thorax ct show?,regression in the infiltrations,"[what, does, thorax, ct, show]","[regression, in, the, infiltrations]","[1, 1032, 10, 4, 2077, 2, 0, 0, 0, 0, 0, 0, 0,...","[1, 5, 7, 125, 8, 9, 2, 0, 0, 0, 0, 0, 0, 0, 0..."
1,2,AJNS-8-48-g001,where does the ct scan show the hematoma?,left parietal area,"[where, does, the, ct, scan, show, the, hematoma]","[left, parietal, area]","[1, 18, 216, 86, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 25, 7, 4, 8, 14, 9, 4, 109, 2, 0, 0, 0, 0,..."
2,3,1477-7819-2-41-1,what does the ct scan show?,a paraesophageal and retrotracheal mass,"[what, does, the, ct, scan, show]","[a, paraesophageal, and, retrotracheal, mass]","[1, 13, 1555, 11, 3, 19, 2, 0, 0, 0, 0, 0, 0, ...","[1, 5, 7, 4, 8, 14, 9, 2, 0, 0, 0, 0, 0, 0, 0, 0]"
3,4,iranjradiol-10-99-g001,what does the mri demonstrate?,non mass-like enhancement,"[what, does, the, mri, demonstrate]","[non, mass-like, enhancement]","[1, 123, 3, 50, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 5, 7, 4, 15, 39, 2, 0, 0, 0, 0, 0, 0, 0, 0..."
4,5,Tanaffos-10-072-g001,what does the ct scan of the chest show?,minimal basilar atelectasis,"[what, does, the, ct, scan, of, the, chest, show]","[minimal, basilar, atelectasis]","[1, 578, 705, 1250, 2, 0, 0, 0, 0, 0, 0, 0, 0,...","[1, 5, 7, 4, 8, 14, 6, 4, 38, 9, 2, 0, 0, 0, 0..."


In [None]:
# Persist the preprocessed training frame (written to the working directory).
train_df_pickle = "train_df_final.pkl"
df_final.to_pickle(train_df_pickle)

In [22]:
# Persist the preprocessed held-out frame (written to the working directory).
# NOTE(review): df_v_final is built from the Valid CSV but saved as "test".
heldout_df_pickle = "test_df_v_final.pkl"
df_v_final.to_pickle(heldout_df_pickle)

# Image Related

In [None]:
# Load the preprocessed training image features and keep only the rows that
# survived the answer and question length filters applied to the training frame.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
# The context manager closes the input file as soon as it has been read.
with open('/home/it/8sem/AMP/image_feature_train.pkl', 'rb') as feature_file:
    features = pickle.load(feature_file)
features = np.array(features)
# NOTE(review): assumes row k of the feature array belongs to row k of dic_df
# (i.e. after non-question rows were dropped) - TODO confirm.
features = features[valid_rows1,]
# valid_rows2 holds positions within the answer-filtered rows, so it has to be
# applied second.
features = features[valid_rows2,]

In [None]:
# Write the filtered training features to the working directory; the context
# manager makes sure the file is flushed and closed.
# NOTE(review): if the working directory is /home/it/8sem/AMP this overwrites
# the file the features were loaded from, so re-running the load cell would
# filter already-filtered data - confirm.
with open('image_feature_train.pkl', 'wb') as feature_out:
    pickle.dump(features, feature_out)

In [24]:
# Load the preprocessed held-out image features and keep only the rows that
# survived the answer and question length filters applied to the validation frame.
# NOTE: pickle.load can execute arbitrary code - only open trusted files.
# The context manager closes the input file as soon as it has been read.
with open('/home/it/8sem/AMP/image_feature_test.pkl', 'rb') as feature_file:
    features = pickle.load(feature_file)
features = np.array(features)
# NOTE(review): assumes row k of the feature array belongs to row k of
# dic_v_df (i.e. after non-question rows were dropped) - TODO confirm.
features = features[valid_rows3,]
# valid_rows4 holds positions within the answer-filtered rows, so it has to be
# applied second.
features = features[valid_rows4,]

In [25]:
# Write the filtered held-out features to the working directory; the context
# manager makes sure the file is flushed and closed.
# NOTE(review): if the working directory is /home/it/8sem/AMP this overwrites
# the file the features were loaded from, so re-running the load cell would
# filter already-filtered data - confirm.
with open('image_feature_test.pkl', 'wb') as feature_out:
    pickle.dump(features, feature_out)