In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import cv2
import seaborn as sns
from tqdm import tqdm

In [3]:
# Load the preprocessed training table; the path is relative to the notebook's
# working directory.
# NOTE(review): the "Unnamed: 0" column in the rendered output suggests the CSV
# was written together with its index; `index_col=0` would absorb it - confirm
# before changing, since later cells may rely on the current columns.
df = pd.read_csv("data/preprocessed_train.csv")
# Bare expression so the notebook renders the frame (pandas abbreviates long frames).
df

Unnamed: 0,img_id,complexity,question,answer,original,question_class,norm_question,dl_question,norm_answer
0,clb0kvxvm90y4074yf50vf5nq,3,"Are there any abnormalities, polyps, or anatom...",Evidence of oesophagitis is present with no po...,"[\n{\n""q"": ""Are there any abnormalities in the...","['abnormality_presence', 'polyp_type', 'landma...",are there any abnormalities polyps or anatomic...,any abnormality polyp anatomical landmark visi...,evidence of oesophagitis is present with no po...
1,cl8k2u1r71foz083278j63qnm,2,What procedure is depicted in the image and wh...,Evidence of a colonoscopy with a paris iia pol...,"[\n{\n""q"": ""What type of procedure is the imag...","['procedure_type', 'polyp_type']",what procedure is depicted in the image and wh...,what procedure depicted image what type polyp ...,evidence of a colonoscopy with a paris iia pol...
2,cl8k2u1qa1ekz08324rek2qcv,3,"Have all polyps been removed, is there any tex...","Polyps remain present, text is visible, and th...","[\n{\n""q"": ""Have all polyps been removed?"",\n""...","['polyp_removal_status', 'text_presence', 'abn...",have all polyps been removed is there any text...,polyp removed any text present where abnormali...,polyps remain present text is visible and the ...
3,cla820gmss67b071u3h7o5k3t,3,"Are there any surgical instruments, polyps, or...","No surgical instruments or polyps are visible,...","[\n{\n""q"": ""How many instrumnets are in the im...","['instrument_count', 'polyp_count', 'finding_c...",are there any surgical instruments polyps or a...,any surgical instrument polyp abnormal finding...,no surgical instruments or polyps are visible ...
4,clb0kvxvf90l4074y85pi02pq,1,Are there any medical devices visible in the i...,No foreign bodies or instruments identified,"[\n{\n""q"": ""Are there any instruments in the i...",['instrument_presence'],are there any medical devices visible in the i...,any medical device visible image,no foreign bodies or instruments identified
...,...,...,...,...,...,...,...,...,...
143589,clb0lbwybdnz8086u24pid7l7,1,What procedure is depicted in the image?,evidence of colonoscopy procedure,"[\n{\n""q"": ""What type of procedure is the imag...",['procedure_type'],what procedure is depicted in the image,what procedure depicted image,evidence of colonoscopy procedure
143590,clb0lbx03dpyg086u4g58d4qc,2,Are there any green or black box artifacts and...,evidence of green and black box artifacts with...,"[\n{\n""q"": ""Is there a green/black box artefac...","['box_artifact_presence', 'text_presence']",are there any green or black box artifacts and...,any green black box artifact visible text image,evidence of green and black box artifacts with...
143591,clb0kvxwp92k8074ybz5l790g,1,In which regions of the image is the abnormali...,scattered across multiple quadrants including ...,"[\n{\n""q"": ""Where in the image is the abnormal...",['abnormality_location'],in which regions of the image is the abnormali...,region image abnormality located,scattered across multiple quadrants including ...
143592,clb0kvxwo92hs074y8p8h4lm3,2,Identify the anatomical landmark and assess fo...,"No anatomical landmark identified, with eviden...","[\n{\n""q"": ""Where in the image is the anatomic...","['landmark_location', 'abnormality_presence']",identify the anatomical landmark and assess fo...,identify anatomical landmark assess any abnorm...,no anatomical landmark identified with evidenc...


In [91]:
from gensim.models import Word2Vec
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
import nltk
import re

def download_nltk():
    """Download the NLTK data packages this notebook relies on.

    Calls ``nltk.download`` once per package name, in the order listed.
    """
    for package in ('stopwords', 'wordnet', "punkt", "omw-1.4"):
        nltk.download(package)

def sort_dict(x):
    """Return a new dict holding the items of ``x`` ordered by value, largest first.

    The items are sorted ascending (stable) and then reversed, so entries with
    equal values come out in the reverse of their original insertion order.
    """
    ordered = sorted(x.items(), key=lambda item: item[1])
    ordered.reverse()
    return dict(ordered)

def norm_text(text, keep_num = False):
    """Lower-case and clean a sentence, keeping letters and basic punctuation.

    Processing steps, in order:
      1. lower-case the text;
      2. replace "/" with a space and drop "?";
      3. remove every character that is not an ASCII letter, whitespace or one
         of ``. , ! ; :`` (ASCII digits are also kept when ``keep_num`` is true);
      4. collapse runs of whitespace into single spaces and strip the ends.

    Args:
        text: the string to normalise.
        keep_num: when true, ASCII digits survive the character filter; when
            false (default) all digits are removed.

    Returns:
        The normalised string.
    """
    text = text.lower()

    text = text.replace("/", " ")
    text = text.replace("?", "")

    # The character filter below is the single place that decides what survives.
    # Digits have to be whitelisted here: previously they were stripped by this
    # filter even when keep_num=True, so the flag had no effect.
    allowed = r"a-zA-Z.,!?;:\s"
    if keep_num:
        allowed += "0-9"
    text = re.sub("[^" + allowed + "]", "", text)

    return " ".join(text.split())

# Lazily-built NLTK helpers shared by every to_deep_learning() call.
_DL_RESOURCES = {}


def _dl_resources():
    """Build the stop-word set, lemmatizer and stemmer once and reuse them."""
    if not _DL_RESOURCES:
        stop_words = set(stopwords.words("english"))
        # Words taken back out of the stop-word set so they survive filtering.
        white_list = ["what", "when", "where", "why", "any", "how", "if", "more"]
        stop_words.difference_update(white_list)
        # Populate in one step so a failed load leaves the cache empty.
        _DL_RESOURCES.update(
            stop_words=stop_words,
            lemmatizer=WordNetLemmatizer(),
            stemmer=PorterStemmer(),
        )
    return _DL_RESOURCES


# to deep learning format (remove stopwords + lemmatize + stem)
def to_deep_learning(text):
    """Reduce a normalised sentence to filtered, lemmatized and stemmed tokens.

    The text is split into word tokens and single punctuation marks
    (``. , ! ? ; :``), English stop words are dropped (except the white-listed
    ones), and every remaining token is lemmatized and then stemmed.

    The NLTK helpers are created on first use and cached, instead of being
    rebuilt on every call, because this function is applied row by row.

    Args:
        text: the string to convert.

    Returns:
        The processed tokens joined by single spaces.
    """
    resources = _dl_resources()
    stop_words = resources["stop_words"]
    lemmatizer = resources["lemmatizer"]
    stemmer = resources["stemmer"]

    words = re.findall(r"\w+|[.,!?;:]", text)
    words = [w for w in words if w not in stop_words]

    # Lemmatize first, then stem the lemma (same order as two separate passes).
    words = [stemmer.stem(lemmatizer.lemmatize(w)) for w in words]

    return " ".join(words)

class Tokenizer:
    """Whitespace tokenizer that maps the most frequent words to integer ids.

    Id layout after fitting (V = number of words actually kept):
        0        padding (``pad_id``)
        1..V     vocabulary words, most frequent first
        V + 1    unknown word (``unknown_id``)
        V + 2    start-of-sequence marker (``cls_id``)
        V + 3    separator (``sep_id``): replaces punctuation tokens and is
                 appended to the end of every sequence

    Use ``fit_transform`` on the training sentences, then ``transform`` to
    encode new sentences with the already-fitted vocabulary.
    """

    # Punctuation tokens: never counted as vocabulary, always encoded as sep_id.
    SEP_VOCAB = (",", ".", "?", "!", ";", ":")

    # keep the <vocab_size> most frequent words
    def __init__(self, vocab_size, max_length):
        """Store the vocabulary cap and the fixed output length.

        Args:
            vocab_size: maximum number of words to keep when fitting.
            max_length: length every encoded sequence is padded/truncated to.
        """
        self.vocab_size = vocab_size
        self.max_length = max_length
        self.vocab_map = dict({})

        self.cls_id = None
        self.sep_id = None
        self.unknown_id = None
        self.pad_id = 0

    def fit(self, data):
        """Build the vocabulary and the special-token ids from ``data``.

        Args:
            data: iterable of whitespace-separated sentences.

        Returns:
            ``self``, so calls can be chained.

        Note:
            ``vocab_size`` is overwritten with the number of words actually
            kept, so a later refit cannot keep more words than this fit did.
        """
        word_counts = {}
        for sentence in data:
            for word in sentence.split():
                if word not in self.SEP_VOCAB:
                    word_counts[word] = word_counts.get(word, 0) + 1

        # Ascending stable sort followed by a reversal: most frequent first,
        # ties in reverse insertion order (keeps previously produced ids stable).
        ranked = sorted(word_counts.items(), key=lambda item: item[1])
        ranked.reverse()
        vocab = [word for word, _ in ranked[:self.vocab_size]]

        self.vocab_size = len(vocab)
        self.vocab_map = {word: index for index, word in enumerate(vocab, start=1)}

        self.unknown_id = self.vocab_size + 1
        self.cls_id = self.unknown_id + 1
        self.sep_id = self.cls_id + 1
        return self

    def transform(self, data, pad = "pre", truncation = "post"):
        """Encode sentences with the fitted vocabulary (no refitting).

        Each sentence becomes ``[cls_id, <word ids...>, sep_id]``; punctuation
        tokens map to ``sep_id`` and unseen words to ``unknown_id``.

        Args:
            data: iterable of whitespace-separated sentences.
            pad: "pre" or "post" - side on which ``pad_id`` is added to short
                sequences. Any other value leaves short sequences unpadded.
            truncation: "pre" or "post" - side from which long sequences are
                cut. Any other value leaves long sequences untouched.
                Note that "post" can cut off the closing ``sep_id``.

        Returns:
            A list with one list of ids per sentence.

        Raises:
            RuntimeError: if the tokenizer has not been fitted yet.
        """
        if self.cls_id is None:
            raise RuntimeError(
                "Tokenizer has not been fitted; call fit() or fit_transform() first"
            )

        ids = []
        for sentence in data:
            encoded = [self.cls_id]
            for word in sentence.split():
                if word in self.SEP_VOCAB:
                    encoded.append(self.sep_id)
                else:
                    encoded.append(self.vocab_map.get(word, self.unknown_id))
            encoded.append(self.sep_id)
            ids.append(self._fit_length(encoded, pad, truncation))
        return ids

    def fit_transform(self, data, pad = "pre", truncation = "post"):
        """Fit the vocabulary on ``data`` and return its encoded sequences.

        Equivalent to ``fit(data)`` followed by ``transform(data, ...)``; see
        those methods for the meaning of the arguments and the return value.
        """
        data = list(data)  # materialise once: the sentences are read twice
        self.fit(data)
        return self.transform(data, pad=pad, truncation=truncation)

    def _fit_length(self, sequence, pad, truncation):
        """Truncate or pad one id sequence towards ``max_length``."""
        if len(sequence) > self.max_length:
            if truncation == "pre":
                return sequence[-self.max_length:]
            if truncation == "post":
                return sequence[:self.max_length]
            return sequence

        pad_len = self.max_length - len(sequence)
        if pad == "pre":
            return [self.pad_id] * pad_len + sequence
        if pad == "post":
            return sequence + [self.pad_id] * pad_len
        return sequence
        

In [92]:
# Clean every question with norm_text (digits dropped by default), then reduce
# it with to_deep_learning; the result is a Series of space-joined tokens.
texts = df["question"].apply(lambda x: to_deep_learning(norm_text(x)))

In [93]:
# Convert the Series to a plain Python list of strings for the tokenizer.
# Note: this rebinds `texts`, so re-running this cell alone fails (a list has no to_list).
texts = texts.to_list()

In [94]:
# Keep at most 1000 words and encode every question as exactly 100 ids.
tokenizer = Tokenizer(vocab_size=1000, max_length=100)
# Default padding ("pre") and truncation ("post") are used.
ids = tokenizer.fit_transform(texts)

In [95]:
# Stack the id lists into an array; with the default pad/truncation settings
# every row has max_length (100) entries.
ids = np.array(ids)

In [96]:
# Sanity check: the largest id the tokenizer can emit is sep_id, i.e. the
# fitted vocabulary size + 3.
ids.max()

352

In [97]:
# Inspect the fitted word -> id mapping (ids start at 1, most frequent word first).
tokenizer.vocab_map

{'imag': 1,
 'ani': 2,
 'what': 3,
 'visibl': 4,
 'polyp': 5,
 'present': 6,
 'abnorm': 7,
 'instrument': 8,
 'how': 9,
 'mani': 10,
 'locat': 11,
 'where': 12,
 'type': 13,
 'landmark': 14,
 'anatom': 15,
 'procedur': 16,
 'black': 17,
 'green': 18,
 'text': 19,
 'box': 20,
 'color': 21,
 'find': 22,
 'observ': 23,
 'size': 24,
 'remov': 25,
 'artifact': 26,
 'artefact': 27,
 'perform': 28,
 'depict': 29,
 'remain': 30,
 'gastrointestin': 31,
 'area': 32,
 'identifi': 33,
 'evid': 34,
 'region': 35,
 'medic': 36,
 'surgic': 37,
 'if': 38,
 'obtain': 39,
 'seen': 40,
 'tract': 41,
 'complet': 42,
 'tissu': 43,
 'element': 44,
 'check': 45,
 'boxlik': 46,
 'distinct': 47,
 'textual': 48,
 'associ': 49,
 'number': 50,
 'relev': 51,
 'shown': 52,
 'residu': 53,
 'sign': 54,
 'ulcer': 55,
 'coliti': 56,
 'presenc': 57,
 'excis': 58,
 'detect': 59,
 'taken': 60,
 'content': 61,
 'inform': 62,
 'confirm': 63,
 'contain': 64,
 'statu': 65,
 'colon': 66,
 'within': 67,
 'lesion': 68,
 'use': 6

In [98]:
import os
import pickle

# Persist the fitted tokenizer for later use.
# pickle stores the instance's attributes plus a reference to the Tokenizer
# class, so whatever loads this file must have the same class importable.
# Only load pickle files from sources you trust.
os.makedirs("models", exist_ok=True)  # open() fails if the directory is missing
with open("models/tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)