In [1]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import numpy as np
import tensorflow as tf
import nltk
from PIL import Image
import os
import re
import sys
import pickle
import random
from tensorflow.python.keras.applications import InceptionV3
from tensorflow.python.keras.models import Model

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
dataPath="./data/"
if "google.colab" in sys.modules:
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)
    dataPath="/content/drive/My Drive/MVI project/data/"
    print("Pouziva se google colab.")
    nltk.download('punkt')
else:
    print("Pouziva se lokalni vypocet.")

Pouziva se lokalni vypocet.


In [3]:
# Rozdelim si obrazky na trenovaci a testovaci mnozinu,
# obrazky jsou vsechny v jpg, takze priponu neresim
trainImages=[]
testImages=[]
with open(dataPath+"Flickr8k_text/Flickr_8k.trainImages.txt", "r") as f:
    for line in f:
        trainImages.append(line.split(".")[0])
with open(dataPath+"Flickr8k_text/Flickr_8k.testImages.txt", "r") as f:
    for line in f:
        testImages.append(line.split(".")[0])
print("Trenovacich je {} a testovacich {}.".format(len(trainImages), len(testImages)))
print("Priklad formatu ulozeni jmena obrazku: {}".format(trainImages[0]))

Trenovacich je 6000 a testovacich 1000.
Priklad formatu ulozeni jmena obrazku: 2513260012_03d33305cf


In [4]:
origImagePath=dataPath+"Flicker8k_Dataset/"
imagePath=dataPath+"resized/"
# Zmeni rozliseni obrazku na danou velikost
def resizeImage(imgName):
    width=299
    height=299
    temp=tf.keras.preprocessing.image.load_img(origImagePath+imgName+".jpg",
                                                       target_size=(height,width))
    #temp=Image.open(origImagePath+imgName+".jpg")
    temp=temp.resize((width, height),Image.LANCZOS)
    temp.save(imagePath+imgName+".jpg")

# Pokud ve slozce s resized jsou obrazky, tak se resizing neprovadi
cnt=0
for imgName in trainImages:
    if not os.path.exists(imagePath+imgName+".jpg"):
        resizeImage(imgName)
        cnt+=1
for imgName in testImages:
    if not os.path.exists(imagePath+imgName+".jpg"):
        resizeImage(imgName)
        cnt+=1
print("Resizing byl proveden u {} z {} obrazku.".format(cnt, len(trainImages)+len(testImages)))

Resizing byl proveden u 0 z 7000 obrazku.


In [5]:
# Nacteni obrazku a jejich popisu
# Vysledkem je list struktur (id obrazku, popis1,....,popis5)
imageCaptions={}
with open(dataPath+"Flickr8k_text/Flickr8k.token.txt", "r") as f:
    for line in f:
        temp=line.split(maxsplit=1)
        fileId=temp[0].split(".")[0]
        caption=temp[1][:-1]
        if fileId not in imageCaptions:
            imageCaptions[fileId]=[]
        imageCaptions[fileId].append(caption)
print("Nactena data {} obrazku.".format(len(imageCaptions)))
print("Priklad ulozenych dat: {}".format(list(imageCaptions.items())[0]))

Nactena data 8092 obrazku.
Priklad ulozenych dat: ('1000268201_693b08cb0e', ['A child in a pink dress is climbing up a set of stairs in an entry way .', 'A girl going into a wooden building .', 'A little girl climbing into a wooden playhouse .', 'A little girl climbing the stairs to her playhouse .', 'A little girl in a pink dress going into a wooden cabin .'])


# CNN #

In [6]:
# Nacteni modelu na rozpoznavani ficur v obrazku
# Konkretne Inception V3 predtrenovany na imagenet datech
fea_model=InceptionV3(weights='imagenet')
# Odstraneni klasifikacni (posledni) vrstvy
fea_model=Model(fea_model.input, fea_model.layers[-2].output)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [7]:
# Architektura modelu
fea_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 299, 299, 3) 0                                            
__________________________________________________________________________________________________
conv2d (Conv2D)                 (None, 149, 149, 32) 864         input_1[0][0]                    
__________________________________________________________________________________________________
batch_normalization (BatchNorma (None, 149, 149, 32) 96          conv2d[0][0]                     
__________________________________________________________________________________________________
activation (Activation)         (None, 149, 149, 32) 0           batch_normalization[0][0]        
______________________________________________________________________________________________

In [8]:
# Prozene obrazek Inception V3 a vyhodi vystupy predposledni vrstvy
def runImageThroughV3(inpImg):
    temp=Image.open(imagePath+inpImg+".jpg")
    temp=tf.keras.preprocessing.image.img_to_array(temp)
    temp=np.expand_dims(temp, axis=0)
    temp=tf.keras.applications.inception_v3.preprocess_input(temp)
    temp=fea_model.predict(temp)
    return temp[0]

v3_train_path=dataPath+"v3imgOutput.pkl"
if os.path.exists(v3_train_path):
    with open(v3_train_path, 'rb') as f:
        v3_train_output=pickle.load(f)
    print("V3 Training nactena.")
else:
    v3_train_output={}
    cnt=0
    for img in trainImages:
        v3_train_output[img]=runImageThroughV3(img)
        cnt+=1
        if cnt%500==0:
            print("Zpracovano {} z {}.".format(cnt, len(trainImages)))
    with open(v3_train_path, 'wb') as f:
        pickle.dump(v3_train_output, f)
    print("V3 Training zprocesovana a ulozena.")

V3 Training nactena.


# Tokenizace popisků #

In [9]:
# Oznaceni zacatku a konce captions
print("Vychozi vzhled popisku: {}".format(list(imageCaptions.values())[0][0]))
mark_start="ssssss"
mark_end="eeeeee"
cnt=0
for img in imageCaptions:
    # Kdyby se to nahodou poustelo dvakrat, aby nebyl potreba reset
    if imageCaptions[img][0].split(maxsplit=1)[0]==mark_start:
        break
    for i in range(0,5):
        imageCaptions[img][i]=mark_start+" "+imageCaptions[img][i]+" "+mark_end
        cnt+=1
print("Novy vzhled popisku: {}".format(list(imageCaptions.values())[0][0]))
print("Transformovano {} popisku.".format(cnt))

Vychozi vzhled popisku: A child in a pink dress is climbing up a set of stairs in an entry way .
Novy vzhled popisku: ssssss A child in a pink dress is climbing up a set of stairs in an entry way . eeeeee
Transformovano 40460 popisku.


In [10]:
def unwrapCaptions(imgCaptions):
    ret=[]
    for img in imgCaptions:
        ret.extend(imgCaptions[img])
    return ret

# Tokenizuje popisky a zaroven vytvori indexy pro jednotliva slova
vocabSize=2000
tokenizer=tf.keras.preprocessing.text.Tokenizer(num_words=vocabSize)
tokenizer.fit_on_texts(unwrapCaptions(imageCaptions))
# Ulozim si tokenizovana slova ve forme indexu
captionsTokenized={}
for img in imageCaptions:
    captionsTokenized[img]=tokenizer.texts_to_sequences(list(imageCaptions[img]))
print("Priklad: {}\n je reprezentovany jako {}".format(list(
    imageCaptions.values())[0][0], list(captionsTokenized.values())[0][0]))
# Pristup ke slovum skrze jejich indexy
indexToWord=dict(map(reversed, tokenizer.word_index.items()))
# Nejdelsi caption
maxCaptLen=max([len(i) for i in unwrapCaptions(captionsTokenized)])

Priklad: ssssss A child in a pink dress is climbing up a set of stairs in an entry way . eeeeee
 je reprezentovany jako [2, 1, 43, 4, 1, 90, 172, 7, 119, 51, 1, 394, 12, 395, 4, 28, 670, 3]


# Generace dat #

In [11]:
# Vyhodi nahodne popisky pro dane obrazky
def getRandCapTok(ind, captionsTokenized):
    res=[]
    for i in ind:
        j=np.random.choice(len(captionsTokenized[i]))
        toks=captionsTokenized[i][j]
        res.append(toks)
    return res

In [12]:
# Generuje trenovaci data pro RNN
def dataGenerator(captionsTokenized, v3_output, maxCaptLen, vocabSize, batchSize):
    while True:
        ind=random.sample(list(captionsTokenized), batchSize)
        feature_values=[]
        for i in ind:
            feature_values.append(v3_output[i])
        toks=getRandCapTok(ind, captionsTokenized)
        num_tokens=[len(t) for t in toks]
        max_tokens=np.max(num_tokens)
        toksPadded=tf.keras.preprocessing.sequence.pad_sequences(toks, maxlen=max_tokens,
                                                                 padding='post', truncating='post')
        
        decoder_input_data=toksPadded[:,0:-1]
        decoder_output_data=toksPadded[:,1:]
        xRet=\
        {
            'decoder_input': np.array(decoder_input_data),
            'feature_values_input': np.array(feature_values)
        }
        yRet=\
        {
            'decoder_output': np.array(decoder_output_data)
        }
        yield (xRet, yRet)

In [13]:
# Pouze captions z trenovacich obrazku
trainCaptionsTokenized={k:captionsTokenized[k] for k in trainImages}
# sample generator s batchsize 4
gener=dataGenerator(trainCaptionsTokenized, v3_train_output, maxCaptLen, vocabSize, 4)
temp=next(gener)
tempx=temp[0]
tempy=temp[1]
print("Priklad feature values inputu: {}".format(tempx['feature_values_input'][0]))
print("Priklad decoder inputu: {}".format(tempx['decoder_input'][0]))
print("Priklad decoder outputu: {}".format(tempy['decoder_output'][0]))

Priklad feature values inputu: [0.38199306 0.2568115  1.0268419  ... 0.         0.15898944 0.25276464]
Priklad decoder inputu: [  2 189   7   1  26  19   4   1  59 172   8   1  26  16   4   1  14 215
  38   4   1  52  12  41]
Priklad decoder outputu: [189   7   1  26  19   4   1  59 172   8   1  26  16   4   1  14 215  38
   4   1  52  12  41   3]


# RNN #

In [14]:
batchSize=512
stepsInEpoch=(len(list(imageCaptions.values())[0])*len(trainImages))//batchSize
print("Kroku v kazde epose: {}".format(stepsInEpoch))

Kroku v kazde epose: 58


In [15]:
stateSize=256
embeddingSize=128
feature_values_input=tf.keras.Input(shape=(2048,), name="feature_values_input")
decoder_feature_map=tf.keras.layers.Dense(stateSize, activation="tanh", name="decoder_feature_map")
# input none znamena libovolnou delku
decoder_input=tf.keras.Input(shape=(None,), name="decoder_input")
# konvertuje integer tokeny na sekvence vektoru
decoder_embedding=tf.keras.layers.Embedding(input_dim=vocabSize,
                                            output_dim=embeddingSize, name="decoder_embedding")
# 3 gru vrstvy, vyhazuji tensor [batchsize, seqlen, statesize]
decoder_gru1=tf.keras.layers.GRU(stateSize, name="decoder_gru1", return_sequences=True)
decoder_gru2=tf.keras.layers.GRU(stateSize, name="decoder_gru2", return_sequences=True)
decoder_gru3=tf.keras.layers.GRU(stateSize, name="decoder_gru3", return_sequences=True)
decoder_dense=tf.keras.layers.Dense(vocabSize, activation="linear", name="decoder_output")

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [16]:
# assembly vysledneho modelu
initState=decoder_feature_map(feature_values_input)
net=decoder_input
net=decoder_embedding(net)
net=decoder_gru1(net, initial_state=initState)
net=decoder_gru2(net, initial_state=initState)
net=decoder_gru3(net, initial_state=initState)
decoder_output=decoder_dense(net)

decoder_model=Model(inputs=[feature_values_input, decoder_input], outputs=[decoder_output])
decoder_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
decoder_input (InputLayer)      [(None, None)]       0                                            
__________________________________________________________________________________________________
feature_values_input (InputLaye [(None, 2048)]       0                                            
__________________________________________________________________________________________________
decoder_embedding (Embedding)   (None, None, 128)    256000      decoder_input[0][0]              
__________________________________________________________________________________________________
decoder_feature_map (Dense)     (None, 256)          524544      feature_values_input[0][0]       
____________________________________________________________________________________________

In [17]:
# Nastaveni optimizeru
def sparse_cross_entropy(true, pred):
    loss=tf.nn.sparse_softmax_cross_entropy_with_logits(labels=true, logits=pred)
    ret=tf.reduce_mean(loss)
    return ret

optimizer=tf.keras.optimizers.RMSprop(lr=1e-3)
decoder_target=tf.placeholder(dtype="int32", shape=(None, None))
decoder_model.compile(optimizer=optimizer, loss=sparse_cross_entropy, target_tensors=[decoder_target])

In [18]:
# Nastaveni checkpointu
checkpointPath=dataPath+"rnn_checkpoint.keras"
callbackchp=tf.keras.callbacks.ModelCheckpoint(filepath=checkpointPath, verbose=1, save_weights_only=True)
callbacktb=tf.keras.callbacks.TensorBoard(log_dir=dataPath+"RNN_log/", histogram_freq=0, write_graph=False)
callbacks=[callbackchp, callbacktb]

# Nacteni modelu, resp jeho vah
modelFound=False
try:
    decoder_model.load_weights(checkpointPath)
    modelFound=True
except Exception as e:
    modelFound=False
    print(e)

In [19]:
# Trening modelu
gener=dataGenerator(trainCaptionsTokenized, v3_train_output, maxCaptLen, vocabSize, batchSize)
if not modelFound:
  decoder_model.fit_generator(generator=gener, steps_per_epoch=stepsInEpoch, epochs=20, callbacks=callbacks)
print("Trening hotov.")

Trening hotov.


# Generace popisků #

In [20]:
# Generuje popisek pro dany obrazek
def generateCaption(imgPath):
    width=299
    height=299
    maxTokens=30
    image=tf.keras.preprocessing.image.load_img(imgPath, target_size=(height,width))
    image=image.resize((width, height),Image.LANCZOS)
    image=tf.keras.preprocessing.image.img_to_array(image)
    image=np.expand_dims(image, axis=0)
    image=tf.keras.applications.inception_v3.preprocess_input(image)
    image=fea_model.predict(image)
    feature_values=image
    shp=(1, maxTokens)
    decoder_input_data=np.zeros(shape=shp, dtype=np.int)
    theToken=tokenizer.word_index[mark_start]
    outputText=""
    tokenCount=0
    while theToken!=tokenizer.word_index[mark_end] and tokenCount<maxTokens:
        decoder_input_data[0, tokenCount]=theToken
        tmp=np.array(feature_values)
        x=\
        {
            'feature_values_input': np.array(feature_values),
            'decoder_input': decoder_input_data
        }
        decoder_output=decoder_model.predict(x)
        tok_onehot=decoder_output[0, tokenCount, :]
        theToken=np.argmax(tok_onehot)
        theWord=indexToWord[theToken]
        if theToken>3:
            outputText+=" " + theWord
        tokenCount+=1
    print("Popisek je: {}".format(outputText.strip()))
    return outputText.strip()

In [30]:
query1=generateCaption(dataPath+"kitten.jpg")

Popisek je: black and white cat is jumping in the air


In [31]:
query2=generateCaption(dataPath+"manwithsuit.jpg")

Popisek je: man in suit and black jacket is sitting on bench


In [32]:
query3=generateCaption(dataPath+"merc-c.jpg")

Popisek je: car is driving around car at car


# Doc2Vec #

In [33]:
#porStem=nltk.stem.PorterStemmer()
def cleanWord(theWord):
    theWord=theWord.lower()
    theWord=re.sub('[^a-z0-9]+', '', theWord)
    #theWord=porStem.stem(theWord)
    if len(theWord)<2:
        return ""
    return theWord

# Nacteni clanku
articleFiles=[]
for r, d, f in os.walk(dataPath+"bbc/"):
    for file in f:
        temp=file.split(".")
        if temp[1].lower()=="txt" and temp[0].lower()!="readme":
            articleFiles.append(r+"/"+file)
intToArt=[]
articles=[]
for article in articleFiles:
    with open(article, "r") as f:
        tmp=f.read()
        tmp=nltk.word_tokenize(tmp)
        tmp=[cleanWord(w) for w in tmp]
        tmp=[w for w in tmp if len(w)>1]
        nummer=len(intToArt)
        articles.append(TaggedDocument(tmp, [nummer]))
        intToArt.append(article)
print("Clanky nacteny.")

Clanky nacteny.


In [34]:
# Query bude vysledek NN
doc2vec_model_path=dataPath+"doc2vec.model"
if os.path.exists(doc2vec_model_path):
    model=Doc2Vec.load(doc2vec_model_path)
    print("Model nacten.")
else:
    model=Doc2Vec(articles, vector_size=50, min_count=2, epochs=40, workers=4)
    model.save(doc2vec_model_path)
    print("Model vytvoren.")
# Comparing query and titles
print("Pro query \"{}\":".format(query1))
query1=nltk.word_tokenize(query1)
#query=[cleanWord(w) for w in query]
#query=[w for w in query if len(w)>1]
qVec=model.infer_vector(query1)
sim=model.docvecs.most_similar([qVec], topn=5)
for i in range(5):
    print("{}. nejblizsi dokument je {}.".format(i, intToArt[sim[i][0]]))
#print(sim)

Model nacten.
Pro query "black and white cat is jumping in the air":
0. nejblizsi dokument je ./data/bbc/sport/084.txt.
1. nejblizsi dokument je ./data/bbc/sport/052.txt.
2. nejblizsi dokument je ./data/bbc/sport/077.txt.
3. nejblizsi dokument je ./data/bbc/sport/024.txt.
4. nejblizsi dokument je ./data/bbc/entertainment/309.txt.


In [35]:
print("Pro query \"{}\":".format(query2))
query2=nltk.word_tokenize(query2)
qVec=model.infer_vector(query2)
sim=model.docvecs.most_similar([qVec], topn=5)
for i in range(5):
    print("{}. nejblizsi dokument je {}.".format(i, intToArt[sim[i][0]]))

Pro query "man in suit and black jacket is sitting on bench":
0. nejblizsi dokument je ./data/bbc/sport/263.txt.
1. nejblizsi dokument je ./data/bbc/sport/167.txt.
2. nejblizsi dokument je ./data/bbc/sport/158.txt.
3. nejblizsi dokument je ./data/bbc/sport/262.txt.
4. nejblizsi dokument je ./data/bbc/sport/166.txt.


In [36]:
print("Pro query \"{}\":".format(query3))
query3=nltk.word_tokenize(query3)
qVec=model.infer_vector(query3)
sim=model.docvecs.most_similar([qVec], topn=5)
for i in range(5):
    print("{}. nejblizsi dokument je {}.".format(i, intToArt[sim[i][0]]))

Pro query "car is driving around car at car":
0. nejblizsi dokument je ./data/bbc/sport/463.txt.
1. nejblizsi dokument je ./data/bbc/sport/167.txt.
2. nejblizsi dokument je ./data/bbc/sport/471.txt.
3. nejblizsi dokument je ./data/bbc/sport/262.txt.
4. nejblizsi dokument je ./data/bbc/sport/388.txt.
