In [32]:
import pandas as pd
import numpy as np
from struct import unpack
from base64 import b64decode
import string

In [2]:
filename = "/Users/dmitry/Downloads/topics_dataset.json"
df = pd.read_json(filename, lines=True)

In [3]:
df_cutted = df.sample(n=10000)
df_cutted.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 15427 to 149484
Data columns (total 3 columns):
x1    10000 non-null object
x2    10000 non-null object
y1    10000 non-null int64
dtypes: int64(1), object(2)
memory usage: 312.5+ KB


In [120]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, concatenate
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical

In [49]:
inp_img = Input(shape=(1024,))
inp_txt = Input(shape=(300,))

In [121]:
es = EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.001, patience=3)

def get_model_1():
    x_img = Dense(64, activation='relu')(inp_img)
    x_img = Dropout(0.25)(x_img)
    x_img = Dense(64, activation='relu')(x_img)

    x_txt = Dense(64, activation='relu')(inp_txt)
    x_txt = Dropout(0.25)(x_txt)
    x_txt = Dense(64, activation='relu')(x_txt)

    x = concatenate([x_img, x_txt])
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(128, activation='relu')(x)
    out = Dense(50, activation='softmax')(x)

    model = Model(inputs=[inp_img, inp_txt], outputs=out)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [58]:
IMG_LEN = 1024
TXT_LEN = 300
N_CLASSES = 50

In [109]:
from functools import partial

def unpck(l, x):
    return unpack('%df' % l, b64decode(x.encode('utf-8')))

unpck_img = partial(unpck, IMG_LEN)
unpck_txt = partial(unpck, TXT_LEN)

x_img = np.stack(df_cutted['x1'].map(unpck_img), axis=0)
x_txt = np.stack(df_cutted['x2'].map(unpck_txt), axis=0)
y = to_categorical(np.array(df_cutted['y1']), N_CLASSES)

In [110]:
print(x_img.shape, x_txt.shape, y.shape)

(10000, 1024) (10000, 300) (10000, 50)


In [111]:
model.fit([x_img, x_txt], y, epochs=10, validation_split=0.1)

Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x1489a95d0>

In [113]:
df_test = df.sample(n=10000)
x_img_test = np.stack(df_test['x1'].map(unpck_img), axis=0)
x_txt_test = np.stack(df_test['x2'].map(unpck_txt), axis=0)
y_test = to_categorical(np.array(df_test['y1']), N_CLASSES)

In [115]:
model.evaluate([x_img_test, x_txt_test], y_test, verbose=0)

[0.04970714685320854, 0.98471624]

In [117]:
df_q = df.sample(frac=0.25)
x_img_q = np.stack(df_q['x1'].map(unpck_img), axis=0)
x_txt_q = np.stack(df_q['x2'].map(unpck_txt), axis=0)
y_q = to_categorical(np.array(df_q['y1']), N_CLASSES)

In [122]:
model = get_model_1()
model.fit([x_img_q, x_txt_q], y_q, epochs=20, validation_split=0.1, callbacks=[es])
model.evaluate([x_img_test, x_txt_test], y_test, verbose=0)

Train on 47776 samples, validate on 5309 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


[0.043834722620248796, 0.98616064]

In [128]:
from sklearn.model_selection import train_test_split
x_img_train, x_img_test, x_txt_train, x_txt_test, y_train, y_test = train_test_split(x_img_q, x_txt_q, y_q, test_size=0.2, random_state=42)

In [129]:
model.fit([x_img_train, x_txt_train], y_train, epochs=20, validation_split=0.1, callbacks=[es])

Train on 38221 samples, validate on 4247 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20


<tensorflow.python.keras.callbacks.History at 0x14eaec390>

In [131]:
model.evaluate([x_img_test, x_txt_test], y_test, verbose=0)

[0.04204885141142088, 0.986618]