In [1]:
import pandas as pd
import numpy as np
from struct import unpack
from base64 import b64decode
import string

In [2]:
filename = "/Users/dmitry/Downloads/topics_dataset.json"
df = pd.read_json(filename, lines=True)

In [3]:
df_cutted = df.sample(n=10000)
df_cutted.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 194086 to 103402
Data columns (total 3 columns):
x1    10000 non-null object
x2    10000 non-null object
y1    10000 non-null int64
dtypes: int64(1), object(2)
memory usage: 312.5+ KB


In [4]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Activation, Dropout, concatenate
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical

Using TensorFlow backend.


In [5]:
inp_img = Input(shape=(1024,))
inp_txt = Input(shape=(300,))

In [6]:
es = EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.001, patience=3)

def get_model_1():
    x_img = Dense(64, activation='relu')(inp_img)
    x_img = Dropout(0.25)(x_img)
    x_img = Dense(64, activation='relu')(x_img)

    x_txt = Dense(64, activation='relu')(inp_txt)
    x_txt = Dropout(0.25)(x_txt)
    x_txt = Dense(64, activation='relu')(x_txt)

    x = concatenate([x_img, x_txt])
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(128, activation='relu')(x)
    out = Dense(50, activation='softmax')(x)

    model = Model(inputs=[inp_img, inp_txt], outputs=out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [7]:
def get_model_2():
    x_img = Dense(128, activation='relu')(inp_img)
    x_img = Dropout(0.25)(x_img)
    x_img = Dense(128, activation='tanh')(x_img)

    x_txt = Dense(128, activation='relu')(inp_txt)
    x_txt = Dropout(0.25)(x_txt)
    x_txt = Dense(128, activation='tanh')(x_txt)

    x = concatenate([x_img, x_txt])
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(256, activation='tanh')(x)
    x = Dropout(0.1)(x)
    x = Dense(256, activation='relu')(x)
    out = Dense(50, activation='softmax')(x)

    model = Model(inputs=[inp_img, inp_txt], outputs=out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [8]:
IMG_LEN = 1024
TXT_LEN = 300
N_CLASSES = 50

In [9]:
from functools import partial

def unpck(l, x):
    return unpack('%df' % l, b64decode(x.encode('utf-8')))

unpck_img = partial(unpck, IMG_LEN)
unpck_txt = partial(unpck, TXT_LEN)

x_img = np.stack(df_cutted['x1'].map(unpck_img), axis=0)
x_txt = np.stack(df_cutted['x2'].map(unpck_txt), axis=0)
y = to_categorical(np.array(df_cutted['y1']), N_CLASSES)

In [10]:
print(x_img.shape, x_txt.shape, y.shape)

(10000, 1024) (10000, 300) (10000, 50)


In [172]:
model = get_model_2()
model.fit([x_img, x_txt], y, epochs=10, validation_split=0.1)

Train on 9000 samples, validate on 1000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x14fe712d0>

In [16]:
df_test = df.sample(n=10000)
x_img_test = np.stack(df_test['x1'].map(unpck_img), axis=0)
x_txt_test = np.stack(df_test['x2'].map(unpck_txt), axis=0)
y_test = to_categorical(np.array(df_test['y1']), N_CLASSES)

In [115]:
model.evaluate([x_img_test, x_txt_test], y_test, verbose=0)

[0.04970714685320854, 0.98471624]

In [17]:
df_q = df.sample(frac=0.25)
x_img_q = np.stack(df_q['x1'].map(unpck_img), axis=0)
x_txt_q = np.stack(df_q['x2'].map(unpck_txt), axis=0)
y_q = to_categorical(np.array(df_q['y1']), N_CLASSES)

In [18]:
from sklearn.model_selection import train_test_split
x_img_train, x_img_test, x_txt_train, x_txt_test, y_train, y_test = train_test_split(x_img_q, x_txt_q, y_q, test_size=0.2, random_state=42)

In [186]:
model.evaluate([x_img_test, x_txt_test], y_test, verbose=0)

[1.5349045181112728, 0.58839595]

In [133]:
y_pred = model.predict([x_img_test, x_txt_test])

In [139]:
np.argmax(y_pred, axis=1).shape

(10617,)

In [140]:
from sklearn.metrics import classification_report
print(classification_report(np.argmax(y_test, axis=1), np.argmax(y_pred, axis=1)))

              precision    recall  f1-score   support

           0       0.72      0.89      0.79       403
           1       0.53      0.72      0.61       386
           2       0.58      0.57      0.58        54
           3       0.77      0.90      0.83       441
           4       0.59      0.43      0.50        81
           5       0.45      0.42      0.44       160
           6       0.81      0.84      0.82       261
           7       0.59      0.76      0.66       116
           8       0.44      0.73      0.55       128
           9       0.68      0.72      0.70       208
          10       0.74      0.92      0.82       310
          11       0.71      0.86      0.78       273
          12       0.51      0.54      0.53       307
          13       0.65      0.78      0.71       272
          14       0.61      0.63      0.62       186
          15       0.45      0.55      0.49       120
          16       0.72      0.75      0.73       456
          17       0.46    

  'precision', 'predicted', average, warn_for)


In [181]:
model = get_model_2()
model.fit([x_img_train, x_txt_train], y_train, epochs=20, validation_split=0.1, callbacks=[es])

Train on 38221 samples, validate on 4247 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20


<tensorflow.python.keras.callbacks.History at 0x160006390>

In [182]:
model.evaluate([x_img_test, x_txt_test], y_test, verbose=0)

[1.5739775799622562, 0.5870773, 0.5870773]

In [187]:
model = get_model_1()
model.evaluate([x_img_test, x_txt_test], y_test, verbose=0)

[3.921098195081648, 0.030988038]

In [188]:
x_img_f = np.stack(df['x1'].map(unpck_img), axis=0)
x_txt_f = np.stack(df['x2'].map(unpck_txt), axis=0)
y_f = to_categorical(np.array(df['y1']), N_CLASSES)

In [189]:
model = get_model_1()
model.fit([x_img_f, x_txt_f], y_f, epochs=30, validation_split=0.2, callbacks=[es])

Train on 169872 samples, validate on 42468 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30


<tensorflow.python.keras.callbacks.History at 0x165ba4290>

In [19]:
es = EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.001, patience=3)

def get_model_3():
    inp_img = Input(shape=(1024,))
    inp_txt = Input(shape=(300,))
    
    x_img = Dense(256, activation='relu')(inp_img)
    x_img = Dropout(0.25)(x_img)
    x_img = Dense(256, activation='relu')(x_img)
    
    x_txt = Dense(256, activation='relu')(inp_txt)
    x_txt = Dropout(0.25)(x_txt)
    x_txt = Dense(256, activation='relu')(x_txt)
    
    x = concatenate([x_img, x_txt])
    x = Dense(512, activation='relu')(x)
    x = Dropout(0.25)(x)
    x = Dense(512, activation='relu')(x)
    out = Dense(50, activation='softmax')(x)

    model = Model(inputs=[inp_img, inp_txt], outputs=out)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [22]:
model_3 = get_model_3()
model_3.fit([x_img_train, x_txt_train], y_train, epochs=20, validation_split=0.1, callbacks=[es], batch_size=1024)

Train on 38221 samples, validate on 4247 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20


<tensorflow.python.keras.callbacks.History at 0x139fed7d0>