## Import and Preprocess Data

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [2]:
# Input file locations, relative to the notebook's working directory.
# The files are read line by line, one JSON object per line.
train_data_1, train_data_2, test_data = (
    "./data/domain1_train.json",
    "./data/domain2_train.json",
    "./data/test_set.json",
)

In [3]:
# Read the domain-1 training set: one JSON object per line.
train = []
with open(train_data_1, encoding="utf-8") as f:
    # Iterate the file lazily instead of materialising every line via readlines().
    for line in f:
        # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
        if line.strip():
            train.append(json.loads(line))

def prepare_data(data):
    """Split loaded records into padded inputs and a label array.

    Args:
        data: Iterable of records, each indexable by ``'text'`` and ``'label'``.

    Returns:
        A ``(texts, labels)`` pair: the ``'text'`` entries passed through
        ``pad_sequences`` with its default arguments, and the ``'label'``
        entries collected into a NumPy array.
    """
    sequences = [record['text'] for record in data]
    padded = pad_sequences(sequences)
    targets = np.array([record['label'] for record in data])
    return padded, targets

# Padded 'text' sequences and the matching labels built from the `train` records.
texts, labels = prepare_data(train)

def build_model(train_x, dim=128, vocab_size=5000):
    """Build and compile a stacked bidirectional-LSTM binary classifier.

    Args:
        train_x: 2-D array of padded sequences. Only ``train_x.shape[1]`` is
            used, as the Embedding layer's ``input_length``.
        dim: Output dimension of the Embedding layer.
        vocab_size: ``input_dim`` of the Embedding layer; it must be larger
            than the largest integer index present in the inputs. Defaults to
            5000, the value that used to be hard-coded here.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, binary cross-entropy
        loss, accuracy metric) whose last layer is a single sigmoid unit.
    """
    model = Sequential(
        [
            layers.Embedding(input_dim=vocab_size, output_dim=dim, input_length=train_x.shape[1]),
            # First recurrent layer returns the full sequence so the second one can consume it.
            layers.Bidirectional(layers.LSTM(64, return_sequences=True)),
            layers.Bidirectional(layers.LSTM(32)),
            layers.Dense(64, activation='relu'),
            layers.Dropout(0.5),
            layers.Dense(1, activation='sigmoid')
        ]
    )
    model.compile(optimizer='adam', loss='binary_crossentropy',  metrics=['accuracy'])

    return model
print(texts.shape, labels.shape)

# Hold out 20% of the samples for validation; the fixed random_state keeps the split repeatable.
x_train, x_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=12)
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape)

# Seed the Python, NumPy and TensorFlow RNGs before the model is created so that
# weight initialisation and dropout are repeatable too, not only the split above.
keras.utils.set_random_seed(12)
model = build_model(x_train)
# Stop as soon as val_loss fails to improve for one epoch and roll back to the best weights seen.
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, restore_best_weights=True)


(19500, 238) (19500,)
(15600, 238) (15600,) (3900, 238) (3900,)


In [4]:
# Train for at most 10 epochs; the early-stopping callback may end the run sooner.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=10,
    callbacks=[early_stopping],
)


Epoch 1/10
Epoch 2/10
Epoch 3/10


In [11]:
# Read the test set: one JSON object per line.
test = []
with open(test_data, encoding="utf-8") as f:
    # Iterate the file lazily instead of materialising every line via readlines().
    for line in f:
        # Skip blank lines (e.g. a trailing newline), which json.loads would reject.
        if line.strip():
            test.append(json.loads(line))

# Pad/truncate to the training sequence length so the inputs match the width the model was built with.
test_txt = pad_sequences([i['text'] for i in test], maxlen=texts.shape[1])

predictions = model.predict(test_txt)

def to_csv(predictions, name='./data/result.csv', threshold=0.5):
    """Threshold predicted scores into 0/1 labels and write them to a CSV file.

    Args:
        predictions: Array-like of scores of any shape; it is flattened in
            order. Plain Python lists are accepted as well as NumPy arrays.
        name: Path of the CSV file to write.
        threshold: Scores strictly greater than this value become 1, all
            others become 0.

    The written file has two columns: ``id`` (0-based position of the
    prediction) and ``label``; the DataFrame index is not written.
    """
    # Coerce first so that list inputs support the element-wise comparison below.
    scores = np.asarray(predictions)
    labels = np.where(scores > threshold, 1, 0).reshape(-1)
    df = pd.DataFrame({'id': range(len(labels)), 'label': labels})
    df.to_csv(name, index=False)

# Write the thresholded test predictions to the default path ./data/result.csv.
to_csv(predictions)


