In [None]:
import random

import keras
import numpy as np
import pandas as pd
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler

Using TensorFlow backend.


In [None]:
# --- Reproducibility -------------------------------------------------------
# Seed used by split_train_dev when shuffling before the train/dev split.
SEED = 42

# --- Input files -----------------------------------------------------------
# Label files, one label per line (read by get_y_data via read_txt).
PATH_Y_TRAIN = './wili-2018/y_train.txt'
PATH_Y_TEST = './wili-2018/y_test.txt'
# Feature tables, ';'-separated CSV (read by get_x_data).
NGRAMS_TRAIN = 'ng_freq_train.csv'
NGRAMS_TEST = 'ng_freq_test.csv'

# --- Network architecture --------------------------------------------------
# Width of each hidden Dense layer, in order.
LAYERS = [1000, 1000]
# Dropout rate applied after the hidden layer with the same index in LAYERS.
DROPOUT = [0.2, 0.8]
# Activation of every hidden layer (the output layer always uses softmax).
ACTIVATION = 'tanh'

# --- Training --------------------------------------------------------------
# Number of epochs passed to model.fit.
EPOCHS = 10
# Early-stopping patience in epochs; None disables early stopping.
EARLY_STOPPING = 3
# File the ModelCheckpoint callback writes the best weights to.
MODEL_PATH = 'li_model.hdf5'

<h2>Import data</h2>

In [None]:
def read_txt(path):
    """Read a UTF-8 text file of labels, one per line.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        Object array with one row per line and a single column holding the
        line's text without its line terminator.
    """
    # The context manager closes the handle even if reading raises.
    with open(path, "r", encoding='utf-8') as handle:
        # Strip the terminator so that a final line lacking a trailing
        # newline yields the same string as the same label elsewhere.
        lines = [line.rstrip('\r\n') for line in handle]
    return pd.DataFrame(lines).values

def get_x_data():
    """Load the feature tables, scale them and carve out a dev set.

    Reads NGRAMS_TRAIN and NGRAMS_TEST as ';'-separated CSV files, fits a
    MinMaxScaler on the training table, applies it to both tables, and
    splits the scaled training table with split_train_dev.

    Returns
    -------
    tuple
        (train_data, dev_data, test_data)
    """
    raw_train = pd.read_csv(NGRAMS_TRAIN, sep=';')
    raw_test = pd.read_csv(NGRAMS_TEST, sep=';')

    # The scaler is fitted on the training table only and reused for test.
    feature_scaler = MinMaxScaler()
    scaled_train = pd.DataFrame(feature_scaler.fit_transform(raw_train))
    scaled_test = pd.DataFrame(feature_scaler.transform(raw_test))

    train_part, dev_part = split_train_dev(scaled_train)
    return train_part, dev_part, scaled_test

def get_y_data():
    """Load the labels, one-hot encode them and carve out a dev set.

    Class indices are assigned from the sorted unique training labels; test
    labels are encoded with the same mapping, so a test label that never
    occurs in training raises KeyError.

    Returns
    -------
    tuple
        (y_train, y_dev, y_test, num_classes) where the first three are
        one-hot arrays and num_classes is the number of distinct training
        labels.
    """
    # NOTE(review): the first line of each label file is dropped, presumably
    # to stay row-aligned with the feature CSVs - confirm against the data.
    train_labels = read_txt(PATH_Y_TRAIN)[1:]
    test_labels = read_txt(PATH_Y_TEST)[1:]

    classes = np.unique(np.array(train_labels))
    num_classes = len(classes)
    class_to_index = dict(zip(classes, np.arange(num_classes)))

    def encode(labels):
        # Each row is a length-1 array holding the label string.
        indices = np.array([class_to_index[row[0]] for row in labels])
        return keras.utils.to_categorical(indices, num_classes)

    y_test = encode(test_labels)
    # Was `split(train_dev(...))`: neither name exists, so it raised NameError.
    y_train, y_dev = split_train_dev(encode(train_labels))
    return y_train, y_dev, y_test, num_classes

def split_train_dev(data, seed=None, train_fraction=0.9):
    """Shuffle the rows of ``data`` and split them into train and dev parts.

    The row order comes from a seeded permutation, so two inputs of equal
    length split with the same seed (e.g. features and their labels) stay
    row-aligned. The input is not modified.

    ``random.shuffle`` is deliberately not used: it swaps elements with
    ``x[i], x[j] = x[j], x[i]``, which addresses columns on a DataFrame and
    duplicates rows on a 2-D ndarray because the row views alias each other.

    Parameters
    ----------
    data : pandas.DataFrame, pandas.Series or array-like
        Rows to split.
    seed : int, optional
        Seed for the permutation; defaults to the module-level SEED.
    train_fraction : float, optional
        Fraction of rows placed in the train part (default 0.9).

    Returns
    -------
    tuple
        (train, dev); pandas inputs give pandas outputs, anything else gives
        numpy arrays.
    """
    if seed is None:
        seed = SEED
    row_count = len(data)
    order = np.random.RandomState(seed).permutation(row_count)
    train_length = int(train_fraction * row_count)
    train_rows, dev_rows = order[:train_length], order[train_length:]

    if hasattr(data, 'iloc'):
        # Positional indexing, independent of the frame's index labels.
        return data.iloc[train_rows], data.iloc[dev_rows]
    values = np.asarray(data)
    return values[train_rows], values[dev_rows]

In [None]:
# Preprocessing: load features and labels, then report the split sizes.
train_data, dev_data, test_data = get_x_data()
y_train, y_dev, y_test, num_classes = get_y_data()

x_shapes = (train_data.shape, dev_data.shape, test_data.shape)
y_shapes = (y_train.shape, y_dev.shape, y_test.shape)
print('Train x shape: {}, dev x shape: {}, test x shape: {}'.format(*x_shapes))
print('Train y shape: {}, dev y shape: {}, test y shape: {}'.format(*y_shapes))

<h2>Model</h2>

In [None]:
def create_model(inp, num_classes):
    """Build and compile the feed-forward classifier.

    One Dense layer is added per entry of LAYERS, each followed by a Dropout
    layer whose rate is the entry of DROPOUT at the same index, then a
    softmax output layer. DROPOUT must therefore have at least as many
    entries as LAYERS.

    Parameters
    ----------
    inp : int
        Number of input features (input dimension of the first layer).
    num_classes : int
        Number of units in the softmax output layer.

    Returns
    -------
    keras.models.Sequential
        The compiled model.
    """
    model = Sequential()
    for i, units in enumerate(LAYERS):
        # Only the first layer declares the input dimension.
        first_layer_args = {'input_dim': inp} if i == 0 else {}
        model.add(Dense(units, activation=ACTIVATION, **first_layer_args))
        model.add(Dropout(DROPOUT[i]))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()
    return model

def train_model(model):
    """Fit ``model`` on the training split, validating on the dev split.

    Reads the module-level train_data, y_train, dev_data and y_dev. A
    ModelCheckpoint callback writes the best weights seen so far to
    MODEL_PATH; an EarlyStopping callback is added when EARLY_STOPPING is
    not None.

    Parameters
    ----------
    model : keras model
        A compiled model, as returned by create_model.

    Returns
    -------
    The object returned by ``model.fit``.
    """
    # NOTE(review): both callbacks monitor "val_acc". Keras >= 2.3 reports the
    # metric under the name given to compile(), i.e. "val_accuracy" for
    # metrics=['accuracy'] - confirm against the installed Keras version.
    callbacks = []
    if EARLY_STOPPING is not None:
        callbacks.append(EarlyStopping(monitor="val_acc", patience=EARLY_STOPPING))
    callbacks.append(ModelCheckpoint(filepath=MODEL_PATH, monitor="val_acc",
                                     save_best_only=True, save_weights_only=True))
    # Return the result of fit() instead of discarding it; callers that
    # ignore the return value are unaffected.
    return model.fit(train_data, y_train, epochs=EPOCHS, verbose=1,
                     validation_data=(dev_data, y_dev), callbacks=callbacks)

In [None]:
# Size the input layer from the number of feature columns, then train.
n_features = train_data.shape[1]
model = create_model(n_features, num_classes)
train_model(model)

In [None]:
# Turn the softmax outputs and the one-hot targets back into class indices.
predict = np.argmax(model.predict(test_data), axis=1)
answer = np.argmax(y_test, axis=1)
# f1_score expects (y_true, y_pred); the arguments used to be passed the other
# way round. Macro F1 gives the same number either way, but other averaging
# modes (e.g. "weighted") would not, so pass them explicitly by keyword.
macro_f1 = f1_score(y_true=answer, y_pred=predict, average="macro")
print('F1-score: {}'.format(macro_f1*100))