In [16]:
import json
import requests
import os
from pathlib import Path

In [17]:
def data_from_json(file_path):
    """Load a JSON file and return its parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The object produced by ``json.load`` for the file content.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without ``encoding`` the result of
    # ``open`` depends on the platform locale, which can corrupt or reject
    # non-ASCII text on some systems.
    with open(file_path, encoding="utf-8") as json_file:
        return json.load(json_file)
    
def get_data(event_dir, datum):
    """Collect one field from every record of a JSON file.

    The file is loaded with ``data_from_json`` and is indexed as a mapping
    of ``key -> record``, where each record is itself indexed by ``datum``.

    Args:
        event_dir: Path of the JSON file to read (despite the name, it is
            passed straight to ``data_from_json`` as a file path).
        datum: Name of the field to extract from each record.

    Returns:
        A list with ``record[datum]`` for every top-level key, in the
        iteration order of the loaded mapping.

    Raises:
        KeyError: If a record does not contain ``datum``.
    """
    data = data_from_json(event_dir)
    return [data[key][datum] for key in data]

In [18]:
# Texts of the training file; every one of them is labelled 1 below.
tr_data = get_data('train.json', 'text')

# Development set: texts and their labels, read from the same file.
dev_data = get_data('dev.json', 'text')
dev_label = get_data('dev.json', 'label')

# Unlabelled texts (only the 'text' field is read).
test_data = get_data('test-unlabelled.json', 'text')

# Additional entries loaded as-is from articles.json; all are labelled 0.
# Presumably a list of raw text strings (it is later concatenated with
# tr_data) -- TODO confirm the file's schema.
# UTF-8 is requested explicitly so decoding does not depend on the
# platform locale.
with open('articles.json', encoding='utf-8') as f:
    tr_scr_data = json.load(f)

# Labels for tr_data followed by tr_scr_data, in that order: 1 for every
# train.json text and 0 for every articles.json entry. Despite its name,
# tr_label therefore covers both sources.
tr_label = [1] * len(tr_data) + [0] * len(tr_scr_data)

In [20]:
from keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

# Binary training corpus: tr_data followed by tr_scr_data, with labels in
# the matching order.
bi_tr_data = tr_data + tr_scr_data
bi_tr_label = tr_label

# Vocabulary built from the whole binary corpus; words not seen here are
# mapped to the "<UNK>" token.
# NOTE(review): the corpus used for fitting includes the rows that end up
# in the held-out split (x_tr_dev) below.
tokenizer = Tokenizer(oov_token="<UNK>")
tokenizer.fit_on_texts(bi_tr_data)

# Stratified 80/20 split of the raw texts with a fixed seed.
x_tr_tr, x_tr_dev, y_tr_tr, y_tr_dev = train_test_split(
    bi_tr_data,
    bi_tr_label,
    test_size=0.2,
    random_state=1015,
    stratify=bi_tr_label,
)

# TF-IDF matrices for every text collection. The tuple of inputs is built
# before any name is rebound, so x_tr_tr / x_tr_dev on the right-hand side
# are still the raw text splits; afterwards both names hold matrices.
x_tr_all, x_tr_tr, x_tr_dev, x_dev, x_comp = (
    tokenizer.texts_to_matrix(texts, mode="tfidf")
    for texts in (bi_tr_data, x_tr_tr, x_tr_dev, dev_data, test_data)
)

In [21]:
# Number of columns of the TF-IDF matrix; reused as the input width of the
# dense network and as the Embedding input_dim further down.
vocab_size = x_tr_all.shape[1]
print("Vocab size = {}".format(vocab_size))

Vocab size = 49674


# Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Baseline: logistic regression with default settings, trained on the
# TF-IDF matrix of the full binary corpus and scored on the dev set.
classifier = LogisticRegression()
classifier.fit(x_tr_all, bi_tr_label)
y_pred = classifier.predict(x_dev)

acc = accuracy_score(dev_label, y_pred)
# f1_score is called with its default averaging, i.e. the F1 of the
# positive class (label 1) -- the same metric the neural-network cells
# report. The previous "Macro F1" label did not match what is computed.
f1 = f1_score(dev_label, y_pred)

print('Accuracy  is: ' + str(acc))
print('F1 (positive class)  is: ' + str(f1))

Accuracy  is: 0.8
Macro F1  is: 0.7959183673469388


# Feed-Forward NN

In [23]:
from keras.models import Sequential
from keras import layers

# Feed-forward classifier over the TF-IDF bag-of-words vectors:
# three hidden ReLU layers of 100 units and a single sigmoid output unit.
model = Sequential(name="feedforward-bow-input")
# Only the first layer declares the input width. The hidden layers take
# their input size from the previous layer's output (100 units), so
# repeating input_dim=vocab_size on them was redundant and misleading.
model.add(layers.Dense(100, input_dim=vocab_size, activation='relu'))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

# Binary classification problem, hence the binary cross-entropy loss.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "feedforward-bow-input"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 100)               4967500   
_________________________________________________________________
dense_7 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_8 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 101       
Total params: 4,987,801
Trainable params: 4,987,801
Non-trainable params: 0
_________________________________________________________________


In [24]:
# Train on the TF-IDF training split and monitor the held-out split.
model.fit(
    x_tr_tr,
    y_tr_tr,
    validation_data=(x_tr_dev, y_tr_dev),
    epochs=10,
    batch_size=10,
    verbose=True,
)

# Score on the dev set using the metric compiled into the model.
loss, accuracy = model.evaluate(x_dev, dev_label, verbose=False)

# Turn the sigmoid outputs (one value per row) into hard 0/1 predictions
# by rounding.
y_pred1 = model.predict(x_dev)
y_pred = [int(round(row[0])) for row in y_pred1]

# Same dev-set evaluation recomputed with scikit-learn on the rounded
# predictions.
acc1 = accuracy_score(dev_label, y_pred)
f1_1 = f1_score(dev_label, y_pred)

print("\nTesting models accuracy :  {:.4f}".format(accuracy))
print("\nTesting acc:  {:.4f}".format(acc1))
print("\nTesting f1:  {:.4f}".format(f1_1))

Train on 1824 samples, validate on 457 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing models accuracy :  0.8200

Testing acc:  0.8200

Testing f1:  0.8163


# LSTM NN

In [29]:
# Redo the split on the raw texts: x_tr_tr / x_tr_dev were rebound to
# TF-IDF matrices earlier, and the sequence model needs the original
# strings. The arguments and seed are the same as in the first split.
x_tr_tr, x_tr_dev, y_tr_tr, y_tr_dev = train_test_split(
    bi_tr_data,
    bi_tr_label,
    test_size=0.2,
    random_state=1015,
    stratify=bi_tr_label,
)

# Encode each text collection as lists of word indices using the already
# fitted tokenizer. xseq_test holds the dev-set texts and xseq_comp the
# unlabelled texts.
xseq_train, xseq_dev, xseq_test, xseq_comp = (
    tokenizer.texts_to_sequences(texts)
    for texts in (x_tr_tr, x_tr_dev, dev_data, test_data)
)

In [None]:
from keras.preprocessing.sequence import pad_sequences

# Fixed sequence length shared by all inputs of the sequence model.
# NOTE(review): 818 is hard-coded and its origin is not shown here --
# TODO confirm how it was chosen.
maxlen = 818

# Bring every index sequence to exactly maxlen entries, padding at the
# end. Sequences longer than maxlen are shortened with the library's
# default truncation side (not overridden here).
xseq_train, xseq_dev, xseq_test, xseq_comp = (
    pad_sequences(seqs, padding='post', maxlen=maxlen)
    for seqs in (xseq_train, xseq_dev, xseq_test, xseq_comp)
)

In [31]:
from keras.layers import LSTM

# Size of the learned word vectors.
embedding_dim = 500

# Sequence model: unlike the bag-of-words network, this architecture
# keeps the word order. An embedding layer feeds four stacked LSTM layers
# of 100 units; the first three return the full sequence so the next LSTM
# can consume it, and the last returns only its final state for the
# sigmoid output unit.
# NOTE(review): mask_zero is not set on the Embedding, so the padded
# positions are processed like any other index.
model3 = Sequential(
    [
        layers.Embedding(
            input_dim=vocab_size,
            output_dim=embedding_dim,
            input_length=maxlen,
        ),
        LSTM(100, return_sequences=True),
        LSTM(100, return_sequences=True),
        LSTM(100, return_sequences=True),
        LSTM(100),
        layers.Dense(1, activation='sigmoid'),
    ],
    name="lstm",
)
model3.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model3.summary()

Model: "lstm"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 818, 500)          24837000  
_________________________________________________________________
lstm_9 (LSTM)                (None, 818, 100)          240400    
_________________________________________________________________
lstm_10 (LSTM)               (None, 818, 100)          80400     
_________________________________________________________________
lstm_11 (LSTM)               (None, 818, 100)          80400     
_________________________________________________________________
lstm_12 (LSTM)               (None, 100)               80400     
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 101       
Total params: 25,318,701
Trainable params: 25,318,701
Non-trainable params: 0
__________________________________________________

In [32]:
# Train on the padded training sequences and monitor the held-out split.
model3.fit(
    xseq_train,
    y_tr_tr,
    validation_data=(xseq_dev, y_tr_dev),
    epochs=10,
    batch_size=100,
    verbose=True,
)

# Score on the dev set (xseq_test holds the padded dev-set sequences).
loss, accuracy = model3.evaluate(xseq_test, dev_label, verbose=False)

# Round the sigmoid outputs (one value per row) to hard 0/1 predictions.
y_pred1 = model3.predict(xseq_test)
y_pred = [int(round(row[0])) for row in y_pred1]

acc2 = accuracy_score(dev_label, y_pred)
f1_2 = f1_score(dev_label, y_pred)

print("\nTesting acc:  {:.4f}".format(acc2))
print("\nTesting f1:  {:.4f}".format(f1_2))

Train on 1824 samples, validate on 457 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Testing acc:  0.6300

Testing acc:  0.6300

Testing f1:  0.7218
