# Event detection from Wikipedia pages

In [None]:
!pip install pandarallel sparql-client p_tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import sys
sys.path.append('/content/drive/My Drive/Colab Notebooks/mhc/')

In [4]:
%cd "/content/drive/My Drive/Colab Notebooks/mhc/"

/content/drive/My Drive/Colab Notebooks/mhc


In [5]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
import pandas as pd
import numpy as np

from pathlib import Path
from collections import Counter

import tensorflow as tf

from text import PreProcessing

from sklearn.model_selection import train_test_split
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping

from tqdm import tqdm
import multiprocessing as mp
from multiprocessing import Pool

from p_tqdm import p_map

from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Download Wikipedia pages

In [None]:
from wikiapi import WikiWrapper

In [None]:
q_hist = (
    """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    SELECT DISTINCT ?Event_1
    WHERE { ?Event_1 a dbo:Event .
            ?Event_1 a dbo:MilitaryConflict . }

    LIMIT 1000
    """
)

q_nonhist_1 = (
    """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    SELECT DISTINCT ?Artist_1
    WHERE { ?Artist_1 a dbo:Artist . }

    LIMIT 500
    """
)

q_nonhist_2 = (
    """
    PREFIX dbo: <http://dbpedia.org/ontology/>
    SELECT DISTINCT ?Animal_1
    WHERE { ?Animal_1 a dbo:Animal . }

    LIMIT 500
    """
)

In [None]:
hist_events_name = WikiWrapper.download_pages_name(q_hist)
non_hist_events_name = WikiWrapper.download_pages_name(q_nonhist_1)
non_hist_events_name = non_hist_events_name + WikiWrapper.download_pages_name(q_nonhist_2)

In [None]:
df = pd.DataFrame(hist_events_name, columns=["Name"])
df["Abstract"] = ""
df["Label"] = 1

In [None]:
df1 = pd.DataFrame(non_hist_events_name, columns=["Name"])
df1["Abstract"] = ""
df1["Label"] = 0

In [None]:
df = pd.concat([df, df1])
df.reset_index(inplace=True, drop=True)
df = df.sample(frac=1).reset_index(drop=True)

In [None]:
df["Abstract"] = df.parallel_apply(WikiWrapper.get_extract, axis=1)

In [None]:
df.dropna(subset=["Abstract"], inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
df["Abstract"] = df["Abstract"].parallel_apply(lambda x: x.replace(",", "").replace("|", ""))

In [None]:
df.to_csv("data/wiki/wiki.csv", index=False, sep="|")

## Classification

In [7]:
df = pd.read_csv("data/wiki/wiki.csv", sep="|")

In [8]:
df.Abstract = df.Abstract.parallel_apply(lambda x: PreProcessing.cleanText(x))

In [9]:
df.drop(columns=["Name"], inplace=True)

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df.Abstract.values,
                                                    df.Label.values,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    stratify=df.Label.values)

X_train, X_val, y_train, y_val = train_test_split(X_train,
                                                  y_train,
                                                  test_size=0.2,
                                                  random_state=42,
                                                  stratify=y_train)

In [11]:
BATCH_SIZE = 64
EPOCHS = 50

In [12]:
vectorizer = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(X_train).batch(BATCH_SIZE)
vectorizer.adapt(text_ds)

In [13]:
voc = vectorizer.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

Skip the two cells below if you have already a compressed representation of the embedding matrix

In [None]:
path_to_glove_file = Path("wordemb/glove.840B.300d.txt")

embeddings_index = {}
with open(path_to_glove_file) as f:
    for i, line in enumerate(f):
        if i % 100000 == 0:
            print('- At line {}'.format(i))

        line = line.strip().split()

        if len(line) != 300 + 1:
            continue

        word = line[0]
        coefs = " ".join(line[1:])
        coefs = np.fromstring(coefs, "f", sep=" ")
        embeddings_index[word] = coefs

print("Found %s word vectors." % len(embeddings_index))

- At line 0
- At line 100000
- At line 200000
- At line 300000
- At line 400000
- At line 500000
- At line 600000
- At line 700000
- At line 800000
- At line 900000
- At line 1000000
- At line 1100000
- At line 1200000
- At line 1300000
- At line 1400000
- At line 1500000
- At line 1600000
- At line 1700000
- At line 1800000
- At line 1900000
- At line 2000000
- At line 2100000
Found 2195876 word vectors.


In [None]:
num_tokens = len(voc) + 2
embedding_dim = 300
hits = 0
misses = 0

# Prepare embedding matrix
embeddings = np.zeros((num_tokens, embedding_dim))
for word, i in tqdm(word_index.items()):
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all-zeros.
        # This includes the representation for "padding" and "OOV"
        embeddings[i] = embedding_vector
        hits += 1
    else:
        misses += 1
print("Converted %d words (%d misses)" % (hits, misses))

np.savez_compressed("data/wiki/glove_wiki.npz", embeddings=embeddings)

100%|██████████| 20000/20000 [00:00<00:00, 216934.23it/s]
Converted 15502 words (4498 misses)


Reload your embedding matrix here

In [14]:
embeddings = np.load("data/wiki/glove_wiki.npz")['embeddings']

In [15]:
X_train = vectorizer(np.array([[s] for s in X_train])).numpy()
X_val = vectorizer(np.array([[s] for s in X_val])).numpy()
X_test = vectorizer(np.array([[s] for s in X_test])).numpy()

In [23]:
import tensorflow_addons as tfa

In [36]:
model = tf.keras.Sequential([
    tf.keras.Input(shape=(None,), dtype="int64"),
    tf.keras.layers.Embedding(len(voc) + 2, 300, embeddings_initializer=tf.keras.initializers.Constant(embeddings), trainable=False),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128)),
    tf.keras.layers.Dense(128, activation="relu"),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

es = EarlyStopping(monitor='loss', verbose=1,
                   mode='min', patience = 2, min_delta=0.01)

model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy', tfa.metrics.F1Score(num_classes=2, average="micro")])

In [37]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 300)         6000600   
_________________________________________________________________
bidirectional_6 (Bidirection (None, None, 256)         439296    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 256)               394240    
_________________________________________________________________
dense_6 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 129       
Total params: 6,867,161
Trainable params: 866,561
Non-trainable params: 6,000,600
______________________________________

In [38]:
history = model.fit(X_train, y_train,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    validation_data=(X_val, y_val),
                    callbacks=[es])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 00004: early stopping


In [39]:
model.evaluate(X_test, y_test)



[0.5428334474563599, 0.9526462554931641, 0.6323809623718262]

In [41]:
print(classification_report(y_test, (model.predict(X_test) > 0.5).astype("int32")))

              precision    recall  f1-score   support

           0       0.97      0.94      0.96       193
           1       0.94      0.96      0.95       166

    accuracy                           0.95       359
   macro avg       0.95      0.95      0.95       359
weighted avg       0.95      0.95      0.95       359

