In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the data and
# model files used later in this notebook are read from paths under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import re
import sys
from collections import defaultdict

# `imp` was removed from the standard library in Python 3.12, so importing it
# unconditionally makes this cell fail on current runtimes. Prefer
# `importlib.reload` and fall back to `imp` only where importlib lacks it
# (Python 2). The name `reload` stays available either way.
try:
    from importlib import reload
except ImportError:
    from imp import reload

import warnings
warnings.filterwarnings('ignore')

# Python 2 only: switch the interpreter-wide default string encoding to UTF-8.
# `sys` is reloaded first so that `setdefaultencoding` can be called.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Concatenate, Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, Flatten
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Convolution1D
from tensorflow.keras import initializers, regularizers, constraints, optimizers, layers

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [4]:
from typing import List

def get_society_label(country: str, peaceful: List[str], nonpeaceful: List[str]) -> str:
  """Classify a country code against two lists of codes.

  Args:
    country: Country code to classify.
    peaceful: Codes that should be labelled "peaceful".
    nonpeaceful: Codes that should be labelled "nonpeaceful".

  Returns:
    "peaceful" if `country` is in `peaceful`, otherwise "nonpeaceful" if it is
    in `nonpeaceful`, otherwise "other". The peaceful list is checked first.
  """
  labelled_groups = (("peaceful", peaceful), ("nonpeaceful", nonpeaceful))
  for label, members in labelled_groups:
    if country in members:
      return label
  return "other"

def country_is_peaceful(society):
  """Return True when the society label equals "peaceful", ignoring letter case."""
  normalized_label = society.lower()
  return normalized_label == "peaceful"

# Project root on the mounted Drive and the two country groupings.
BASE_DIR = "/content/drive/MyDrive/peace-speech-project/"

peaceful_countries = ['GB', 'AU', 'CA', 'SG', 'NZ', 'IE']
non_peaceful_countries = ['PK', 'BD', 'NG', 'KE', 'ZA', 'TZ']

# Read the article table (first CSV column is the index), discard rows with
# any missing value, and move the old index into a regular column.
data_file_path = os.path.join(BASE_DIR, "data", "domestic_articles__ngram__stopwords__lemmatized.csv")
articles = pd.read_csv(data_file_path, index_col=[0])
articles = articles.dropna().reset_index()

# Label every article by the group its country belongs to, plus a boolean flag.
articles["society"] = articles["country"].apply(
    lambda code: get_society_label(
        code,
        peaceful=peaceful_countries,
        nonpeaceful=non_peaceful_countries,
    )
)
articles["is_peaceful"] = articles["society"].map(country_is_peaceful)

# Keep only articles from countries in one of the two groups.
in_either_group = articles["society"] != "other"
articles = articles.loc[in_either_group].copy()

print(articles.shape)
articles

(417941, 8)


Unnamed: 0,index,article_id,country,publisher,year,article_text,society,is_peaceful
0,0,71409778,AU,perthnow.com.au,2019,Labor continue pursuit Angus Taylor Federal La...,peaceful,True
1,1,71101824,AU,perthnow.com.au,2019,For many year South Australia unenviable reput...,peaceful,True
2,2,71512141,AU,perthnow.com.au,2019,Jamie Maclaren open join Melbourne City tough ...,peaceful,True
3,3,71147035,AU,perthnow.com.au,2019,Perth man hang arm drug gang Rio favela Topics...,peaceful,True
4,4,71206307,AU,perthnow.com.au,2019,MP allegedly told fake donor lie ICAC Dominica...,peaceful,True
...,...,...,...,...,...,...,...,...
656179,656870,14014747,TZ,Daily News | The National Newspaper (press rel...,2016,Principal Resident Magistrate Dr Yohana Yongol...,nonpeaceful,False
656180,656871,14806484,TZ,Daily News | The National Newspaper (press rel...,2016,FULLY FLEDGED Yoga Instructors Susan Tabula fa...,nonpeaceful,False
656181,656872,9047182,TZ,Daily News | The National Newspaper (press rel...,2016,THE government divulge content report Judicial...,nonpeaceful,False
656182,656873,13915423,TZ,Daily News | The National Newspaper (press rel...,2016,Ambassador Egypt Tanzania Mr Yasser Elshawaf t...,nonpeaceful,False


In [5]:
%%time

# Sizes shared by the tokenizer, sequence padding and the network definition.
MAX_FEATURES = 6000
EMBED_SIZE = 128
RNN_CELL_SIZE = 32
MAX_LEN = 371

# Fit the Keras tokenizer on every article text.
tokenizer = Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(articles['article_text'])

# Invert the tokenizer's word -> index mapping so integer ids can be turned
# back into terms.
reverse_word_index = {
    token_id: term for term, token_id in tokenizer.word_index.items()
}

CPU times: user 1min 28s, sys: 131 ms, total: 1min 29s
Wall time: 1min 29s


In [6]:
class Attention(tf.keras.Model):
    """Additive attention that pools a sequence of features into one vector.

    Each time step of `features` is scored against `hidden`; the scores are
    normalised with a softmax over the time axis and used to form a weighted
    sum of `features`.
    """

    def __init__(self, units):
        super(Attention, self).__init__()
        # W1 projects the sequence features and W2 the hidden state to `units`
        # dimensions; V maps the combined projection to one score per step.
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return `(context_vector, attention_weights)` for one batch."""
        # (batch_size, hidden size) -> (batch_size, 1, hidden size), so the
        # projected state can be added to every time step of the features.
        query = tf.expand_dims(hidden, 1)

        # Combined projection, shape (batch_size, max_length, units).
        projected_features = self.W1(features)
        projected_query = self.W2(query)
        energies = tf.nn.tanh(projected_features + projected_query)

        # One logit per time step, normalised across the time axis:
        # attention_weights shape == (batch_size, max_length, 1).
        logits = self.V(energies)
        attention_weights = tf.nn.softmax(logits, axis=1)

        # Weighted sum over time, shape (batch_size, hidden_size).
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

# Metric objects handed to `model.compile` in `build_model`.
# The list is module-level, so every model compiled with it receives these
# same metric instances.
METRICS = [
      # Confusion-matrix counts.
      keras.metrics.TruePositives(name='tp'),
      keras.metrics.FalsePositives(name='fp'),
      keras.metrics.TrueNegatives(name='tn'),
      keras.metrics.FalseNegatives(name='fn'), 
      # Summary scores for the binary classifier.
      keras.metrics.BinaryAccuracy(name='accuracy'),
      keras.metrics.Precision(name='precision'),
      keras.metrics.Recall(name='recall'),
      keras.metrics.AUC(name='auc'),
]

def build_model(return_attention=False):
  """Build and compile the attention-based binary classifier.

  Architecture, in order: an int32 input of length MAX_LEN, an Embedding of
  MAX_FEATURES x EMBED_SIZE, two stacked bidirectional LSTMs of
  RNN_CELL_SIZE units, an `Attention(10)` layer over the second LSTM's
  sequence output, then Dense(20, relu) -> Dropout(0.05) -> Dense(1, sigmoid).

  Args:
    return_attention: When true, also return a second model that maps the
      same input to the attention weights.

  Returns:
    The compiled classifier, or a `(model, attention_model)` tuple when
    `return_attention` is true. `attention_model` is built from the same
    input and layers as `model` and is not compiled here.
  """
  sequence_input = Input(shape=(MAX_LEN,), dtype="int32")
  embedded_sequences = Embedding(MAX_FEATURES, EMBED_SIZE)(sequence_input)

  # First recurrent layer: emits the full sequence for the next layer.
  lstm = Bidirectional(LSTM(RNN_CELL_SIZE, return_sequences = True), name="bi_lstm_0")(embedded_sequences)

  # Getting our LSTM outputs
  # Second recurrent layer: besides the sequence output it also returns the
  # final hidden (h) and cell (c) states of both directions.
  (lstm, forward_h, forward_c, backward_h, backward_c) = Bidirectional(
      LSTM(
          RNN_CELL_SIZE, 
          return_sequences=True, 
          return_state=True
      ), name="bi_lstm_1"
  )(lstm)


  # Join the forward and backward final states.
  # NOTE(review): state_c is computed but not used anywhere below.
  state_h = Concatenate()([forward_h, backward_h])
  state_c = Concatenate()([forward_c, backward_c])

  # Attend over the LSTM sequence output, using the joined hidden state.
  context_vector, attention_weights = Attention(10)(lstm, state_h)
  attention_model = keras.Model(inputs=sequence_input, outputs=attention_weights)  ## Attention Model

  # Classification head on top of the attention context vector.
  dense1 = Dense(20, activation="relu")(context_vector)
  dropout = Dropout(0.05)(dense1)
  output = Dense(1, activation="sigmoid")(dropout)

  model = keras.Model(inputs=sequence_input, outputs=output)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=METRICS)

  if return_attention:
    return model, attention_model
  else:
    return model

In [7]:
# Build both models, then load their saved weights from the lexicon folder.
model, attention_model = build_model(return_attention=True)

weights_dir = os.path.join(BASE_DIR, "Attention Layer Lexicon")
model.load_weights(os.path.join(weights_dir, "attention_base_model.h5"))
attention_model.load_weights(os.path.join(weights_dir, "attention_model.h5"))

In [8]:
def get_attention_weights(keras_tokenizer, attention_model, article_text, reverse_word_index, maxlen=MAX_LEN):
  """Collect the attention weight of every term occurrence in one article.

  The text is converted to a padded id sequence of length `maxlen`, passed
  through `attention_model`, and each position's weight is grouped under the
  term its token id maps to.

  Args:
    keras_tokenizer: Tokenizer providing `texts_to_sequences`.
    attention_model: Model whose `predict` output for one sequence can be
      reshaped to `(1, maxlen)`.
    article_text: Text of the article.
    reverse_word_index: Dict mapping token id -> term.
    maxlen: Sequence length used for padding and for the reshape.

  Returns:
    A DataFrame with one row per distinct term and the columns `term`,
    `weights` (list of that term's weights), `n_weights`, `max_weight`,
    `mean_weight`, `var_weight` and `median_weight`. It has no rows when no
    position maps to a term.
  """
  article_sequence = pad_sequences(keras_tokenizer.texts_to_sequences([article_text]), maxlen=maxlen)
  # verbose=0: this function is called once per article, and a progress bar
  # printed by every predict call would flood the notebook output.
  attention_weights_array = attention_model.predict(article_sequence, verbose=0).reshape(1, maxlen)

  token_ids = article_sequence[0]
  position_weights = attention_weights_array[0]

  attention_weights_dict = defaultdict(list)
  for token_id, weight in zip(token_ids, position_weights):
    # Positions whose id has no term in the reverse index (e.g. the filler
    # value added by pad_sequences) are skipped explicitly, instead of
    # swallowing every exception raised inside the loop.
    term = reverse_word_index.get(int(token_id))
    if term is None:
      continue
    attention_weights_dict[term].append(weight)

  weights_df = pd.DataFrame(list(attention_weights_dict.items()), columns=["term", "weights"])

  # Per-term summary statistics over the collected weights.
  weights_df["n_weights"] = weights_df.weights.apply(lambda x: len(x))
  weights_df["max_weight"] = weights_df.weights.apply(lambda x: np.max(x))
  weights_df["mean_weight"] = weights_df.weights.apply(lambda x: np.mean(x))
  weights_df["var_weight"] = weights_df.weights.apply(lambda x: np.var(x))
  weights_df["median_weight"] = weights_df.weights.apply(lambda x: np.median(x))

  return weights_df

def apply_get_attention_weights(tbl, n, field):
  """Return the `n` highest-ranked terms for the first article of a group.

  Only the first row of `tbl` is used. Its `article_text` is scored with the
  notebook-level `tokenizer`, `attention_model` and `reverse_word_index`.

  Args:
    tbl: DataFrame holding the rows of one group; needs an `article_text` column.
    n: Number of terms to keep.
    field: Column of the per-term weight table to rank by, descending.

  Returns:
    A DataFrame with the columns `term` and `field`, at most `n` rows.
  """
  first_article = tbl.iloc[0]
  term_weights = get_attention_weights(
      keras_tokenizer=tokenizer,
      attention_model=attention_model,
      article_text=first_article["article_text"],
      reverse_word_index=reverse_word_index,
  )

  ranked = term_weights.sort_values(by=field, ascending=False)
  return ranked[["term", field]].head(n)

In [None]:
# Register `progress_apply` on pandas objects so the long run shows progress.
tqdm.pandas()

# For every article group, keep its 5 terms with the highest mean attention
# weight; the unnamed inner index level produced by the apply is dropped.
article_group_keys = ["article_id", "society", "country", "year"]
top_terms_by_article = (
    articles
    .groupby(article_group_keys)
    .progress_apply(apply_get_attention_weights, n=5, field="mean_weight")
    .reset_index()
    .drop(columns="level_4")
)

articles_csv_path = os.path.join(BASE_DIR, "Attention Layer Lexicon", "top_terms_by_mean_weight__articles.csv")
top_terms_by_article.to_csv(articles_csv_path)
top_terms_by_article

HBox(children=(FloatProgress(value=0.0, max=402150.0), HTML(value='')))

Buffered data was truncated after reaching the output size limit.

In [None]:
# Count how often each term appears among the per-article top terms, per society.
term_counts = (
    top_terms_by_article
    .groupby(["society", "term"])
    .size()
    .rename("n")
    .reset_index()
)

# Keep the 200 most frequent terms of each society.
top_terms = (
    term_counts
    .groupby("society")
    .apply(lambda society_terms: society_terms.nlargest(200, ["n"]))
    .reset_index(drop=True)
)

agg_csv_path = os.path.join(BASE_DIR, "Attention Layer Lexicon", "top_terms_by_mean_weight__agg.csv")
top_terms.to_csv(agg_csv_path)
top_terms

Unnamed: 0,society,term,n
0,nonpeaceful,nigeria,6857
1,nonpeaceful,kenya,4559
2,nonpeaceful,'s,3276
3,nonpeaceful,nigerian,2681
4,nonpeaceful,abuja,2561
...,...,...,...
395,peaceful,future,1315
396,peaceful,issue,1314
397,peaceful,statement,1309
398,peaceful,big,1301
