In [None]:
# Mount Google Drive at /content/drive so later cells can reach files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import re
import sys
from collections import defaultdict

# The `imp` module was removed in Python 3.12, so importing it unconditionally
# breaks this cell on current interpreters. Take reload() from importlib when
# it is available and fall back to `imp` only where importlib lacks it.
try:
    from importlib import reload
except ImportError:
    from imp import reload

# Python 2 only: reload sys so setdefaultencoding is reachable, then make
# UTF-8 the default string encoding.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf-8")

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
import nltk
from nltk.tokenize import word_tokenize

# Fetch the NLTK corpora imported just below: WordNet and the stopword lists.
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import wordnet
from nltk.corpus import stopwords

def wordnet_pos(w, pos):
    """Return the WordNet synsets of `w` whose part-of-speech tag equals `pos`.

    Args:
        w: word to look up with `wordnet.synsets`.
        pos: part-of-speech tag compared against each synset's `pos()`.

    Returns:
        List of matching synsets, in the order WordNet returned them
        (empty when nothing matches).
    """
    matches = []
    for synset in wordnet.synsets(w):
        if synset.pos() == pos:
            matches.append(synset)
    return matches

import warnings
# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation / chained-assignment
# warnings raised by later cells; consider narrowing the filter.
warnings.filterwarnings(action="ignore")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
%%time

# Project root on the mounted Drive; input and output paths are built from it.
BASE_DIR = "/content/drive/MyDrive/peace-speech-project/"
# Input CSV of per-article terms. Presumably ranked by mean attention weight,
# going by the file name — TODO confirm against the notebook that writes it.
TERMS_PATH = os.path.join(BASE_DIR, "Attention Layer Lexicon", "top_terms_by_mean_weight__articles.csv")

# The first CSV column is used as the DataFrame index.
top_terms_by_article = pd.read_csv(TERMS_PATH, index_col=[0])

CPU times: user 29 s, sys: 4.56 s, total: 33.6 s
Wall time: 48 s


In [None]:
def get_top_terms(terms, n):
  """Count how often each term is among the first `n` rows of an article, per society.

  Args:
    terms: DataFrame with at least `article_id`, `society` and `term` columns.
      The first `n` rows of each article are kept in their existing order, so
      this presumably expects rows pre-sorted by weight — TODO confirm.
    n: number of rows to keep per `article_id`.

  Returns:
    DataFrame with one row per (society, term) and the columns `society`,
    `term`, `n` (number of kept rows for that pair), `n_synsets`, `n_nouns`
    and `n_verbs` (WordNet synset counts: total, tagged "n", tagged "v").
    Only purely alphabetic terms longer than two characters are kept.
  """
  top_terms = terms.groupby("article_id").head(n)
  top_terms = top_terms.groupby(["society", "term"]).size().rename("n").reset_index()

  is_word = top_terms.term.str.isalpha() & (top_terms.term.str.len() > 2)
  top_terms = top_terms[is_word].reset_index(drop=True)

  # Query WordNet once per distinct term and derive all three counts from that
  # single lookup, instead of three lookups per (society, term) row.
  pos_tags = {
      term: [synset.pos() for synset in wordnet.synsets(term)]
      for term in top_terms.term.unique()
  }

  top_terms["n_synsets"] = top_terms.term.apply(lambda x: len(pos_tags[x]))
  top_terms["n_nouns"] = top_terms.term.apply(lambda x: pos_tags[x].count("n"))
  top_terms["n_verbs"] = top_terms.term.apply(lambda x: pos_tags[x].count("v"))

  return top_terms

def filter_terms(terms):
  """Keep terms that WordNet knows but never lists as a noun, minus English stopwords.

  Side effect: before the stopword filter is applied, the `society`, `term`
  and `n` columns of the intermediate result are written to
  "filtered_terms_with_counts.csv" in the current working directory.

  Args:
    terms: DataFrame with `society`, `term`, `n`, `n_synsets` and `n_nouns`
      columns.

  Returns:
    The rows of `terms` with `n_synsets > 0`, `n_nouns == 0`, and a `term`
    that is not in NLTK's English stopword list. All input columns are kept.
  """
  swords = set(stopwords.words('english'))

  known_non_noun = (terms.n_synsets > 0) & (terms.n_nouns == 0)
  filtered_terms = terms[known_non_noun]
  filtered_terms[["society", "term", "n"]].to_csv("filtered_terms_with_counts.csv", index=False)

  # isin() always produces a boolean mask. The previous element-wise apply()
  # gave an object-dtype Series on an empty frame, which `~` / [] then treated
  # as a column selection and returned a frame without any columns.
  is_stopword = filtered_terms.term.isin(swords)

  return filtered_terms[~is_stopword]

def get_lexicon_label(row, median_diff):
  """Assign a lexicon label to one term from its per-society shares.

  Args:
    row: object exposing `peaceful` and `nonpeaceful` attributes (for example
      a row of the pivoted lexicon frame); either may be NaN.
    median_diff: threshold the absolute difference must strictly exceed.

  Returns:
    "conflict" when `peaceful` is missing, "peace" when only `nonpeaceful` is
    missing, otherwise the side with the larger share if the absolute
    difference is greater than `median_diff`, else "NONE".
  """
  # A term missing from one society belongs outright to the other one.
  if pd.isna(row.peaceful):
    return "conflict"
  if pd.isna(row.nonpeaceful):
    return "peace"

  peace_share = row.peaceful
  conflict_share = row.nonpeaceful
  if abs(peace_share - conflict_share) > median_diff:
    if peace_share > conflict_share:
      return "peace"
    return "conflict"

  return "NONE"

def generate_lexicon(terms, cutoff = 0.001):
  """Build a per-term table of society shares and label each term.

  Args:
    terms: DataFrame with `society`, `term` and `n` columns, one row per
      (society, term). Not modified; a copy is used internally.
    cutoff: minimum share of a society's total `n` a term needs to be kept.

  Returns:
    DataFrame with one row per term and the columns `term`, one share column
    per society (`peaceful`, `nonpeaceful`; NaN where the term is absent or
    below the cutoff for that society) and `lexicon`, the label produced by
    `get_lexicon_label` using the median absolute share difference of terms
    present in both societies as threshold.

  Raises:
    KeyError: if a kept row has a society other than "peaceful" or
      "nonpeaceful".
  """
  known_societies = ("peaceful", "nonpeaceful")

  terms = terms.copy()
  # transform() returns a result aligned to the original index. A group-wise
  # apply() returns a group-keyed MultiIndex on pandas >= 2.0, which cannot be
  # assigned back as a column.
  terms["pct_of_group"] = terms.n / terms.groupby("society").n.transform("sum")
  lexicon = terms[terms.pct_of_group >= cutoff]

  # Only these two societies can be labelled; reject anything else up front.
  unknown = sorted(set(lexicon.society) - set(known_societies))
  if unknown:
    raise KeyError(f"unexpected society value(s): {unknown}")

  lexicon = lexicon.pivot(index="term", columns="society", values="pct_of_group")
  lexicon = lexicon.reset_index()
  lexicon = lexicon.rename_axis(None, axis = 1)

  # A society with no term above the cutoff has no pivot column; add it as
  # all-NaN so the labelling step can still read both attributes.
  for society in known_societies:
    if society not in lexicon.columns:
      lexicon[society] = np.nan

  # median() skips NaN, so only terms present in both societies contribute.
  median_diff = (lexicon.nonpeaceful - lexicon.peaceful).abs().median()
  lexicon["lexicon"] = lexicon.apply(get_lexicon_label, axis=1, median_diff=median_diff)

  return lexicon

def clean_lexicon(lexicon, add_society=False):
  """Reduce a labelled lexicon to its (lexicon, term) pairs, dropping "NONE" rows.

  Args:
    lexicon: DataFrame with `lexicon` and `term` columns.
    add_society: when true, append a `society` column holding "peaceful" for
      the "peace" label and "nonpeaceful" for every other label.

  Returns:
    New DataFrame with `lexicon` and `term` (plus `society` if requested),
    sorted by `lexicon` then `term`. The index is renumbered before sorting
    and is not reset afterwards.
  """
  labelled = lexicon.lexicon != "NONE"
  pairs = lexicon.loc[labelled, ["lexicon", "term"]].reset_index(drop=True)
  pairs = pairs.sort_values(by=["lexicon", "term"])

  if not add_society:
    return pairs

  pairs["society"] = [
      "peaceful" if label == "peace" else "nonpeaceful"
      for label in pairs.lexicon
  ]
  return pairs

def normalize_lexicon(lexicon, terms_weighting, norm=100):
  """Keep at most `norm` terms per lexicon label, preferring the highest counts.

  Args:
    lexicon: DataFrame with `lexicon`, `term` and `society` columns.
    terms_weighting: DataFrame with `society`, `term` and `n` columns; joined
      onto `lexicon` by (society, term) with a left join.
    norm: maximum number of terms kept for each `lexicon` value.

  Returns:
    DataFrame with `lexicon` and `term` columns and a fresh 0..k-1 index,
    ordered by `lexicon` descending and then by `n` descending.
  """
  weighted = pd.merge(lexicon, terms_weighting, how="left", on=["society", "term"])

  ranked = weighted[["lexicon", "term", "n"]].sort_values(
      by=["lexicon", "n"], ascending=False
  )
  # head() keeps the sorted order, so each label retains its top-`norm` rows.
  capped = ranked.groupby("lexicon").head(norm).reset_index(drop=True)

  return capped[["lexicon", "term"]]

def run_lexicon(terms, n, norm=100):
  """Run the whole lexicon pipeline for one per-article term limit.

  Chains `get_top_terms`, `filter_terms`, `generate_lexicon` (default
  cutoff), `clean_lexicon` (with the society column) and
  `normalize_lexicon`.

  Args:
    terms: per-article term table passed to `get_top_terms`.
    n: number of rows kept per article.
    norm: maximum number of terms per label in the normalized lexicon.

  Returns:
    Tuple `(top_terms, filtered_terms, lexicon, norm_lexicon)` holding the
    output of each stage; `lexicon` is the cleaned lexicon.
  """
  counted = get_top_terms(terms, n)
  kept = filter_terms(counted)

  labelled = clean_lexicon(generate_lexicon(kept), add_society=True)
  capped = normalize_lexicon(labelled, kept, norm=norm)

  return counted, kept, labelled, capped

def save_output(n, top_terms, filtered_terms, lexicon, norm_lexicon):
  """Write the four pipeline outputs for limit `n` as CSV files.

  Files are written without the index to
  `<BASE_DIR>/Attention Layer Lexicon/Sensitivity/` and are named
  `top_terms__{n}.csv`, `filtered_terms__{n}.csv`, `lexicon__{n}.csv` and
  `norm_lexicon__{n}.csv`. Existing files with those names are overwritten.

  Args:
    n: per-article term limit, used in the file names.
    top_terms, filtered_terms, lexicon, norm_lexicon: DataFrames to save.
  """
  BASE_PATH = os.path.join(BASE_DIR, "Attention Layer Lexicon", "Sensitivity")
  # Create the output folder on first use so a finished run is not lost to a
  # missing directory.
  os.makedirs(BASE_PATH, exist_ok=True)

  outputs = {
      "top_terms": top_terms,
      "filtered_terms": filtered_terms,
      "lexicon": lexicon,
      "norm_lexicon": norm_lexicon,
  }
  for stem, frame in outputs.items():
    frame.to_csv(os.path.join(BASE_PATH, f"{stem}__{n}.csv"), index=False)

In [None]:
# Sensitivity sweep: rebuild the lexicon for several per-article term limits
# `n` and save each run's outputs under file names that include `n`.
for n in tqdm([1, 5, 10, 25, 50, 100]):
  top_terms, filtered_terms, lexicon, norm_lexicon = run_lexicon(top_terms_by_article, n)
  save_output(n, top_terms, filtered_terms, lexicon, norm_lexicon)

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))


