In [1]:
import os
import pathlib

from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet

# Fetch the 'wordnet' NLTK data package so that the wordnet.synsets() lookups
# used throughout this notebook can find the corpus.
nltk.download('wordnet')

def wordnet_pos(w, pos):
    """Return the WordNet synsets of ``w`` whose part of speech equals ``pos``.

    Args:
        w: Term to look up with ``wordnet.synsets``.
        pos: Part-of-speech tag compared against each synset's ``pos()``
            (this notebook passes "n" and "v").

    Returns:
        A list of the matching synsets, in the order WordNet returned them.
    """
    matches = []
    for synset in wordnet.synsets(w):
        if synset.pos() == pos:
            matches.append(synset)
    return matches

import warnings
# Install an "ignore" filter with no message, category or module restriction,
# i.e. every warning is silenced from here on.
# NOTE(review): this presumably also hides pandas warnings raised by later
# cells; consider narrowing the filter to specific categories.
warnings.filterwarnings(action="ignore")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mattb\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Per-article top terms; the first CSV column is used as the index.
top_terms_by_article = pd.read_csv("top_terms_by_mean_weight__articles.csv", index_col=[0])

# Number of rows in which each (society, term) pair appears.
term_counts = (
    top_terms_by_article
    .groupby(["society", "term"])
    .size()
    .rename("n")
    .reset_index()
)

# Keep purely alphabetic terms that are longer than two characters.
is_alphabetic = term_counts["term"].str.isalpha()
is_long_enough = term_counts["term"].str.len() > 2
top_terms = term_counts[is_alphabetic & is_long_enough].reset_index(drop=True)

In [3]:
# One (output column, progress-bar label, counting function) entry per WordNet statistic.
synset_counters = [
    ("n_synsets", "Getting synsets", lambda term: len(wordnet.synsets(term))),
    ("n_nouns", "Getting nouns", lambda term: len(wordnet_pos(term, "n"))),
    ("n_verbs", "Getting verbs", lambda term: len(wordnet_pos(term, "v"))),
]

for column_name, progress_label, count_synsets in synset_counters:
    # tqdm.pandas must be re-registered each time to change the bar description.
    tqdm.pandas(desc=progress_label)
    top_terms[column_name] = top_terms["term"].progress_apply(count_synsets)

Getting synsets: 100%|█████████████████████████████████████████████████████████| 11122/11122 [00:02<00:00, 3749.80it/s]
Getting nouns: 100%|██████████████████████████████████████████████████████████| 11122/11122 [00:00<00:00, 66192.24it/s]
Getting verbs: 100%|██████████████████████████████████████████████████████████| 11122/11122 [00:00<00:00, 66599.92it/s]


In [4]:
from nltk.corpus import stopwords

# The stopword list is a separate NLTK data package from WordNet. Fetch it here so
# this cell also runs on a machine where only 'wordnet' has been downloaded;
# without it stopwords.words() raises LookupError.
nltk.download('stopwords', quiet=True)

swords = set(stopwords.words('english'))

# Keep terms that WordNet knows (at least one synset) but never lists as a noun.
filtered_terms = top_terms[(top_terms.n_synsets > 0) & (top_terms.n_nouns == 0)]

# NOTE(review): this export runs before the stopword filter below, so the CSV
# still contains stopwords; confirm that this is intended.
filtered_terms[["society", "term", "n"]].to_csv("filtered_terms_with_counts.csv", index=False)

# Drop English stopwords. The explicit .copy() makes the result an independent
# frame, so later cells can add columns to it without triggering pandas'
# SettingWithCopyWarning.
filtered_terms = filtered_terms[~filtered_terms.term.isin(swords)].copy()

filtered_terms

Unnamed: 0,society,term,n,n_synsets,n_nouns,n_verbs
6,nonpeaceful,abide,13,2,0,2
8,nonpeaceful,able,216,4,0,0
12,nonpeaceful,abroad,143,4,0,0
15,nonpeaceful,absolutely,28,2,0,0
20,nonpeaceful,abusive,91,2,0,0
...,...,...,...,...,...,...
11081,peaceful,worsen,21,2,0,2
11090,peaceful,write,1080,10,0,10
11092,peaceful,writes,152,10,0,10
11099,peaceful,yeah,68,1,0,0


In [10]:
# Minimum share of a society's total term count a term needs to enter the lexicon.
cutoff = 0.001

# Each term's share of its society's total count. transform("sum") returns the
# group total aligned to the original row index, so the division lines up
# row-for-row on every pandas version (groupby(...).apply prepends the group key
# to the index on pandas >= 2.0, which breaks the column assignment).
society_totals = filtered_terms.groupby("society")["n"].transform("sum")
filtered_terms = filtered_terms.assign(pct_of_group=filtered_terms["n"] / society_totals)
lexicon = filtered_terms[filtered_terms.pct_of_group >= cutoff]

# One row per term, one column per society; a term that did not reach the cutoff
# in a society is NaN in that society's column.
lexicon = lexicon.pivot(index="term", columns="society", values="pct_of_group")
lexicon = lexicon.reset_index()
lexicon = lexicon.rename_axis(None, axis=1)

# Median absolute difference in share over terms present in both societies
# (rows with a NaN side yield a NaN difference, which median() skips).
median_diff = (lexicon.nonpeaceful - lexicon.peaceful).abs().median()
print(median_diff)

def get_lexicon_label(row):
    """Assign a lexicon label to one pivoted term row.

    Args:
        row: Object exposing ``peaceful`` and ``nonpeaceful`` attributes
            (a row of the pivoted ``lexicon`` frame).

    Returns:
        "conflict" if ``peaceful`` is missing, "peace" if ``nonpeaceful`` is
        missing; otherwise the side with the larger value when the absolute
        difference exceeds the module-level ``median_diff``, else "NONE".
    """
    # A term missing from one society belongs to the other society's lexicon.
    if pd.isna(row.peaceful):
        return "conflict"
    if pd.isna(row.nonpeaceful):
        return "peace"

    # Differences that do not exceed the global threshold are not distinctive.
    gap = abs(row.peaceful - row.nonpeaceful)
    if not gap > median_diff:
        return "NONE"

    if row.peaceful > row.nonpeaceful:
        return "peace"
    return "conflict"
    
# Label every term row, then show how many terms fall into each label.
term_labels = lexicon.apply(get_lexicon_label, axis=1)
lexicon["lexicon"] = term_labels
lexicon["lexicon"].value_counts()

0.0005286192281533552


conflict    128
peace       103
NONE         92
Name: lexicon, dtype: int64

In [11]:
# Keep only terms that received a real label, renumber the rows, then order
# the two-column result by label and term.
has_label = lexicon["lexicon"] != "NONE"
attention_lexicon = (
    lexicon.loc[has_label, ["lexicon", "term"]]
    .reset_index(drop=True)
    .sort_values(by=["lexicon", "term"])
)

attention_lexicon.to_csv("attention_lexicon.csv", index=False)
attention_lexicon

Unnamed: 0,lexicon,term
3,conflict,according
4,conflict,accuse
5,conflict,achieve
6,conflict,acquire
9,conflict,adopt
...,...,...
224,peace,vibrant
226,peace,whatever
227,peace,whatsoever
228,peace,wide


In [12]:
# Attach each lexicon term's label to its per-society counts (right join keeps
# every lexicon term), then show the five most frequent lexicon terms per society.
lexicon_term_counts = top_terms.merge(attention_lexicon, on="term", how="right")
lexicon_term_counts = lexicon_term_counts.sort_values(["society", "n"], ascending=False)
top_five_by_society = lexicon_term_counts.groupby("society").apply(
    lambda society_rows: society_rows.nlargest(5, ["n"])
)
top_five_by_society.reset_index(drop=True)

Unnamed: 0,society,term,n,n_synsets,n_nouns,n_verbs,lexicon
0,nonpeaceful,also,1248,1,0,0,conflict
1,nonpeaceful,according,1056,4,0,2,conflict
2,nonpeaceful,new,967,12,0,0,conflict
3,nonpeaceful,defamatory,898,1,0,0,conflict
4,nonpeaceful,around,781,10,0,0,peace
5,peaceful,please,10607,4,0,3,peace
6,peaceful,respectful,3384,2,0,0,peace
7,peaceful,around,3259,10,0,0,peace
8,peaceful,late,2970,11,0,0,peace
9,peaceful,also,2969,1,0,0,conflict


In [14]:
# Recount every (society, term) pair from the raw per-article table.
bad_terms = (
    top_terms_by_article
    .groupby(["society", "term"])
    .size()
    .rename("n")
    .reset_index()
)

tqdm.pandas(desc="Getting synsets")
bad_terms["n_synsets"] = bad_terms["term"].progress_apply(lambda term: len(wordnet.synsets(term)))

# "Bad" terms: purely alphabetic terms for which WordNet returns no synsets.
alphabetic_terms = bad_terms[bad_terms["term"].str.isalpha()]
bad_terms = alphabetic_terms[alphabetic_terms["n_synsets"] <= 0]

# Article-level rows for the bad terms; the join is on "term" only.
bad_term_articles = bad_terms.merge(top_terms_by_article, on="term", how="left")

bad_terms.to_csv("bad_terms.csv", index=False)
bad_terms

Getting synsets: 100%|████████████████████████████████████████████████████████| 11802/11802 [00:00<00:00, 63452.15it/s]


Unnamed: 0,society,term,n,n_synsets
213,nonpeaceful,abbott,1,0
215,nonpeaceful,abdul,75,0
216,nonpeaceful,abia,120,0
227,nonpeaceful,abu,41,0
228,nonpeaceful,abubakar,398,0
...,...,...,...,...
11762,peaceful,would,2221,0
11773,peaceful,xbox,41,0
11789,peaceful,you,2782,0
11792,peaceful,your,283,0
