In [1]:
import os
import pathlib
import warnings
warnings.filterwarnings(action="ignore")

import pprint
import itertools
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from utils.reading import read_all_files, get_text
from utils.common import get_country_labels, get_society_label

# Folder holding the cleaned article sample. The location is machine-specific, so it
# can be overridden without editing the notebook by setting the
# PEACE_SPEECH_CLEAN_DATA environment variable, for example to
#   /Users/mmackenzie/Data/peace-speech-project/clean_sample
# When the variable is not set, the original Windows path below is used unchanged.
CLEAN_DATA_FOLDER = os.environ.get(
    "PEACE_SPEECH_CLEAN_DATA",
    r"C:\Users\mattb\Documents\GitHub\peace-speech-project\data\clean_sample",
)
CLEAN_DATA_FOLDER

'C:\\Users\\mattb\\Documents\\GitHub\\peace-speech-project\\data\\clean_sample'

In [None]:
# Load the article table from the files under CLEAN_DATA_FOLDER, then pass it to
# get_text together with the same folder (presumably to attach each article's text —
# confirm against utils.reading).
# NOTE(review): this cell has no execution count and the next cell reassigns
# `articles_with_text` from data/clean_sample/all_data.csv, so this looks like an
# alternative loading path kept for reference — confirm before running it.
articles = read_all_files(path=CLEAN_DATA_FOLDER)
articles_with_text = get_text(articles, path=CLEAN_DATA_FOLDER)

In [2]:
%%time

# Flatten the values of the country-label mapping into a single list.
countries = list(itertools.chain.from_iterable(get_country_labels().values()))

# Read the pre-built sample and discard rows whose text is missing.
articles_with_text = pd.read_csv("data/clean_sample/all_data.csv")
articles_with_text = articles_with_text[articles_with_text["text"].notna()]

# Number of remaining articles per country.
articles_with_text["country"].value_counts()

Wall time: 24.2 s


US    110197
IN    109777
AU    109475
CA    109119
GB    107366
NZ    107278
ZA    105460
IE    102635
PH     95348
NG     79354
MY     59589
SG     56211
KE     56132
PK     55157
JM     52523
GH     35686
BD     23400
TZ     16861
LK     14676
HK      5069
Name: country, dtype: int64

In [3]:
country_labels = get_country_labels()

# Derive a society label for every article from its country.
articles_with_text["society"] = articles_with_text["country"].apply(
    get_society_label, country_labels=country_labels
)
# Articles labelled "Other" are kept at this stage (the filter below is disabled):
# articles_with_text = articles_with_text[articles_with_text.society != "Other"]

# Metadata-only frame: everything except the article text.
articles = articles_with_text.drop(columns="text")

articles["society"].value_counts()

Peaceful       592084
Other          588325
Nonpeaceful    230904
Name: society, dtype: int64

In [5]:
# Preview the peace vocabulary file: the first row of the file is skipped and the two
# columns are named "index" and "term". The frame is only displayed, not assigned.
pd.read_csv("lexicons/peace_vocab_rationale.csv", names=["index", "term"], skiprows=1)

Unnamed: 0,index,term
0,0,ngati
1,1,iona
2,2,vg
3,3,mcstay
4,4,kaipara
...,...,...
7075,7075,inishmaan
7076,7076,peka
7077,7077,leck
7078,7078,eth


In [4]:
def read_lexicon(lexicon):
    """Load one spreadsheet lexicon and tag every row with its lexicon name.

    Parameters
    ----------
    lexicon : str
        Lexicon name; the workbook read is ``lexicons/enh_<lexicon>_lexicon.xlsx``.

    Returns
    -------
    pandas.DataFrame
        A ``term`` column read from the workbook plus a constant ``lexicon``
        column holding the given name.
    """
    workbook = os.path.join("lexicons", f"enh_{lexicon}_lexicon.xlsx")
    terms = pd.read_excel(workbook, names=["term"])
    return terms.assign(lexicon=lexicon)

# --- Version 1: the three spreadsheet lexicons, tagged "Original" ---------------------
old_lexicon = pd.concat([read_lexicon(name) for name in ["peace", "conflict", "resilience"]])
old_lexicon["version"] = "Original"

# --- Version 2: lexicon read from top_words_lexicon.csv, tagged "Term Frequency" ------
# NOTE(review): assumed to already provide "term" and "lexicon" columns, since both are
# used on the combined frame below — confirm against the file.
tf_lexicon = pd.read_csv("lexicons/top_words_lexicon.csv")
tf_lexicon["version"] = "Term Frequency"

# --- Version 3: lexicon read from attention_lexicon.csv, tagged "Attention Layer" -----
attention_lexicon = pd.read_csv("lexicons/attention_lexicon.csv")
# Drop the first column, reshape wide -> long, then drop the row-index level so that
# each row is (source column name, cell value).
attention_lexicon = attention_lexicon.iloc[:, 1:].stack().reset_index().iloc[:, 1:]
attention_lexicon.columns = ["lexicon", "term"]
# Translate the source column names into lexicon names; any other name raises KeyError.
attention_lexicon["lexicon"] = attention_lexicon.lexicon.apply(lambda x: {"Peaceful": "peace", "Non Peaceful": "conflict"}[x])
# Each cell is parsed as a bracketed, ", "-separated list of single-quoted terms:
# strip the brackets, split into items, emit one row per item, strip the quotes.
attention_lexicon["term"] = attention_lexicon.term.str.strip('][').str.split(', ')
attention_lexicon = attention_lexicon.explode("term")
attention_lexicon["term"] = attention_lexicon.term.str.strip('\'')

attention_lexicon["version"] = "Attention Layer"

# --- Combine the three versions -------------------------------------------------------
lexicon = pd.concat([
    old_lexicon,
    tf_lexicon,
    attention_lexicon
]).reset_index(drop=True)

# Lower-case the terms, then keep one row per (lexicon, version, term).
lexicon["term"] = lexicon.term.str.lower()
lexicon = lexicon.drop_duplicates(["lexicon", "version", "term"]).reset_index(drop=True)

# Number of space-separated words in each term.
lexicon["num_words"] = lexicon.term.str.split(" ").str.len()
lexicon.to_csv("lexicons/lexicon.csv", index=False)

lexicon

Unnamed: 0,term,lexicon,version,num_words
0,academic freedom,peace,Original,2
1,acceptance,peace,Original,1
2,accepting,peace,Original,1
3,active listening,peace,Original,2
4,activism,peace,Original,1
...,...,...,...,...
2827,podcast,conflict,Attention Layer,1
2828,menace,conflict,Attention Layer,1
2829,pdp,conflict,Attention Layer,1
2830,aspirant,conflict,Attention Layer,1


In [5]:
# Distinct lexicon terms that are at most two words long.
unique_terms = lexicon.loc[lexicon["num_words"] <= 2, "term"].unique()
unique_terms.shape

(2389,)

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Count unigrams and bigrams, with the vocabulary fixed to the lexicon terms in
# `unique_terms` rather than learned from the articles.
cv = CountVectorizer(ngram_range=(1, 2), vocabulary=unique_terms)

In [7]:
%%time
# Build the article-by-term count matrix from the article text
# (one row per article, one column per vocabulary term).
word_counts = cv.fit_transform(articles_with_text.text)
word_counts

Wall time: 6min 51s


<1411313x2389 sparse matrix of type '<class 'numpy.int64'>'
	with 48877502 stored elements in Compressed Sparse Row format>

In [8]:
# Drop the full-text frame now that the counts are computed (presumably to free
# memory); the metadata-only `articles` frame is still needed further down.
del articles_with_text
# del counts_df

In [9]:
%%time
# Column labels for the count matrix. `get_feature_names()` was removed from
# scikit-learn in 1.2, so prefer `get_feature_names_out()` and only fall back to the
# old method on releases that predate it.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Densify the sparse counts into a DataFrame with one column per term.
# toarray() yields a plain ndarray (todense() yields the legacy np.matrix type).
counts_df = pd.DataFrame(word_counts.toarray(), columns=feature_names)

Wall time: 7.76 s


In [10]:
# Append the article metadata as the last three columns. Values are copied
# positionally (via .values), so row order must match between the two frames.
for _meta_col in ("society", "country", "year"):
    counts_df[_meta_col] = articles[_meta_col].values

In [11]:
# The metadata columns have been copied into counts_df, so the metadata frame is
# no longer referenced below.
del articles

In [12]:
# Full export: every article.
counts_df.to_csv("data/word_counts_by_article_ALL.csv", index=False)

# Second export: the same table without articles whose society label is "Other".
counts_df[counts_df["society"] != "Other"].to_csv(
    "data/word_counts_by_article_NO_OTHER.csv", index=False
)

In [3]:
# Reload the per-article counts from the CSV written by the export cell, so the
# summaries below can be rebuilt without recomputing the counts.
counts_df = pd.read_csv("data/word_counts_by_article_ALL.csv")

In [52]:
# Term columns of counts_df: everything except the last three columns, which hold the
# society / country / year metadata appended after the counts.
features = counts_df.columns[:-3]


def make_count_summary(group, df, features=features, lexicon=lexicon):
    """Total the term counts of one group of articles and save them as a CSV.

    Parameters
    ----------
    group : tuple
        Group key in the order ``(society, country, year)``.
    df : pandas.DataFrame
        The rows of the per-article count table that belong to ``group``.
    features : sequence of str, optional
        Term columns to total. Defaults to the module-level ``features`` as it was
        when this function was defined.
    lexicon : pandas.DataFrame, optional
        Lexicon table joined on ``term``; must contain ``term``, ``lexicon`` and
        ``version`` columns. Defaults to the module-level ``lexicon`` as it was when
        this function was defined.

    Returns
    -------
    pandas.DataFrame
        One row per (lexicon, version, term) with the total count ``n``, the group's
        society / country / year, and the lexicon columns.

    Side effects
    ------------
    Writes the summary to ``data/word_counts/tf_by_year__<COUNTRY>_<YEAR>.csv``,
    creating the ``data/word_counts`` directory if it does not exist yet.
    """
    summary = (
        df[features]
        .sum(axis=0)
        .rename("n")
        .reset_index()
        .rename(columns={"index": "term"})
        .assign(society=group[0], country=group[1], year=group[2])
        # A term can belong to several lexicons/versions, so the left join may fan
        # out; keep a single row per (lexicon, version, term).
        .merge(lexicon, on="term", how="left")
        .drop_duplicates(["lexicon", "version", "term"])
    )

    out_path = f"data/word_counts/tf_by_year__{'_'.join([str(g) for g in group[1:]]).upper()}.csv"
    # to_csv does not create missing parent directories; make sure the folder exists
    # so the function also works on a fresh checkout.
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    summary.to_csv(out_path, index=False)
    return summary

In [53]:
# One summary frame per (society, country, year) group; each call also writes that
# group's CSV as a side effect.
grouped_counts = counts_df.groupby(["society", "country", "year"])
summaries = [make_count_summary(key, frame) for key, frame in tqdm(grouped_counts)]

count_df_summary = pd.concat(summaries)

100%|████████████████████████████████████████████████████████████████████████████████| 220/220 [03:21<00:00,  1.09it/s]


In [55]:
# Save the concatenated per-group summaries as a single CSV (row index omitted).
count_df_summary.to_csv("data/tf_by_country_by_year.csv", index=False)

In [56]:
# The summaries have been written to disk; drop both references.
del summaries, count_df_summary

In [13]:
%%time
# Term-by-term Pearson correlation over articles: rowvar=False treats each column
# (term) of the dense article-by-term matrix as a variable.
corrcoef = np.corrcoef(word_counts.toarray(), rowvar=False)

Wall time: 3min 54s


In [14]:
# Term labels for both axes of the correlation matrix. `get_feature_names()` was
# removed from scikit-learn in 1.2, so prefer `get_feature_names_out()` and only fall
# back to the old method on releases that predate it.
if hasattr(cv, "get_feature_names_out"):
    corr_terms = cv.get_feature_names_out()
else:
    corr_terms = cv.get_feature_names()

# Reshape the square matrix into long (term1, term2, corr) rows, drop the
# self-correlations on the diagonal, and save the result.
pd.DataFrame(corrcoef, columns=corr_terms, index=corr_terms) \
    .stack() \
    .rename("corr") \
    .reset_index() \
    .rename(columns={"level_0": "term1",
                     "level_1": "term2"}) \
    .query("term1 != term2") \
    .to_csv("data/sample_term_corr.csv", index=False)

In [15]:
# Sanity check: (number of articles, number of vocabulary terms).
word_counts.shape

(1411313, 2389)