In [1]:
import os
import pathlib
import warnings
warnings.filterwarnings(action="ignore")

import pprint
import itertools
from tqdm import tqdm

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from utils.reading import read_all_files, get_text
from utils.common import get_country_labels, get_society_label

# Folder holding the cleaned article sample.
# The location differs per machine, so it can be overridden with the
# PEACE_SPEECH_CLEAN_DATA environment variable; when that is unset the original
# hard-coded Windows path is used, which keeps existing runs unchanged.
# Another path previously used here:
#   os.path.join("/Users", "mmackenzie", "Data", "peace-speech-project", "clean_sample")
CLEAN_DATA_FOLDER = os.environ.get(
    "PEACE_SPEECH_CLEAN_DATA",
    r"C:\Users\mattb\Documents\GitHub\peace-speech-project\data\clean_sample",
)
CLEAN_DATA_FOLDER

'C:\\Users\\mattb\\Documents\\GitHub\\peace-speech-project\\data\\clean_sample'

In [None]:
# Alternative loader built on the project helpers; presumably it reads the
# individual files under CLEAN_DATA_FOLDER and then attaches their text
# (helper behaviour is not visible here — TODO confirm).
# NOTE(review): this cell has no execution count, and both `articles` and
# `articles_with_text` are reassigned by later cells (from all_data.csv), so it
# appears to be unused in the recorded run.
articles = read_all_files(path=CLEAN_DATA_FOLDER)
articles_with_text = get_text(articles, path=CLEAN_DATA_FOLDER)

In [3]:
%%time

# Flatten the value lists of the mapping returned by get_country_labels()
# into one flat list of labels.
countries = [label for labels in get_country_labels().values() for label in labels]

# Read the combined sample and keep only the rows whose `text` is present.
articles_with_text = pd.read_csv("data/clean_sample/all_data.csv").dropna(subset=["text"])

# Number of articles per country, for a quick sanity check.
articles_with_text["country"].value_counts()

Wall time: 33.6 s


US    110197
IN    109777
AU    109475
CA    109119
GB    107366
NZ    107278
ZA    105460
IE    102635
PH     95348
NG     79354
MY     59589
SG     56211
KE     56132
PK     55157
JM     52523
GH     35686
BD     23400
TZ     16861
LK     14676
HK      5069
Name: country, dtype: int64

In [4]:
country_labels = get_country_labels()

# Label every article with a society, derived from its country via the project helper.
articles_with_text["society"] = articles_with_text["country"].apply(
    lambda country: get_society_label(country, country_labels=country_labels)
)

# Optional filter, currently disabled.
# NOTE(review): the recorded output of this cell lists fewer countries than the
# load cell does, which suggests the filter was active when the output was captured.
# articles_with_text = articles_with_text[articles_with_text.society != "Other"]

# Metadata-only copy (everything except the article text).
articles = articles_with_text.drop(columns="text")

articles["country"].value_counts()

AU    109475
CA    109119
GB    107366
NZ    107278
IE    102635
NG     79354
SG     56211
KE     56132
PK     55157
BD     23400
TZ     16861
Name: country, dtype: int64

In [5]:
def read_lexicon(lexicon):
    """Load one "enh" lexicon spreadsheet as a table of tagged terms.

    Reads ``lexicons/enh_<lexicon>_lexicon.xlsx``, naming its single column
    ``term``, and adds a ``lexicon`` column filled with the given name.

    Parameters
    ----------
    lexicon : str
        Lexicon name used both in the file name and as the tag value.

    Returns
    -------
    pandas.DataFrame
        Columns ``term`` and ``lexicon``.
    """
    workbook_path = os.path.join("lexicons", "enh_{}_lexicon.xlsx".format(lexicon))
    terms = pd.read_excel(workbook_path, names=["term"])
    return terms.assign(lexicon=lexicon)

# "Old" lexicon: one spreadsheet per category, each tagged with its category name.
old_lexicon = pd.concat([read_lexicon(name) for name in ["peace", "conflict", "resilience"]])
old_lexicon["version"] = "old"

# "New" lexicon, read from a CSV of top words.
new_lexicon = pd.read_csv("lexicons/top_words_lexicon.csv")
new_lexicon["version"] = "new"

lexicon = pd.concat([old_lexicon, new_lexicon]).reset_index(drop=True)

# Normalise terms: lower-case, trim, and collapse runs of whitespace to a single
# space. Without this a stray leading/trailing/double space inflates `num_words`,
# hides duplicates, and produces vocabulary entries that can never match the
# single-space-joined n-grams emitted by CountVectorizer.
lexicon["term"] = lexicon["term"].str.lower().str.split().str.join(" ")
lexicon = lexicon.drop_duplicates(["lexicon", "version", "term"]).reset_index(drop=True)

# Word count per term; used later to keep only terms short enough for the n-gram range.
lexicon["num_words"] = lexicon["term"].str.split(" ").str.len()
lexicon.to_csv("lexicons/lexicon.csv", index=False)

lexicon

Unnamed: 0,term,lexicon,version,num_words
0,academic freedom,peace,old,2
1,acceptance,peace,old,1
2,accepting,peace,old,1
3,active listening,peace,old,2
4,activism,peace,old,1
...,...,...,...,...
2363,develop,conflict,new,1
2364,land,conflict,new,1
2365,source,conflict,new,1
2366,attack,conflict,new,1


In [6]:
# Keep the distinct terms of at most two words, matching the ngram_range=(1, 2)
# passed to the vectorizer.
unique_terms = lexicon.query("num_words <= 2")["term"].unique()
unique_terms.shape

(2091,)

In [7]:
# NOTE(review): this import sits mid-notebook; consider moving it into the
# import cell at the top so all dependencies are visible in one place.
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer restricted to the lexicon terms: `vocabulary` fixes the feature set
# to `unique_terms`, and ngram_range=(1, 2) covers one- and two-word terms.
cv = CountVectorizer(ngram_range=(1, 2), vocabulary=unique_terms)

In [8]:
%%time
# Count lexicon-term occurrences in every article. The result is a sparse matrix
# with one row per article and one column per vocabulary term (see the displayed
# shape); the recorded run took several minutes.
word_counts = cv.fit_transform(articles_with_text.text)
word_counts

Wall time: 6min 33s


<822988x2091 sparse matrix of type '<class 'numpy.int64'>'
	with 26514820 stored elements in Compressed Sparse Row format>

In [9]:
# Release the frame that carries the article text; later cells only use
# `word_counts` and the text-free `articles` frame.
del articles_with_text
# NOTE(review): leftover from re-running cells; counts_df is not defined yet at
# this point in a top-to-bottom run.
# del counts_df

In [10]:
%%time
# Term names in matrix-column order. scikit-learn 1.0 introduced
# get_feature_names_out() and 1.2 removed get_feature_names(), so use the new
# accessor when it exists and fall back to the old one otherwise.
if hasattr(cv, "get_feature_names_out"):
    term_columns = list(cv.get_feature_names_out())
else:
    term_columns = cv.get_feature_names()

# Dense article-by-term table. toarray() gives a plain ndarray, whereas todense()
# returns the discouraged np.matrix type.
# NOTE: this materialises the whole matrix in memory (rows x terms integers).
counts_df = pd.DataFrame(word_counts.toarray(), columns=term_columns)

Wall time: 4.05 s


In [11]:
# Copy the article metadata onto the counts table by position (`.values` ignores
# the index, so row order in `articles` must match the rows of `word_counts`).
# NOTE(review): if the vocabulary contains a term literally named "society",
# "country" or "year", this assignment overwrites that term's count column —
# confirm no such term exists.
for metadata_column in ("society", "country", "year"):
    counts_df[metadata_column] = articles[metadata_column].values

In [12]:
# The metadata columns have been copied into counts_df, so this frame can be released.
del articles

In [14]:
# Persist the per-article counts (term columns plus society/country/year) as CSV.
# NOTE(review): counts_df is a dense frame with one row per article, so the
# resulting file is large and slow to write.
counts_df.to_csv("data/word_counts_by_article.csv", index=False)

In [13]:
# Term names in column order. scikit-learn 1.0 introduced get_feature_names_out()
# and 1.2 removed get_feature_names(), so use whichever accessor is available.
if hasattr(cv, "get_feature_names_out"):
    term_columns = list(cv.get_feature_names_out())
else:
    term_columns = cv.get_feature_names()

# Total occurrences of each term per (society, country, year), reshaped to long
# format with one row per group/term pair. After stack() the unnamed term level
# comes out of reset_index() as "level_3", hence the rename to "term".
counts_df_summary = (
    counts_df
    .groupby(["society", "country", "year"])[term_columns]
    .sum()
    .stack()
    .rename("n")
    .reset_index()
    .rename(columns={"level_3": "term"})
)

In [19]:
# Attach lexicon metadata to every term total and export the result.
# `term` alone is not unique in `lexicon` (uniqueness is per lexicon/version/term),
# so the left join can produce several rows for one (society, country, year, term).
# NOTE(review): this file is written to the working directory, unlike the other
# exports, which go under data/.
(
    pd.merge(counts_df_summary, lexicon, how="left", on="term")
    .drop_duplicates()
    .to_csv("lexicon_tf_by_country_year.csv", index=False)
)

In [20]:
%%time
# Term-by-term Pearson correlation of per-article counts: each column of the
# dense matrix is one variable (a term), each row one observation (an article).
# NOTE: toarray() materialises the full dense matrix in memory.
corrcoef = np.corrcoef(word_counts.toarray(), rowvar=False)

Wall time: 1min 18s


In [32]:
# Term names in matrix order. scikit-learn 1.0 introduced get_feature_names_out()
# and 1.2 removed get_feature_names(), so use whichever accessor is available.
if hasattr(cv, "get_feature_names_out"):
    term_labels = list(cv.get_feature_names_out())
else:
    term_labels = cv.get_feature_names()

# Reshape the square correlation matrix into long (term1, term2, corr) rows,
# drop the self-pairs on the diagonal, and export.
# NOTE(review): stack() drops NaN entries by default, so terms with undefined
# correlations (e.g. never occurring) would silently disappear — confirm intended.
(
    pd.DataFrame(corrcoef, columns=term_labels, index=term_labels)
    .stack()
    .rename("corr")
    .reset_index()
    .rename(columns={"level_0": "term1", "level_1": "term2"})
    .query("term1 != term2")
    .to_csv("data/1m_sample_term_corr.csv", index=False)
)