In [26]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to the local utils.* helpers take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [205]:
import os
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from numpy/pandas/sklearn — consider
# narrowing it with `category=` / `module=` once the noisy source is known.
warnings.filterwarnings(action="ignore")

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

from utils.reading import read_all_files, get_text
from utils.common import get_country_labels, get_society_label

# Folder holding the cleaned article sample that the loading cells read from.
# Set the PEACE_SPEECH_CLEAN_DATA environment variable to run the notebook on
# another machine; when it is unset the original hard-coded location is used,
# so existing behaviour is unchanged.
CLEAN_DATA_FOLDER = os.environ.get(
    "PEACE_SPEECH_CLEAN_DATA",
    os.path.join("/Users", "mmackenzie", "Data", "peace-speech-project", "clean_sample"),
)

Matplotlib is building the font cache; this may take a moment.


In [217]:
# Draw a 20,000-article sample from the CA, IN and PK data under
# CLEAN_DATA_FOLDER, then run get_text on it; later cells read a `text` column
# from the resulting `articles_with_text` frame.
# The recorded run spent roughly two minutes in the "Getting text" step.
# NOTE(review): no seed is passed here, so whether the sample is reproducible
# depends on read_all_files — confirm.
articles = read_all_files(path=CLEAN_DATA_FOLDER, countries=["CA", "IN", "PK"], sample=20_000)
articles_with_text = get_text(articles, path=CLEAN_DATA_FOLDER)

Finding files...: 100%|██████████| 3/3 [00:05<00:00,  1.70s/it]



274,550 articles found.
Sampling 20000 random articles.



Getting details: 100%|██████████| 20000/20000 [00:06<00:00, 3167.40it/s]
Getting text: 100%|██████████| 20000/20000 [02:09<00:00, 154.44it/s]


In [218]:
# Attach a `society` label to every article, derived from its `country` value
# via the project's country-label lookup.
country_labels = get_country_labels()
articles_with_text["society"] = articles_with_text["country"].apply(
    get_society_label, country_labels=country_labels
)

# Quick check: frame dimensions, then the number of articles per society label.
print(articles_with_text.shape)
articles_with_text["society"].value_counts()

(20000, 9)


Peaceful       8101
Other          7864
Nonpeaceful    4035
Name: society, dtype: int64

In [239]:
from sklearn.feature_extraction.text import TfidfVectorizer

def combine_text_by_group(text_df, group):
    """Merge the ``text`` of all rows sharing a ``group`` key into one string.

    Parameters
    ----------
    text_df : pandas.DataFrame
        Frame with a ``text`` column plus the column(s) named in ``group``.
    group : str or list of str
        Column name(s) handed to ``DataFrame.groupby``.

    Returns
    -------
    pandas.DataFrame
        One row per group: the group column(s) followed by ``text``, in which
        the group's texts are concatenated with a blank line between them.
    """
    def _join_texts(texts):
        # Series.str.cat without `others` collapses the Series into one string.
        return texts.str.cat(sep="\n\n")

    text_per_group = text_df.groupby(group)["text"]
    combined = text_per_group.apply(_join_texts)
    return combined.reset_index()
    
def fit_tfidf(text, binary=True, min_df=5, max_df=20):
    """Create a TF-IDF vectorizer with English stop words and fit it to ``text``.

    Parameters
    ----------
    text : iterable of str
        Documents used to learn the vocabulary and IDF weights.
    binary, min_df, max_df
        Forwarded unchanged to ``TfidfVectorizer``.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer.
    """
    vectorizer_options = {
        "stop_words": "english",
        "binary": binary,
        "min_df": min_df,
        "max_df": max_df,
    }
    vectorizer = TfidfVectorizer(**vectorizer_options)
    # fit() returns the estimator itself, so it can be returned directly.
    return vectorizer.fit(text)

def get_top_n_terms(text, tfidf, n=10):
    """Return the ``n`` highest-weighted terms of each document with their scores.

    Parameters
    ----------
    text : iterable of str
        Documents to score with the already-fitted vectorizer.
    tfidf : fitted vectorizer
        Object exposing ``transform`` and ``get_feature_names_out`` (or the
        older ``get_feature_names``).
    n : int, default 10
        Number of terms to keep per document. If the vocabulary has fewer
        than ``n`` terms, all of them are returned.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``top_values`` (scores rounded to 4 decimals) and ``top_terms``, both
        of shape ``(n_documents, n)`` and sorted by descending score, with
        ``top_values[i, j]`` being the score of ``top_terms[i, j]``.
    """
    # Densify once and reuse it for both outputs.
    scores = tfidf.transform(text).toarray()

    # One descending ordering drives both outputs. Previously the values came
    # from np.partition (whose top-n slice is unordered) while the terms came
    # from argsort, so the two arrays did not line up. A stable sort makes
    # ties resolve deterministically by vocabulary position.
    order = np.argsort(-scores, axis=1, kind="stable")[:, :n]
    top_values = np.take_along_axis(scores, order, axis=1)

    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement but keep working with older releases.
    if hasattr(tfidf, "get_feature_names_out"):
        vocabulary = tfidf.get_feature_names_out()
    else:
        vocabulary = tfidf.get_feature_names()
    terms = np.asarray(vocabulary, dtype=str)

    return np.around(top_values, 4), terms[order]

def create_top_n_terms_df(text_df, top_terms, top_values):
    """Expand per-document top terms into a long-format frame.

    Parameters
    ----------
    text_df : pandas.DataFrame
        One row per document; it is not modified.
    top_terms : array-like of str, shape (n_documents, n_terms)
        Selected terms for each row of ``text_df``.
    top_values : array-like of float, shape (n_documents, n_terms)
        Score of each entry in ``top_terms``.

    Returns
    -------
    pandas.DataFrame
        ``n_documents * n_terms`` rows: every row of ``text_df`` repeated once
        per term (its original index label is kept), followed by the columns
        ``term`` and ``value``. ``value`` keeps its numeric dtype.

    Raises
    ------
    ValueError
        If the two arrays are not 2-D with identical shapes, or their row
        count differs from ``len(text_df)``.
    """
    terms = np.asarray(top_terms)
    values = np.asarray(top_values)

    if terms.ndim != 2 or terms.shape != values.shape:
        raise ValueError(
            "top_terms and top_values must be 2-D arrays of the same shape; "
            f"got {terms.shape} and {values.shape}"
        )
    n_docs, n_terms = terms.shape
    if n_docs != len(text_df):
        raise ValueError(
            f"expected one row of terms per row of text_df ({len(text_df)}), got {n_docs}"
        )

    # Repeat each document row once per term. This replaces stacking terms and
    # values into one array, which coerced the scores to strings.
    row_positions = np.repeat(np.arange(n_docs), n_terms)
    df_long = text_df.iloc[row_positions].copy()

    # Row-major flattening matches the repeated rows: doc 0's terms first, etc.
    df_long["term"] = terms.ravel()
    df_long["value"] = values.ravel()

    return df_long

### By Society

In [247]:
%%time 

# Collapse the articles into one long document per society label, then fit
# TF-IDF on those group documents (the society column showed three labels).
# min_df=1 / max_df=2 are integer document counts: a term is kept when it
# occurs in one or two of the group documents and dropped when it occurs in
# more than two, which removes vocabulary shared by every group.
# binary=False overrides fit_tfidf's default of binary=True.
df = combine_text_by_group(articles_with_text, ["society"])
tfidf = fit_tfidf(df.text, min_df=1, max_df=2, binary=False)

CPU times: user 4.82 s, sys: 125 ms, total: 4.94 s
Wall time: 4.97 s


In [251]:
%%time

# Score each group document with the fitted vectorizer and keep its 20
# highest-weighted terms, then reshape to a long frame with one row per
# (group, term). The `text` column is dropped first so the large combined
# documents are not repeated on every exploded row.
top_values, top_terms = get_top_n_terms(df.text, tfidf, 20)
top_terms_df = create_top_n_terms_df(df.drop("text", axis=1), top_terms, top_values)

CPU times: user 4.34 s, sys: 93.4 ms, total: 4.43 s
Wall time: 4.46 s


In [250]:
# Display the selected terms: one row per row of `df` (i.e. per society
# group), 20 terms each.
top_terms

array([['lahore', 'balochistan', 'peshawar', 'bhutto', 'zardari', 'asif',
        'musharraf', 'mqm', 'rehman', 'rupee', 'fbr', 'pakhtunkhwa',
        'chaudhry', 'nisar', 'quetta', 'benazir', 'tehreek', 'wapda',
        'urdu', 'baloch'],
       ['tnn', 'rupee', 'ishant', 'dhoni', 'gambhir', 'telangana',
        'bcci', 'odi', 'goa', 'haryana', 'sebi', 'chhattisgarh', 'nagar',
        'rohit', 'jharkhand', 'madhya', 'shiv', 'mishra', 'kejriwal',
        'bse'],
       ['nhl', 'quebec', 'edmonton', 'manitoba', 'winnipeg', 'rcmp',
        'toolon', 'trudeau', 'uviews', 'scotia', 'harper', 'halifax',
        'oiler', 'ont', 'saskatchewan', 'sudbury', 'cannabis', 'puck',
        'canuck', 'brunswick']], dtype='<U40')