<a href="https://colab.research.google.com/github/muziejus/21F-UP206A/blob/master/src/notebooks/frequency_counts_of_keywords_by_congress.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Frequency Counts of GOPAC Keywords

by Moacir P. de Sá Pereira

This notebook counts the frequencies of the GOPAC keywords across the 85th–114th Congresses.



In [30]:
# Import libraries

import json
import pickle
import pandas as pd
from tqdm.notebook import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import defaultdict

In [31]:
# Download NLTK data: the Punkt tokenizer resources used by word_tokenize.
# NOTE(review): both 'punkt' and 'punkt_tab' are fetched, presumably so the
# notebook works across NLTK releases -- confirm which one is actually needed.

nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [32]:
# Initialize stemmer
# One PorterStemmer instance is shared by every keyword-stemming step below.
# NOTE(review): presumably the same stemmer that produced the corpus
# vocabulary -- confirm, otherwise keyword stems will not match.

stemmer = PorterStemmer()

In [33]:
# Initialize GOPAC keywords.
#
# Single words are kept as space-separated strings and multi-word phrases as
# lists. Spelling matters: each word is stemmed and looked up in the corpus
# vocabulary, so a misspelled keyword silently matches nothing (or only
# matches the same misspelling in the speeches).

good_words = "share change opportunity legacy challenge control truth moral courage reform prosperity crusade movement children family debate compete active we candid humane pristine provide liberty commitment principle unique duty precious premise care tough listen learn help lead vision success empower citizen activist mobilize conflict light dream freedom peace rights pioneer proud building preserve proflag prochildren proenvironment reform workfare strength choice fair protect confident incentive initiative passionate"
good_n_grams = ["eliminate good-time in prison", "hard work", "common sense"]
bad_words = "decay failure collapse deeper crisis urgent destructive destroy sick pathetic lie liberal they betray consequences limit shallow traitors sensationalists endanger coercion hypocrisy radical threaten devour waste corruption incompetent impose selfserving greed ideological insecure antiflag antifamily antichild antijobs pessimistic excuses intolerant stagnation welfare corrupt selfish insensitive mandate taxes spend shame disgrace punish bizarre cynicism cheat steal machine bosses obsolete patronage"
bad_n_grams = ["unionized bureaucracy", "compassion is not enough", "permissive attitude", "status quo", "abuse of power", "criminal rights"]


In [34]:
# The n-grams are not handled yet; for now only the single-word lists are
# tokenized, lowercased, and reduced to their stems in one pass.

good_tokens = [stemmer.stem(token.lower()) for token in word_tokenize(good_words)]
bad_tokens = [stemmer.stem(token.lower()) for token in word_tokenize(bad_words)]

In [35]:
# Set read/write paths
#
# NOTE(review): these assume Google Drive is already mounted at
# /content/drive; no mount call is visible in this notebook -- confirm.

data_path_prefix = "/content/drive/MyDrive/gentzkow-et-al-congressional-record-corpus/"

# The prefix already ends in "/", so strip it before joining to avoid
# producing ".../corpus//tokenized-and-stemmed/..." style paths.
_data_root = data_path_prefix.rstrip("/")
integer_path = f"{_data_root}/tokenized-and-stemmed/integer-encoded"
output_path = f"{_data_root}/eda-outputs"

In [36]:
# Read the vocabulary table stored next to the integer-encoded speeches
# (vocabulary.parquet under integer_path).

vocab_df = pd.read_parquet(f"{integer_path}/vocabulary.parquet")

In [37]:
# Convert the vocab to a dictionary
# Series.to_dict() maps index label -> value, and later cells look tokens up
# as keys and use the values as integer ids.
# NOTE(review): so this assumes the frame is indexed by stem with the id in
# column 0 -- confirm against whatever wrote vocabulary.parquet.

vocab = vocab_df[0].to_dict()

In [38]:
# Define a function that takes tokens and converts them to integers
# using our vocabulary.

# Tokens that had no entry in the vocabulary, in the order they were seen.
unused_tokens = []
def tokens_to_integers(tokens, vocabulary=None):
  """Map tokens to their integer ids, skipping tokens without an id.

  Args:
    tokens: iterable of token strings (already stemmed/lowercased by the
      caller).
    vocabulary: optional token -> integer mapping. Defaults to the
      module-level ``vocab`` so existing calls keep working.

  Returns:
    A set of the integer ids found. Unknown tokens are NOT represented in
    the set (no ``None`` placeholder); they are appended to the
    module-level ``unused_tokens`` list instead so they can be inspected.
  """
  if vocabulary is None:
    vocabulary = vocab

  integer_tokens = set()
  for token in tokens:
    if token in vocabulary:
      integer_tokens.add(vocabulary[token])
    else:
      # Record the miss rather than adding None: a None in the result
      # would later be treated as if it were a real keyword id.
      unused_tokens.append(token)

  return integer_tokens

In [47]:
# Map both keyword lists to vocabulary ids (good first, then bad) and pool
# them into a single lookup set.

good_integers, bad_integers = (
    tokens_to_integers(keyword_tokens)
    for keyword_tokens in (good_tokens, bad_tokens)
)
target_integers = good_integers | bad_integers

In [None]:
# Create a dictionary to store counts.
# And one to track totals
#
# counts[congress][token_id] -> number of times that keyword id was seen;
#   both levels are created on first access, with counts starting at 0.
# totals[congress] -> total number of tokens seen for that Congress.

counts = defaultdict(lambda: defaultdict(int))
totals = defaultdict(int)

In [None]:
# Iterate over the data frame and add counts and total the tokens
# for each Congress.
#
# Each Congress is tallied in fresh local accumulators and then assigned
# into counts/totals, so re-running this cell (e.g. after an interrupted
# run) overwrites that Congress instead of adding on top of earlier numbers.
# NOTE(review): the saved preview further down shows Congress 85 at roughly
# 4x its neighbours, which looks like this kind of double counting -- confirm.

for congress in range(85, 115):
  print(f"Processing Congress {congress}")
  df = pd.read_parquet(f"{integer_path}/speeches_{congress:03d}.parquet")

  congress_counts = defaultdict(int)
  congress_total = 0

  # Iterate over the token column directly: only this column is needed, and
  # it avoids building a Series per row the way iterrows() does. A Series
  # has a length, so tqdm can also show real progress.
  for tokens in tqdm(df["integer_tokens"]):
    congress_total += len(tokens)

    # Count occurrences of target integers in this speech
    for token in tokens:
      if token in target_integers:
        congress_counts[token] += 1

  counts[congress] = congress_counts
  totals[congress] = congress_total

  # Checkpoint everything counted so far. The outer dict is converted to a
  # plain dict because its lambda default factory cannot be pickled.
  with open(f"{output_path}/tmp-files/counts_up_to_{congress:03d}.pkl", "wb") as f:
    pickle.dump({"counts": dict(counts), "totals": totals}, f)

Processing Congress 85


0it [00:00, ?it/s]

Processing Congress 86


0it [00:00, ?it/s]

Processing Congress 87


0it [00:00, ?it/s]

Processing Congress 88


0it [00:00, ?it/s]

Processing Congress 89


0it [00:00, ?it/s]

Processing Congress 90


0it [00:00, ?it/s]

Processing Congress 91


0it [00:00, ?it/s]

Processing Congress 92


0it [00:00, ?it/s]

Processing Congress 93


0it [00:00, ?it/s]

Processing Congress 94


0it [00:00, ?it/s]

Processing Congress 95


0it [00:00, ?it/s]

Processing Congress 96


0it [00:00, ?it/s]

Processing Congress 97


0it [00:00, ?it/s]

Processing Congress 98


0it [00:00, ?it/s]

Processing Congress 99


0it [00:00, ?it/s]

Processing Congress 100


0it [00:00, ?it/s]

Processing Congress 101


0it [00:00, ?it/s]

Processing Congress 102


0it [00:00, ?it/s]

Processing Congress 103


0it [00:00, ?it/s]

Processing Congress 104


0it [00:00, ?it/s]

Processing Congress 105


0it [00:00, ?it/s]

Processing Congress 106


0it [00:00, ?it/s]

Processing Congress 107


0it [00:00, ?it/s]

Processing Congress 108


0it [00:00, ?it/s]

Processing Congress 109


0it [00:00, ?it/s]

Processing Congress 110


0it [00:00, ?it/s]

Processing Congress 111


0it [00:00, ?it/s]

Processing Congress 112


0it [00:00, ?it/s]

Processing Congress 113


0it [00:00, ?it/s]

Processing Congress 114


0it [00:00, ?it/s]

In [None]:
# Restore counts and totals from the checkpoint written after Congress 114

with open(f"{output_path}/tmp-files/counts_up_to_114.pkl", "rb") as f:
  data = pickle.load(f)

counts, totals = data["counts"], data["totals"]

In [None]:
# Build a table with one row per Congress and one column per keyword id;
# keywords never seen in a Congress become 0, and the Congress number is
# moved out of the index into a "congress" column.

df = (
    pd.DataFrame(counts)
    .T
    .fillna(0)
    .astype(int)
    .reset_index()
    .rename(columns={"index": "congress"})
)

In [None]:
# Add the total tokens per Congress to the dataframe

# Look each total up by its Congress number rather than assigning
# totals.values() positionally: that only works when both dictionaries have
# exactly the same Congresses in exactly the same order.
df["totals"] = df["congress"].map(totals)

In [48]:
# Invert the vocabulary (integer id -> stem) and use it to relabel the
# keyword columns with their stems.

rev_vocab = dict(zip(vocab.values(), vocab.keys()))
df = df.rename(columns=rev_vocab)

NameError: name 'df' is not defined

In [None]:
# Preview the first rows of the per-Congress keyword table
df.head()

Unnamed: 0,congress,provid,right,proud,citizen,opportun,debat,control,spend,care,...,sensationalist,antijob,antifamili,antiflag,workfar,proenviron,antichild,prochildren,proflag,totals
0,85,161616,157572,13708,52940,49944,38020,62192,33728,43144,...,0,0,0,0,0,0,0,0,0,83836596
1,86,41219,45988,3913,16041,14057,11538,15904,8390,13386,...,0,0,0,0,0,0,0,0,0,21768763
2,87,46064,34768,4408,16279,16074,11229,20108,9514,15381,...,2,0,0,0,0,0,0,0,0,23468972
3,88,43494,57828,4898,19639,18098,13732,16486,10087,13440,...,3,0,0,0,0,0,0,0,0,23887876
4,89,56939,47909,6309,20935,21739,11414,20161,8493,15638,...,2,1,0,0,0,0,0,0,0,26643127


In [None]:
# Save and export
# Writes the per-Congress keyword counts (plus the totals column) as
# parquet under output_path.

df.to_parquet(f"{output_path}/keyword_counts_by_congress.parquet")

In [52]:
# Make keywords dataframe and save it.

# Work on a separate sorted list instead of rebinding target_integers, so the
# name keeps referring to a set (fast membership tests if the counting cell
# is run again) and the rows come out in a stable order.
# Also guard against a None entry (a keyword with no vocabulary id), which
# would raise a KeyError in the rev_vocab lookup below.
keyword_integers = sorted(i for i in target_integers if i is not None)
keywords_df = pd.DataFrame(
    {
        "stem": [rev_vocab[i] for i in keyword_integers],
        "integer": keyword_integers,
        # An id that appears in good_integers is tagged "good"; everything
        # else in the target set is tagged "bad".
        "tag": ["good" if i in good_integers else "bad" for i in keyword_integers]
    }
)
keywords_df.to_parquet(f"{integer_path}/keywords-by-integer.parquet")