## 03: Generate counts

This script takes a directory of `.csv` files containing entity counts by month in the following format:

```csv
,2012-01,2012-02
meat,1011.0,873.0
salt,805.0,897.0
chicken,694.0,713.0
```

It sums the counts from all files, keeps only the `N` most common records, and calculates the variance of each entity's counts over time, scaled by their average. This helps select a more "interesting" subset: the entities whose counts vary the most over time. The result is the set of most variant entities (minus the most frequent ones, which tend to be less interesting). It can be used to create an interactive [bar chart race visualization](https://public.flourish.studio/visualisation/1532208/). 

In [2]:
# Directory of counts file(s) created in the previous step.
INPUT_DIR = "./raw_counts"
# Path to the output file.
OUTPUT_FILE = "./output_counts.csv"
# Number of most common entities to keep.
MOST_COMMON = 10000
# Number of most frequent entities to drop.
DROP_MOST_FREQUENT = 10
# Number of results to export.
N_TOTAL = 50

In [None]:
!pip install pandas

In [3]:
import csv
from collections import Counter, defaultdict
from pathlib import Path
import pandas as pd

In [4]:
def read_csv(file_):
    """Read one counts CSV into a ``Counter`` keyed by ``(term, month)``.

    The column whose header is the empty string holds the term; every other
    column header is used as the month label for the count in that cell.

    Args:
        file_: An open text file, or any iterable of CSV lines, whose first
            line is the header row.

    Returns:
        A ``Counter`` mapping ``(term, month)`` to the integer count. Counts
        are parsed via ``float`` and then truncated to ``int``. If a term
        occurs on several rows, its counts are added together.

    Raises:
        KeyError: If the header has no empty-named first column.
        ValueError: If a non-empty cell is not numeric.
    """
    counts = Counter()
    for row in csv.DictReader(file_):
        term = row[""]
        for month, freq in row.items():
            # Skip the term column (""), the ``None`` key under which
            # DictReader collects surplus cells (e.g. from a trailing comma),
            # and empty or missing cells.
            if not month or not freq:
                continue
            # Accumulate instead of assigning so repeated terms are summed.
            counts[(term, month)] += int(float(freq))
    return counts


def prune_rows(counts_by_term, n):
    """Keep only the ``n`` terms with the largest total count.

    Args:
        counts_by_term: Mapping of term to a mapping of label to count.
        n: Number of terms to keep, as accepted by ``Counter.most_common``.

    Returns:
        A ``defaultdict(dict)`` holding the kept terms in descending order of
        total, each mapped to the same per-term mapping object that was
        passed in.
    """
    # A term's total is its explicit "Total" entry when present, otherwise
    # the sum of all of its counts.
    totals = Counter(
        {
            term: freqs["Total"] if "Total" in freqs else sum(freqs.values())
            for term, freqs in counts_by_term.items()
        }
    )
    kept = defaultdict(dict)
    kept.update((term, counts_by_term[term]) for term, _ in totals.most_common(n))
    return kept


def sum_counts(directory, n=10000):
    """Sum the counts of every CSV under ``directory`` into one DataFrame.

    Args:
        directory: Directory that is searched recursively for ``*.csv`` files.
        n: Number of terms with the largest totals to keep.

    Returns:
        A ``pandas.DataFrame`` indexed by ``Term`` with one column per month
        label (sorted ascending). Months in which a kept term has no count
        are filled with ``0.0``.
    """
    directory = Path(directory)
    totals = Counter()
    for path in directory.glob("**/*.csv"):
        # newline="" leaves newline handling to the csv module, as its
        # documentation requires for file objects passed to a reader.
        with path.open("r", encoding="utf8", newline="") as file_:
            totals.update(read_csv(file_))
    # Regroup the flat (term, month) totals into one Counter per term.
    by_term = defaultdict(Counter)
    for (term, month), freq in totals.items():
        by_term[term][month] = freq
    records = prune_rows(by_term, n)
    months = sorted(
        {month for month_freqs in records.values() for month in month_freqs}
    )
    fields = ["Term", *months]
    rows = []
    for term, month_freqs in records.items():
        # Build a fresh row instead of mutating the pruned records.
        row = {month: month_freqs.get(month, 0.0) for month in months}
        row["Term"] = term
        rows.append(row)
    return pd.DataFrame.from_records(rows, index="Term", columns=fields)


def sort_by_frequency(df):
    """Return ``df`` with its rows ordered by descending row sum.

    Args:
        df: DataFrame whose rows are summed across columns (``axis=1``).

    Returns:
        The rows of ``df`` selected by label in descending order of their
        sums.
    """
    row_totals = df.sum(axis=1)
    ordered_labels = row_totals.sort_values(ascending=False).index
    return df.loc[ordered_labels]


def drop_most_frequent(df, n):
    """Return ``df`` sorted by descending row sum, without its first ``n`` rows.

    Args:
        df: DataFrame passed to ``sort_by_frequency``.
        n: Number of leading rows of the sorted frame to skip.

    Returns:
        The sorted frame sliced from position ``n`` onwards.
    """
    ranked = sort_by_frequency(df)
    return ranked[n:]


def get_most_variant(df, n, mean_weight=False):
    """Return the ``n`` rows of ``df`` with the largest row-wise variance.

    Args:
        df: DataFrame whose rows are scored across columns (``axis=1``).
        n: Number of top-scoring rows to return.
        mean_weight: If true, divide each row's variance by that row's mean
            before ranking.

    Returns:
        The first ``n`` rows of ``df`` after ordering by descending score.
    """
    scores = df.var(axis=1)
    if mean_weight:
        row_means = df.mean(axis=1)
        scores = scores / row_means
    ranked_labels = scores.sort_values(ascending=False).index
    return df.loc[ranked_labels][:n]

In [5]:
# Sum all input files and keep only the MOST_COMMON terms with the largest totals.
DF = sum_counts(INPUT_DIR, n=MOST_COMMON)
DF

Unnamed: 0_level_0,2010-01,2010-02,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,2010-09,2010-10,...,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11,2020-12
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
communication,8.0,14.0,16.0,14.0,12.0,10.0,17.0,20.0,4.0,7.0,...,11.0,21.0,14.0,10.0,10.0,13.0,13.0,10.0,21.0,8.0
critical thinking,1.0,6.0,5.0,4.0,7.0,0.0,5.0,4.0,4.0,1.0,...,5.0,4.0,13.0,6.0,7.0,2.0,6.0,7.0,7.0,0.0
leadership,5.0,20.0,2.0,4.0,0.0,2.0,2.0,2.0,10.0,0.0,...,3.0,1.0,7.0,2.0,11.0,8.0,5.0,2.0,16.0,0.0
problem solving,3.0,5.0,4.0,3.0,1.0,1.0,1.0,7.0,8.0,2.0,...,2.0,4.0,3.0,2.0,1.0,1.0,5.0,5.0,3.0,3.0
competence,2.0,1.0,2.0,0.0,3.0,1.0,4.0,3.0,1.0,0.0,...,3.0,5.0,2.0,0.0,2.0,2.0,1.0,1.0,3.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
critical area,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
problem-basedlearning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
collaborative learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mind—learning,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Discard the DROP_MOST_FREQUENT entities with the largest overall counts.
SUBSET = drop_most_frequent(DF, DROP_MOST_FREQUENT)
# Keep the N_TOTAL entities with the highest mean-scaled variance; N_TOTAL is
# the only limit on how many entities are exported.
SUBSET = get_most_variant(SUBSET, N_TOTAL, mean_weight=True)
SUBSET = sort_by_frequency(SUBSET)
# Turn the per-month counts into running totals across the columns.
SUBSET = SUBSET.cumsum(axis=1)
SUBSET

Unnamed: 0_level_0,2010-01,2010-02,2010-03,2010-04,2010-05,2010-06,2010-07,2010-08,2010-09,2010-10,...,2020-03,2020-04,2020-05,2020-06,2020-07,2020-08,2020-09,2020-10,2020-11,2020-12
Term,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
willingness,0.0,0.0,0.0,1.0,1.0,1.0,5.0,7.0,7.0,8.0,...,73.0,74.0,75.0,75.0,75.0,76.0,76.0,76.0,76.0,78.0
creative thinking,0.0,0.0,1.0,1.0,2.0,2.0,2.0,3.0,3.0,3.0,...,57.0,57.0,57.0,57.0,57.0,58.0,58.0,58.0,59.0,60.0
communicate effectively,0.0,1.0,1.0,2.0,2.0,3.0,3.0,3.0,4.0,5.0,...,49.0,49.0,49.0,49.0,49.0,51.0,51.0,51.0,51.0,51.0
solve engineering problems,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,...,23.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0,24.0
ability to cooperate,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,2.0,2.0,...,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0,21.0
strategic thinking,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0,15.0
0.079 0.085,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.0,13.0,13.0,13.0,13.0,13.0,13.0,14.0,14.0,14.0
cross-curricular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,6.0,6.0
communication - business,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
801-819.slaughter,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Write the cumulative counts of the selected entities to OUTPUT_FILE.
SUBSET.to_csv(path_or_buf=OUTPUT_FILE)