In [1]:
import os
import sys

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

import numpy as np
import pandas as pd

from scipy.sparse import load_npz

In [2]:
sparse_matrix = load_npz("../data/word_counts_sparse_matrix.npz")
sparse_matrix.shape

(1411313, 2054)

In [3]:
word_counts = sparse_matrix.toarray()

In [4]:
random_indices = np.random.choice(word_counts.shape[0], size=250_000, replace=False)
word_counts_sample = word_counts[random_indices, :]

In [5]:
del word_counts

In [6]:
word_counts_sample.shape

(250000, 2054)

In [7]:
word_counts_binary = (word_counts_sample > 0).astype(int)

In [8]:
%%time
word_counts_dotted = np.dot(word_counts_binary.T, word_counts_binary)

Wall time: 4h 18min 35s


In [10]:
np.triu(word_counts_dotted).tofile('pairwise_counts.dat')

In [12]:
terms = pd.read_csv("../data/word_counts_by_article_NO_OTHER.csv", nrows=1).columns[:-3]

In [13]:
pw_counts = pd.DataFrame(np.triu(word_counts_dotted), columns=terms, index=terms) \
    .stack() \
    .reset_index() \
    .rename(columns={"level_0": "term1", "level_1": "term2", 0: "n"})

pw_counts = pw_counts[pw_counts.term1 != pw_counts.term2]

In [14]:
pw_counts

Unnamed: 0,term1,term2,n
1,academic freedom,acceptance,0
2,academic freedom,accepting,0
3,academic freedom,active listening,0
4,academic freedom,activism,5
5,academic freedom,advocate,12
...,...,...,...
4218910,amend,participate,0
4218911,amend,successful,0
4218912,amend,understand,0
4218913,amend,immediate,0


In [15]:
def make_set(row):
    return list(set([row.term1, row.term2]))

tqdm.pandas()
pw_counts["set"] = pw_counts.progress_apply(make_set, axis=1)

100%|█████████████████████████████████████████████████████████████████████| 4216862/4216862 [01:41<00:00, 41696.83it/s]


In [16]:
pw_counts["termA"] = pw_counts["set"].apply(lambda x: x[0])
pw_counts["termB"] = pw_counts["set"].apply(lambda x: x[1])

In [17]:
tqdm.pandas()
pw_unique_counts = pw_counts.groupby(["termA", "termB"]).progress_apply(lambda x: x.nlargest(1, 'n')).reset_index(drop=True)  

100%|█████████████████████████████████████████████████████████████████████| 2295962/2295962 [1:03:59<00:00, 598.03it/s]


In [18]:
pw_unique_counts.to_csv("pw_unique_counts.csv")

In [20]:
pw_unique_counts.n.max()

67446