# Reading Corpus

In [None]:
import pandas as pd
from random import randint

# Location of the EconStor dump (JSON lines, one document per line).
dataset_path = 'dataset/econstor_2017-06-01.json'
try:
    df = pd.read_json(dataset_path, lines=True, encoding='utf8')
except (ValueError, FileNotFoundError) as err:
    # Depending on the pandas version an unreadable/missing path surfaces as
    # ValueError or as FileNotFoundError; both are turned into the same
    # ValueError with instructions, keeping the original error as the cause.
    raise ValueError(
        "The dataset file seems to be missing. Please contact Nils Witt (n.witt@zbw.eu) "
        "at ZBW to retrieve your own copy."
    ) from err

# Keep only documents that have JEL codes and an abstract ...
df = df[df["classification_jel"].notnull()]
df = df[df["abstract"].notnull()]
# ... and whose language entry is exactly the list ["eng"].
df = df[df["language"].apply(lambda row: row == ["eng"])]

# Collection compiler

The collection compiler is able to generate a collection (`generate_collection`), i.e. a set of documents with at least `degree` common JEL codes. It can also generate two disjoint collections (`disjoint_collections`) where two collections don't have any JEL code in common.

In [None]:
from random import randint
from collections import Counter, defaultdict
from itertools import chain

class Collections():
    """
    Compiles artificial document collections from a dataframe whose
    `classification_jel` column holds a list of JEL codes per document.
    Documents are always referred to by their dataframe index label.
    """

    def __init__(self, df):
        """
        stores `df` and builds the JEL-code -> document-indices lookup once.
        """
        self.df = df
        self.cabinet = self._jel_cabinet()

    def _jel_cabinet(self):
        """
        creates a dict whose keys are jel codes. the values are lists of indices
        of documents attached with the corresponding jel code.
        {
            "N74": [1293, 97128, ...],
            "O57": [8172, 12369, ...],
            ...
        }
        """
        jel_cabinet = defaultdict(list)
        # Only one column is needed, so iterate it directly instead of
        # materialising a full row per document.
        for idx, jel_codes in self.df["classification_jel"].items():
            for jel_code in jel_codes:
                jel_cabinet[jel_code].append(idx)
        return jel_cabinet

    def jel_set(self, idxs):
        """
        return the set of all jel codes of the documents in idxs, where each
        document is referenced by its index in `self.df`.
        """
        # Use the instance's dataframe (not a module-level `df`) so that the
        # class works for whatever dataframe it was constructed with.
        return set(chain.from_iterable(
            self.df.loc[idx]["classification_jel"] for idx in idxs
        ))

    def fetch_random_doc(self, degree):
        """
        draws random documents until one with at least `degree + 2` jel codes
        is found and returns that row. The margin of two extra codes is there
        so that the document has enough jel codes to find neighbours with.
        Loops forever if no document in `self.df` has that many codes.
        """
        num_docs = len(self.df.index)
        rand_doc = self.df.loc[self.df.index[randint(0, num_docs - 1)]]
        while len(rand_doc["classification_jel"]) < degree + 2:
            rand_doc = self.df.loc[self.df.index[randint(0, num_docs - 1)]]
        return rand_doc

    def fetch_most_similar_docs(self, rand_doc):
        """
        returns a `Counter` mapping document index -> number of jel codes the
        document shares with `rand_doc`. `most_common()` on the result lists
        the most similar documents first; `rand_doc` itself is part of the
        result whenever it is contained in `self.df`.
        """
        friend_docs = []
        for jel_code in rand_doc["classification_jel"]:
            friend_docs.extend(self.cabinet[jel_code])
        return Counter(friend_docs)

    def fetch_dissimilar_doc(self, col):
        """
        given a collection `col` (a list of indices) it returns the index of a
        document that shares no JEL code with any document in `col`.
        """
        # Build a one-document opposing collection with this instance (rather
        # than a global `collections` object) and return its only member.
        return self.disjoint_collections(col=col, size=1)[1][0]

    def generate_collection(self, size=4, degree=1):
        """
        creates an artificial collection from the dataframe `self.df`.
        `size` is the maximum number of documents in the collection while
        `degree` is a measure of connectivity density: every returned document
        shares at least `degree` jel codes with a randomly drawn seed document.
        returns a list of document indices, most similar first. If the seed
        has fewer than `size` neighbours the list is shorter than `size`.
        """
        assert degree in range(0, 11), "unreasonable value for `degree`"

        while True:
            rand_doc = self.fetch_random_doc(degree)
            candidates = self.fetch_most_similar_docs(rand_doc)
            top = candidates.most_common(size)
            # The least similar of the selected documents decides whether the
            # whole selection satisfies `degree`; otherwise draw a new seed.
            if top[-1][1] >= degree:
                return [doc_idx for doc_idx, _ in top]

    def disjoint_collections(self, size=4, degree=1, col=None):
        """
        creates two collections whose JEL codes are disjoint.
        each generated collection is built by `generate_collection` with the
        given `size` and `degree`.
        if a `col` is passed, it is used as the first collection and only the
        opposing collection is generated.
        returns the tuple `(col_a, col_b)`.
        """
        if col is None:
            col_a = self.generate_collection(size=size, degree=degree)
        else:
            col_a = col

        jels_a = self.jel_set(col_a)

        # Rejection sampling: keep generating until no JEL code is shared.
        while True:
            col_b = self.generate_collection(size=size, degree=degree)
            if self.jel_set(col_b).isdisjoint(jels_a):
                return col_a, col_b

In [None]:
# Shared `Collections` instance built from the filtered corpus; later cells
# call e.g. `collections.disjoint_collections(...)` on it.
collections = Collections(df)

# Implementations

In [None]:
from ADD import TFIDF, LSI, TextRank

# Tokenised corpus: the first element of every abstract entry, split on
# whitespace. It is used to fit the TF-IDF extractor.
dataset = [first_part.split() for first_part, *_ in df["abstract"]]
tfidf = TFIDF.TFIDF(dataset)

collections_compiler = Collections(df)

# Name -> keyword extractor; each extractor exposes a `keywords` method.
keyword_extractors = dict(LSI=LSI, TFIDF=tfidf, TextRank=TextRank)

# ADD property test

In [None]:
from tqdm import tqdm
from collections import namedtuple

class ADD_property():
    def __init__(self):
        self.collections_compiler = Collections(df)
    
    def _name(self, obj):
        return obj.__name__ if hasattr(obj, "__name__") else obj.__module__
    
    def _idx_to_text(self, idx):
        """
        returns the list of words from the document referred to by `idx`
        """
        return df.loc[idx]["abstract"][0].split()

    def _keywords(self, imp, idxs):
        """
        returns the set of all keywords in the documents in `idxs`. `idxs` is a list
        of indices.
        `imp` is the implementation of the keyword extraction algorithm. it must have
        a `keywords` method that takes a list of words and return the keywords.
        """
        return set(chain(*(imp.keywords(self._idx_to_text(idx)) for idx in idxs)))
    
    def _keyword_sets(self, imp, doc_a, doc_z, lib):
        """
        return three keywords sets. (1) the keywords of `doc_a`, (2) the keywords
        of `doc_z` and (3) the keywords of lib.
        the keywords are generated `imp.keywords()`.
        """
        return (self._keywords(imp, [doc_a]), 
            self._keywords(imp, [doc_z]), 
            self._keywords(imp, lib))

    def _setup_ADD_scenario(self, size=5, degree=2):
        """
        returns a three-tupel containing (1) a document (say A), (2) a document
        dissimilar to A (say Z) and collection of documents similar to A.
        """
        doc_a, *collection = \
            self.collections_compiler.generate_collection(size=size, degree=degree)
        doc_z = self.collections_compiler.fetch_dissimilar_doc(collection)
        return doc_a, doc_z, collection

    def _intersection_difference_ratio(self, kws_doc, kws_collection):
        """
        computes the ratio of the size of the intersection between `kws_doc` and
        `kws_collection` and the size of `kws_doc`.
        """
        num_kws_doc = len(kws_doc)
        intersection_size = len(kws_doc.intersection(kws_collection))
        return (intersection_size+1) / (num_kws_doc+1)
    
    def run_test(self, implementations, collection_size=10, degree=5):
        kw_ratios = namedtuple("kw_ratios", ['a_to_col', 'z_to_col'])
        doc_a, doc_z, lib = self._setup_ADD_scenario(size=collection_size, degree=degree)
        results = {}
        
        for imp in implementations:
            kw_a, kw_z, kw_lib = self._keyword_sets(imp, doc_a, doc_z, lib)
            results[self._name(imp)] = kw_ratios(
                self._intersection_difference_ratio(kw_a, kw_lib), \
                self._intersection_difference_ratio(kw_z, kw_lib))
        return results

In [None]:
# Single test-harness instance; `execute_experiment` calls `add_test.run_test`.
add_test = ADD_property()

In [None]:
def execute_experiment(sample_size=100, collection_size=5, degree=3):
    """
    Runs `add_test.run_test` `sample_size` times for the LSI, TF-IDF and
    TextRank extractors and returns the list of per-run result dicts.
    `collection_size` and `degree` are forwarded unchanged to `run_test`.
    """
    return [
        add_test.run_test(
            (LSI, tfidf, TextRank),
            collection_size=collection_size,
            degree=degree,
        )
        for _ in tqdm(range(sample_size), mininterval=50)
    ]

### Results averages

In [None]:
from collections import defaultdict

def reduce_results(raw_data, sample_size):
    """
    Averages the per-run ratios produced by `execute_experiment`.

    `raw_data` is a list of dicts mapping an extractor name to an object with
    `a_to_col` and `z_to_col` attributes. The keys of the first dict define the
    rows of the result. `sample_size` is the divisor applied to the summed
    ratios.

    Returns a float dataframe indexed by extractor name with the columns
    `a_to_col` and `z_to_col`; an empty `raw_data` yields an empty dataframe
    with these two columns.
    """
    columns = ["a_to_col", "z_to_col"]
    if not raw_data:
        return pd.DataFrame(columns=columns, dtype=float)

    # Start from explicit float zeros so that adding ratios never depends on
    # how pandas infers or downcasts the dtype of an empty frame.
    results = pd.DataFrame(0.0, index=list(raw_data[0].keys()), columns=columns)

    for result in raw_data:
        for name, ratios in result.items():
            results.loc[name, "a_to_col"] += ratios.a_to_col
            results.loc[name, "z_to_col"] += ratios.z_to_col

    return results / sample_size

### Some experiments for Comparability and Differentiability

In [None]:
# ss = sample size, cs = collection size, degree = required shared JEL codes.
ss, cs, degree = 300, 3, 1
raw_results = execute_experiment(sample_size=ss, collection_size=cs, degree=degree)
reduce_results(raw_results, ss)

In [None]:
# ss = sample size, cs = collection size, degree = required shared JEL codes.
ss, cs, degree = 100, 10, 1
raw_results = execute_experiment(sample_size=ss, collection_size=cs, degree=degree)
reduce_results(raw_results, ss)

In [None]:
# ss = sample size, cs = collection size, degree = required shared JEL codes.
ss, cs, degree = 100, 10, 5
raw_results = execute_experiment(sample_size=ss, collection_size=cs, degree=degree)
reduce_results(raw_results, ss)

In [None]:
# ss = sample size, cs = collection size, degree = required shared JEL codes.
ss, cs, degree = 100, 15, 2
raw_results = execute_experiment(sample_size=ss, collection_size=cs, degree=degree)
reduce_results(raw_results, ss)

In [None]:
# ss = sample size, cs = collection size, degree = required shared JEL codes.
ss, cs, degree = 100, 15, 5
raw_results = execute_experiment(sample_size=ss, collection_size=cs, degree=degree)
reduce_results(raw_results, ss)

# Testing for Diversity

Map all keywords to their respective documents. Do that for all implementations.

In [None]:
from itertools import product

def idx_to_text(idx):
    """
    Returns the whitespace-separated words of the first abstract entry of the
    document with index label `idx` in the global dataframe `df`.
    """
    abstract = df.loc[idx]["abstract"]
    return abstract[0].split()

# Find all keywords: one row per document, one column per extractor. Every
# cell receives the keywords that extractor returns for that document.
doc_kw_mapping = pd.DataFrame(index=df.index, columns=keyword_extractors.keys())

# `product` walks every (document, extractor) pair, documents in the outer
# position, so the progress bar counts pairs rather than documents.
for idx, (extractor_name, extractor) in tqdm(
    product(df.index, keyword_extractors.items()), mininterval=10):
    doc_kw_mapping.loc[idx, extractor_name] = extractor.keywords(idx_to_text(idx))

# Remove rows whose "LSI" cell is still null.
# NOTE(review): only the LSI column is checked; null cells in the other
# columns would survive this filter - confirm that this is intended.
doc_kw_mapping = doc_kw_mapping[doc_kw_mapping["LSI"].notnull()]

Create the keyword blacklist: for each extractor, the most frequent 1% of its distinct keywords are dropped.

In [None]:
from math import floor

# Share of an extractor's distinct keywords (the most frequent ones) that is
# blacklisted.
drop_n_most_frequent = .01
# Extractor name -> set of blacklisted keywords.
kw_blacklist = defaultdict(set)
for col in doc_kw_mapping:
    # Count in how many keyword lists of this extractor each keyword occurs
    # (once per occurrence).
    kw_cnt = Counter(chain.from_iterable(doc_kw_mapping[col].values))
    # Use the configured share instead of repeating the literal.
    n_most_common = floor(len(kw_cnt) * drop_n_most_frequent)
    kw_blacklist[col] = {kw for kw, _ in kw_cnt.most_common(n_most_common)}

Generate the JEL-code/keyword mapping matrix, initially filled with zeros only.

In [None]:
def remove_blacklisted_kws(kwds, col):
    """
    Returns the distinct keywords in `kwds` that are not blacklisted for the
    extractor column `col`, as a list (order unspecified).
    """
    distinct = set(kwds)
    allowed = distinct.difference(kw_blacklist[col])
    return list(allowed)

def extract_keywords(idx, extractor):
    """
    Looks up the stored keywords of document `idx` for the extractor named
    `extractor` and returns them without the blacklisted ones.
    """
    stored = doc_kw_mapping.loc[idx, extractor]
    return remove_blacklisted_kws(stored, extractor)

In [None]:
# All distinct JEL codes in the corpus. Reading the column directly avoids
# building a full row object per document.
jel_codes = set(chain.from_iterable(df['classification_jel']))

# Extractor name -> (JEL code x keyword) count matrix.
jel_kwcnt = dict.fromkeys(keyword_extractors)
for col in doc_kw_mapping:
    kw_set = set(chain.from_iterable(doc_kw_mapping[col].values))
    kw_set = remove_blacklisted_kws(kw_set, col)

    # Create the zero matrix directly as integers instead of filling an
    # empty object-typed frame afterwards.
    jel_kwcnt[col] = pd.DataFrame(0, index=list(jel_codes), columns=kw_set)

Populate jel-code/keyword mapping matrix.

In [None]:
# For every extractor and document, add one to each (JEL code, keyword) cell
# formed by the document's JEL codes and its non-blacklisted keywords.
for col in doc_kw_mapping:
    # Iterate the index labels directly: the cell values are fetched by
    # `extract_keywords`, and `Series.iteritems` no longer exists in pandas 2.
    for k in tqdm(doc_kw_mapping.index, total=len(doc_kw_mapping), mininterval=10):
        # Separate name so the global set `jel_codes` is not overwritten.
        doc_jels = df.loc[k]["classification_jel"]
        kws = extract_keywords(k, col)
        jel_kwcnt[col].loc[doc_jels, kws] += 1

In [None]:
import numpy as np

def associated_jel_codes(keywords, extractor, cnt_threshold=10):
    """
    Sums, per JEL code, the counts of the given `keywords` in the matrix of
    `extractor` and binarises the sums: entries below `cnt_threshold` become
    0, the remaining entries that reach `cnt_threshold` become 1.
    Returns the resulting vector indexed by JEL code.
    """
    totals = jel_kwcnt[extractor][keywords].sum(axis=1)
    too_small = totals < cnt_threshold
    totals.loc[too_small] = 0
    # Evaluated after the zeroing above, exactly like two in-place steps.
    large_enough = totals >= cnt_threshold
    totals.loc[large_enough] = 1
    return totals

def associated_keywords(jels, extractor):
    """
    Sums, per keyword, the counts over the JEL codes `jels` in the matrix of
    `extractor` and sets every positive sum to 1.
    Returns the resulting vector indexed by keyword.
    """
    totals = jel_kwcnt[extractor].loc[jels].sum()
    positive = totals > 0
    totals.loc[positive] = 1
    return totals

def to_binary(x, y):
    """
    Encodes two label collections as boolean membership vectors over the
    union of their labels. Both vectors use the same (unspecified) label
    order; position i is True when label i occurs in `x` or `y` respectively.
    """
    universe = list(set(x).union(set(y)))
    in_x = np.isin(universe, x)
    in_y = np.isin(universe, y)
    return in_x, in_y

## Run diversity experiment

In [None]:
# Per-extractor count threshold: the sum of all counts in the extractor's
# JEL-code/keyword matrix divided by `dumping_factor`, rounded down.
# NOTE(review): the single-trial cell further down reassigns `thresholds` to
# fixed values, so these computed thresholds are not the ones used there.
dumping_factor = 150000
thresholds = [floor(jel_kwcnt[extr].sum().sum()/dumping_factor)
    for extr in keyword_extractors]

This is what one trial looks like:

In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score
try:
    from sklearn.metrics import jaccard_similarity_score
except ImportError:
    # `jaccard_similarity_score` is not available in recent scikit-learn
    # releases. `to_binary` yields two boolean vectors over the union of both
    # label sets, for which the binary `jaccard_score` is
    # |intersection| / |union| - the quantity reported below. The old name is
    # kept so that the following cells continue to work unchanged.
    from sklearn.metrics import jaccard_score as jaccard_similarity_score

# Fixed count thresholds, matched by position to the extractors in
# `keyword_extractors`.
thresholds = (10, 20, 5)
# Pick one random document that has keywords for every extractor.
idx = doc_kw_mapping.index[randint(0, len(doc_kw_mapping)-1)]
actual_jels = df.loc[idx]["classification_jel"]
print(f"actual jels: {actual_jels}")


for extr, threshold in zip(keyword_extractors, thresholds):
    keywords = extract_keywords(idx, extr)
    # JEL codes whose summed keyword counts reach the extractor's threshold.
    candidates = associated_jel_codes(keywords, extr, cnt_threshold=threshold)
    candidates = candidates[candidates == 1]

    # Compare the predicted JEL codes with the document's actual ones.
    score = jaccard_similarity_score(*to_binary(actual_jels, candidates.index))
    print(f"{extr}: {score}")
    c = candidates.index
    print(f"candidates: {c}\n")

In [None]:
# Repeat the single trial `trials` times and collect, per extractor name, the
# Jaccard score between the predicted and the actual JEL codes.
# Relies on `thresholds` and `jaccard_similarity_score` from an earlier cell.
results = defaultdict(list)
trials = 10000
for _ in range(trials):
    # Draw a random document (with replacement across trials).
    idx = doc_kw_mapping.index[randint(0, len(doc_kw_mapping)-1)]
    actual_jels = df.loc[idx]["classification_jel"]

    for extr, threshold in zip(keyword_extractors, thresholds):
        keywords = extract_keywords(idx, extr)
        candidates = associated_jel_codes(keywords, extr, cnt_threshold=threshold)
        # Keep only the JEL codes flagged as associated.
        candidates = candidates[candidates == 1]

        score = jaccard_similarity_score(*to_binary(actual_jels, candidates.index))
        results[extr].append(score)

In [None]:
from statistics import mean, variance
# Mean and sample variance of the scores collected for the LSI extractor.
m = mean(results['LSI'])
v = variance(results['LSI'])
print(f"LSI/Rank\nmean: {m}\nvariance: {v}")

In [None]:
# Mean and sample variance of the scores collected for the TFIDF extractor.
m = mean(results['TFIDF'])
v = variance(results['TFIDF'])
print(f"TFIDF\nmean: {m}\nvariance: {v}")

In [None]:
# Mean and sample variance of the scores collected for the TextRank extractor.
m = mean(results['TextRank'])
v = variance(results['TextRank'])
print(f"TextRank\nmean: {m}\nvariance: {v}")

# Specificity

In [None]:
def count_keywords(kwds):
    """
    Takes an iterable of keyword collections and returns a `Counter` with the
    number of occurrences of every keyword across all of them.
    """
    all_keywords = chain.from_iterable(kwds)
    return Counter(all_keywords)

In [None]:
# For every trial, build two collections with disjoint JEL codes and store,
# per extractor, the keyword counts of both collections.
trials = 500
# One row per trial, one column per extractor; every cell holds an
# `opposing_collections` tuple of two keyword Counters.
keywords = pd.DataFrame(index=range(trials), columns=keyword_extractors.keys())
opposing_collections = namedtuple("opposing_collections", ["light", "dark"])

for trial_num in range(trials):
    coll_a, coll_b = collections.disjoint_collections(size=10, degree=5)
    for extractor_name, extractor in keyword_extractors.items():
        kwds_a = []
        kwds_b = []
        # `zip` pairs the documents of both collections; if one collection is
        # shorter, the surplus documents of the other one are not processed.
        for idx_a, idx_b in zip(coll_a, coll_b):
            kwds_a.append(extractor.keywords(idx_to_text(idx_a)))
            kwds_b.append(extractor.keywords(idx_to_text(idx_b)))
            
        # "light" holds the keyword counts of `coll_a`, "dark" those of `coll_b`.
        keywords.loc[trial_num, extractor_name] = opposing_collections(
            count_keywords(kwds_a),
            count_keywords(kwds_b)
        )

## Let's look at an excerpt of the results:

In [None]:
# Print, for the first (at most ten) trials and every extractor, how many
# distinct keywords the two opposing collections share and how many occur in
# only one of them.
for idx in range(min(10, trials)):
    print(f"document #{idx}")
    for extractor in keyword_extractors:
        # Iterating a Counter yields its distinct keys.
        light_kwds = set(keywords.loc[idx, extractor].light)
        dark_kwds = set(keywords.loc[idx, extractor].dark)

        i = len(light_kwds & dark_kwds)
        d = len(light_kwds ^ dark_kwds)
        print(f"{extractor}\nintersection: {i}\ndifference: {d}\n")
    print("#######################\n")

Now some statistics:

In [None]:
# Per extractor, collect for every trial the size of the keyword intersection
# and of the symmetric keyword difference of the two opposing collections.
diversity_result = namedtuple("diversity_result", ["intersection_length", "difference_length"])
diversity_results = {name: [] for name in keyword_extractors}

for idx in range(trials):
    for extractor in keyword_extractors:
        # Iterating a Counter yields its distinct keys.
        light_kwds = set(keywords.loc[idx, extractor].light)
        dark_kwds = set(keywords.loc[idx, extractor].dark)

        diversity_results[extractor].append(diversity_result(
            intersection_length=len(light_kwds & dark_kwds),
            difference_length=len(light_kwds ^ dark_kwds),
        ))

In [None]:
# Report, per extractor, the mean intersection size, the mean symmetric
# difference size and their ratio in percent.
for extractor in keyword_extractors:
    i = mean([r.intersection_length for r in diversity_results[extractor]])
    d = mean([r.difference_length for r in diversity_results[extractor]])
    print(f"{extractor}\nmean keyword intersection per collection duo: {i:.1f}")
    print(f"mean keyword difference size per collection duo: {d:.1f}\nintersection share: {100*(i/d):.1f}%\n")