### Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [4]:
# Shared WordNet lemmatizer instance; extract_and_clean() reads this module-level name.
lemmatizer = WordNetLemmatizer()

In [5]:
# Sanity check: lemmatize "requires" as a verb (the recorded output is 'require').
lemmatizer.lemmatize("requires", pos='v')

'require'

### Whitepaper data source

In [6]:
# English stop words as a set; this is the value passed as the `stopwords`
# argument of read_whitepapers(), which forwards it to extract_and_clean().
stopwords_set = set(stopwords.words('english'))

In [7]:
# Module-level store: token -> list of raw lines in which the token was seen.
context_dict = {}


def enrich_context_dict(context_dict, words, line):
    """Record ``line`` under every token of ``words`` in ``context_dict``.

    The dictionary is modified in place. A token occurring several times in
    ``words`` gets ``line`` appended once per occurrence.

    Args:
        context_dict: Mapping of token -> list of lines; updated in place.
        words: Iterable of tokens extracted from ``line``.
        line: The text to store for each token.
    """
    for token in words:
        if token not in context_dict:
            context_dict[token] = []
        context_dict[token].append(line)

In [8]:
def extract_and_clean(line, stopwords):
    """Turn one raw text line into a list of normalised tokens.

    The line is lower-cased, every character that is neither a word character
    nor whitespace is removed (so "peer-to-peer" collapses to "peertopeer"),
    and the remainder is split with ``word_tokenize``. Tokens contained in
    ``stopwords`` are discarded; each remaining token is lemmatised four times
    in a row, as satellite adjective, noun, verb and adjective.

    Side effect: the unmodified ``line`` is appended to the module-level
    ``context_dict`` once for every returned token (repeats included).

    Args:
        line: Raw text line.
        stopwords: Container of tokens to drop (only membership-tested). Inside
            this function the name hides the ``nltk.corpus.stopwords`` import.

    Returns:
        list: Cleaned, lemmatised tokens in their original order.
    """
    # WordNet POS codes, applied one after another to every token:
    # 's' = satellite adjective, 'n' = noun, 'v' = verb, 'a' = adjective.
    pos_sequence = ('s', 'n', 'v', 'a')

    stripped = re.sub(r'[^\w\s]', '', line.lower())

    words = []
    for token in word_tokenize(stripped):
        if token in stopwords:
            continue
        for pos in pos_sequence:
            token = lemmatizer.lemmatize(token, pos=pos)
        words.append(token)

    enrich_context_dict(context_dict, words, line)
    return words

In [9]:
def read_whitepapers(filename, stopwords, directory="../whitepapers/top20_whitepapers/"):
    """Read one whitepaper and return its cleaned tokens plus a token -> line map.

    ``directory`` is scanned for a regular file whose name is exactly
    ``filename``. (A suffix test on the full path would also accept names such
    as "WrappedBitcoin.txt" when asked for "Bitcoin.txt" and merge both files
    into one index space.) If no such file exists the result is empty.

    Every line of the file is passed through ``extract_and_clean``; note that
    this also appends the line to the module-level ``context_dict``.

    Args:
        filename: File name of the whitepaper, e.g. "Bitcoin.txt".
        stopwords: Container of stop words, forwarded to ``extract_and_clean``.
        directory: Directory holding the whitepaper text files.

    Returns:
        tuple: ``(words_list, words_context_dict)`` where ``words_list`` is the
        flat list of cleaned tokens and ``words_context_dict`` maps each token
        position in ``words_list`` to the zero-based index of the line it came
        from. Lines that yield no tokens still consume a line index.
    """
    words_list = []
    words_context_dict = {}
    context_idx = 0
    # The context manager closes the directory iterator deterministically.
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.name != filename or not entry.is_file():
                continue
            with open(entry.path, "r") as f:
                for line in f:
                    temp_words = extract_and_clean(line, stopwords)
                    # Positions of the new tokens are the slots they occupy
                    # in words_list once appended.
                    first_idx = len(words_list)
                    words_list.extend(temp_words)
                    for word_idx in range(first_idx, len(words_list)):
                        words_context_dict[word_idx] = context_idx
                    context_idx += 1
    return words_list, words_context_dict

In [10]:
# Smoke test on a single paper: read Bitcoin.txt and preview its first 15 cleaned tokens.
# Note: extract_and_clean() appends every processed line to the global context_dict,
# so this read already records Bitcoin's lines there; creat_word_bank() reads
# Bitcoin.txt again later and records them a second time.
bitcoin_filename="Bitcoin.txt"
whitepapers, words_context_dict_test = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:15]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash']

In [10]:
# First ten (token position, line index) pairs: each token position in `whitepapers`
# is mapped to the zero-based index of the line it was extracted from.
list(words_context_dict_test.items())[:10]

[(0, 0),
 (1, 0),
 (2, 0),
 (3, 0),
 (4, 0),
 (5, 1),
 (6, 1),
 (7, 2),
 (8, 3),
 (9, 4)]

In [11]:
# Whitepaper file names to process; creat_word_bank() passes each one to read_whitepapers().
# NOTE(review): the source directory is called "top20_whitepapers" but 18 names are
# listed here — confirm whether two papers are intentionally left out.
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [12]:
# Matches a run of digits with no non-whitespace character directly before or
# after it, i.e. a purely numeric token.
# Written as a raw string: "\S" and "\d" are regex escapes, and in a normal
# string literal they are invalid Python escape sequences (a warning today).
# NOTE(review): no visible cell uses this pattern; the only mention is a
# commented-out line that spells it `compiled_reg`.
compiled_r = re.compile(r'(?<!\S)\d+(?!\S)')

In [13]:
def filter_less_important_words(words):
    """Return the tokens of ``words`` that are tagged as noun, adjective or verb.

    ``words`` is tagged with ``pos_tag``; a token is kept when its tag begins
    with 'N', 'J' or 'V'. Input order is preserved.

    Args:
        words: List of tokens to tag and filter.

    Returns:
        list: The kept tokens.
    """
    kept_prefixes = ('N', 'J', 'V')
    kept = []
    for token, tag in pos_tag(words):
        if tag.startswith(kept_prefixes):
            kept.append(token)
    return kept

In [14]:
def creat_word_bank():
    """Read every paper named in ``filenames`` and collect the results.

    Uses the module-level ``filenames`` list and ``stopwords_set``.

    Returns:
        tuple: ``(word_bank, context_ref)``; both are dicts keyed by file name.
        ``word_bank[name]`` is the token list and ``context_ref[name]`` the
        token-position -> line-index map returned by ``read_whitepapers``.
    """
    word_bank = {}
    context_ref = {}
    for paper in filenames:
        tokens, token_to_line = read_whitepapers(paper, stopwords_set)
        word_bank[paper] = tokens
        context_ref[paper] = token_to_line
    return word_bank, context_ref

In [15]:
def dedupe(words, context_ref):
    """Collapse ``words`` to its distinct items and collect where each one occurs.

    The distinct items are returned in first-seen order. Building the list from
    a ``set`` instead would give an order that changes between interpreter
    runs (string hashing is randomised), which in turn changes how ties are
    ordered in counters built from the result.

    Args:
        words: Sequence of hashable items (tokens or n-gram tuples).
        context_ref: Indexable by position (dict or list); ``context_ref[i]``
            is the context index recorded for ``words[i]``.

    Returns:
        tuple: ``(unique_words, appearance_dict)`` where ``unique_words`` is the
        list of distinct items and ``appearance_dict`` maps each item to the
        list of its context indices, one entry per occurrence.

    Raises:
        KeyError / IndexError: if ``context_ref`` has no entry for a position.
    """
    appearance_dict = {}
    for idx, word in enumerate(words):
        appearance_dict.setdefault(word, []).append(context_ref[idx])
    # Dict keys keep insertion order, so this is the first-seen order.
    return list(appearance_dict), appearance_dict

In [16]:
# Per-paper token lists (word_bank) and token-position -> line-index maps (context_ref)
# for every file name in `filenames`.
word_bank, context_ref = creat_word_bank()

In [17]:
# Global index filled by enrich_lookup_dict(): key -> list of (paper_name, line indices).
# Keys are single tokens (str) and, once the n-gram cells have run, n-gram tuples.
lookup_dict = {}

In [18]:
def enrich_lookup_dict(lookup_dict, words, paper_name, appearance_dict):
    """Add one ``(paper_name, occurrences)`` entry per item of ``words``.

    ``lookup_dict`` is updated in place: for each item, the tuple
    ``(paper_name, appearance_dict[item])`` is appended to the item's list,
    which is created on first use.

    Args:
        lookup_dict: Mapping of item -> list of (paper_name, occurrences).
        words: Iterable of items (tokens or n-gram tuples) to register.
        paper_name: Name of the paper the occurrences belong to.
        appearance_dict: Mapping of item -> occurrences in that paper.

    Raises:
        KeyError: if an item of ``words`` is missing from ``appearance_dict``.
    """
    for key in words:
        bucket = lookup_dict.setdefault(key, [])
        bucket.append((paper_name, appearance_dict[key]))

In [19]:
# Single words
# Tokens are de-duplicated per paper before counting, so each count in
# single_counter is the number of papers that contain the token (not its total
# frequency). The same loop registers every token in the global lookup_dict.
# Note: re-running this cell appends the same entries to lookup_dict again.
agg_words = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(word_bank[coin], context_ref[coin])
    agg_words.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

single_counter = Counter(agg_words)
print(single_counter.most_common(300))
# print([x[0] for x in single_counter.most_common(2000)])

[('system', 18), ('2', 18), ('require', 18), ('paper', 18), ('give', 18), ('7', 18), ('new', 18), ('number', 18), ('1', 18), ('make', 18), ('6', 18), ('user', 18), ('also', 18), ('since', 18), ('take', 18), ('4', 18), ('two', 18), ('5', 18), ('amount', 18), ('one', 18), ('3', 18), ('use', 18), ('many', 18), ('keep', 17), ('transaction', 17), ('key', 17), ('may', 17), ('well', 17), ('exist', 17), ('time', 17), ('work', 17), ('8', 17), ('public', 17), ('need', 17), ('receive', 17), ('follow', 17), ('within', 17), ('become', 17), ('every', 17), ('start', 17), ('network', 17), ('order', 17), ('increase', 17), ('include', 17), ('provide', 17), ('case', 16), ('9', 16), ('know', 16), ('control', 16), ('hold', 16), ('10', 16), ('generate', 16), ('second', 16), ('set', 16), ('able', 16), ('protocol', 16), ('value', 16), ('even', 16), ('high', 16), ('base', 16), ('send', 16), ('access', 16), ('simple', 16), ('reference', 16), ('change', 16), ('see', 16), ('fast', 16), ('possible', 16), ('mean', 

In [20]:
# list(lookup_dict.items())[:1]
# lookup_dict['use']

In [21]:
#2-gram counter
# Bigrams are built over each paper's flat token list, so a bigram may span two
# source lines. dedupe() pairs bigram i with context_ref[coin][i], i.e. the line
# index of the bigram's first token. Bigrams are de-duplicated per paper, so the
# counts are numbers of papers. Tuple keys are added to the same lookup_dict.
agg_words_2gram = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(list(ngrams(word_bank[coin], 2)), context_ref[coin])
    agg_words_2gram.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

two_gram_counter = Counter(agg_words_2gram)
print(two_gram_counter.most_common(250))
# print([x[0] for x in two_gram_counter.most_common(300)])

[(('1', 'introduction'), 10), (('reference', '1'), 10), (('smart', 'contract'), 10), (('transaction', 'fee'), 10), (('private', 'key'), 10), (('allow', 'user'), 9), (('white', 'paper'), 9), (('1', '2'), 8), (('decentralize', 'exchange'), 8), (('use', 'case'), 8), (('period', 'time'), 8), (('large', 'number'), 8), (('create', 'new'), 8), (('transaction', 'per'), 8), (('third', 'party'), 8), (('also', 'provide'), 8), (('public', 'key'), 7), (('per', 'second'), 7), (('token', 'holder'), 7), (('consensus', 'protocol'), 7), (('also', 'use'), 7), (('high', 'level'), 7), (('block', 'block'), 7), (('transaction', 'transaction'), 7), (('transaction', 'cost'), 7), (('merkle', 'tree'), 7), (('data', 'structure'), 7), (('buy', 'sell'), 7), (('would', 'need'), 7), (('step', '1'), 6), (('stake', 'system'), 6), (('digital', 'signature'), 6), (('block', 'transaction'), 6), (('amount', 'time'), 6), (('block', 'hash'), 6), (('1', '1'), 6), (('block', 'time'), 6), (('give', 'time'), 6), (('double', 'spen

In [22]:
# Show, per paper, the line indices recorded for the bigram ('smart', 'contract').
lookup_dict[('smart', 'contract')]

[('Avalanche.txt', [348, 349, 350, 352, 354, 354, 358, 364, 367]),
 ('Chainlink.txt',
  [31,
   36,
   40,
   42,
   55,
   177,
   181,
   183,
   192,
   196,
   201,
   211,
   215,
   222,
   229,
   238,
   278,
   279,
   280,
   283,
   289,
   289,
   296,
   299,
   322,
   354,
   356,
   388,
   391,
   400,
   409,
   411,
   432,
   465,
   472,
   491,
   496,
   535,
   594,
   706,
   756,
   797,
   826,
   831,
   837,
   867,
   881,
   883,
   901,
   1009,
   1052,
   1065,
   1079,
   1131,
   1202,
   1216,
   1431,
   1468,
   1542,
   1582,
   1712,
   1719,
   1724,
   1726,
   1767,
   1770,
   1772,
   1773,
   1776,
   1793,
   1803,
   1817,
   1821,
   1836,
   1838,
   2110,
   2111,
   2297,
   2438,
   2439,
   2476,
   2573,
   2755,
   2764,
   2764,
   2830,
   2832,
   3170,
   3269,
   3293,
   3298,
   3429,
   3506,
   3528,
   3824,
   3825,
   3829,
   3829,
   3832,
   3834,
   3849,
   3852,
   3857,
   3864,
   3975,
   4017,
   4024,
   40

In [23]:
#3-gram counter
# Same procedure as for single tokens, applied to trigrams of each paper's flat
# token list. Trigram i is paired with the line index of its first token, and
# counts are numbers of papers because trigrams are de-duplicated per paper.
agg_words_3gram = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(list(ngrams(word_bank[coin], 3)), context_ref[coin])
    agg_words_3gram.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('transaction', 'per', 'second'), 7), (('ethereum', 'virtual', 'machine'), 4), (('block', 'header', 'block'), 4), (('1', '2', '1'), 4), (('proof', 'stake', 'po'), 4), (('use', 'smart', 'contract'), 4), (('high', 'transaction', 'fee'), 4), (('control', 'private', 'key'), 4), (('cryptology', 'eprint', 'archive'), 3), (('2', '1', '1'), 3), (('digital', 'signature', 'scheme'), 3), (('eprint', 'archive', 'report'), 3), (('electronic', 'cash', 'system'), 3), (('bitcoin', 'peertopeer', 'electronic'), 3), (('long', 'period', 'time'), 3), (('decentralize', 'application', 'dapps'), 3), (('peertopeer', 'electronic', 'cash'), 3), (('every', 'node', 'network'), 3), (('conference', 'computer', 'communication'), 3), (('computer', 'communication', 'security'), 3), (('key', 'public', 'key'), 3), (('symposium', 'security', 'privacy'), 3), (('proof', 'stake', 'system'), 3), (('2', '1', '2'), 3), (('double', 'spend', 'attack'), 3), (('two', 'reason', 'first'), 3), (('1', '2', '2'), 3), (('r', 'r', '1'),

In [24]:
#4-gram counter
# Same procedure as for single tokens, applied to 4-grams of each paper's flat
# token list. 4-gram i is paired with the line index of its first token, and
# counts are numbers of papers because 4-grams are de-duplicated per paper.
agg_words_4gram = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(list(ngrams(word_bank[coin], 4)), context_ref[coin])
    agg_words_4gram.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

four_gram_counter = Counter(agg_words_4gram)
print(four_gram_counter.most_common(150))
# print([x[0] for x in four_gram_counter.most_common(100)])

[(('cryptology', 'eprint', 'archive', 'report'), 3), (('peertopeer', 'electronic', 'cash', 'system'), 3), (('bitcoin', 'peertopeer', 'electronic', 'cash'), 3), (('conference', 'computer', 'communication', 'security'), 3), (('cid16', 'cid17', 'cid16', 'cid17'), 2), (('silvio', 'micali', 'algorand', 'eﬃcient'), 2), (('algorand', 'eﬃcient', 'democratic', 'ledger'), 2), (('veriﬁable', 'random', 'function', 'vrfs'), 2), (('micali', 'algorand', 'eﬃcient', 'democratic'), 2), (('cid17', 'cid16', 'cid17', 'cid16'), 2), (('undertake', 'obligation', 'update', 'forwardlooking'), 2), (('nakamoto', 'bitcoin', 'peertopeer', 'electronic'), 2), (('obligation', 'update', 'forwardlooking', 'statement'), 2), (('ethereum', 'secure', 'decentralise', 'generalise'), 2), (('time', 'pass', 'since', 'last'), 2), (('party', 'transact', 'directly', 'without'), 2), (('instead', 'trust', 'allow', 'two'), 2), (('will', 'party', 'transact', 'directly'), 2), (('two', 'will', 'party', 'transact'), 2), (('block', 'header

In [25]:
# Papers and line indices for the token 'sha256'. Punctuation is stripped during
# cleaning, so "SHA-256" and "SHA256" in the source text share this key.
print(lookup_dict['sha256'])

[('Bitcoin.txt', [82]), ('Ethereum.txt', [119, 865]), ('Solana.txt', [97, 154, 154]), ('Tron.txt', [625, 626])]


In [26]:
# Papers and line indices for 'proofofwork' (the hyphenated "proof-of-work" after cleaning).
print(lookup_dict['proofofwork'])

[('Avalanche.txt', [280, 307]), ('Bitcoin.txt', [10, 11, 17, 79, 82, 85, 87, 93, 95, 96, 99, 103, 109, 110, 163, 168, 327, 332]), ('Cardano.txt', [3146]), ('Chainlink.txt', [3197, 5249, 5251]), ('Ethereum.txt', [40, 41, 103, 116, 146, 171, 177, 438, 674]), ('PolkaDot.txt', [76])]


In [27]:
# Papers and line indices for 'proofofstake' (the hyphenated "proof-of-stake" after cleaning).
print(lookup_dict['proofofstake'])

[('Avalanche.txt', [139, 280, 311]), ('Cardano.txt', [0, 3479, 3534]), ('Chainlink.txt', [652, 3198, 3285, 4377, 5249, 5251]), ('Ethereum.txt', [49, 855]), ('PolkaDot.txt', [407, 1467, 1467, 1474]), ('Polygon.txt', [59])]


In [28]:
# Papers and line indices for the bigram ('consensus', 'protocol').
print(lookup_dict[('consensus', 'protocol')])

[('Avalanche.txt', [41, 79, 87, 96, 125, 180, 191, 275, 289, 292, 295, 517, 555]), ('Cardano.txt', [3280, 3282]), ('Chainlink.txt', [208, 375, 593, 631, 679, 732, 970, 981, 1551, 1919, 1996, 2947, 4045, 4822, 5279]), ('Ethereum.txt', [199, 218]), ('PolkaDot.txt', [1398]), ('Ripple.txt', [3, 5]), ('Solana.txt', [546, 687])]


In [29]:
def lookup_context(lookup_dict, words, directory="../whitepapers/top20_whitepapers/"):
    """Print the source lines in which an indexed token or n-gram was recorded.

    ``lookup_dict[words]`` is a list of ``(filename, line_indices)`` pairs. For
    each pair, ``directory`` is scanned for a regular file named exactly
    ``filename`` and every line whose zero-based index is in ``line_indices``
    is printed as ``"<filename> [line <idx>] : <line>"``. (A suffix test on the
    full path would also read files such as "WrappedBitcoin.txt" when asked for
    "Bitcoin.txt" and shift the line numbering.)

    Each matching line is printed once, even if its index is listed repeatedly.

    Args:
        lookup_dict: Mapping of key -> list of (filename, line_indices).
        words: Key to look up: a token string or an n-gram tuple.
        directory: Directory holding the whitepaper text files.

    Raises:
        KeyError: if ``words`` is not a key of ``lookup_dict``.
    """
    appearances = lookup_dict[words]
    # For each white paper, read and print relevant contents
    for filename, idxs in appearances:
        # Set membership is O(1) per line; the index lists can be long.
        wanted = set(idxs)
        context_idx = 0
        with os.scandir(directory) as entries:
            for entry in entries:
                if entry.name != filename or not entry.is_file():
                    continue
                with open(entry.path, "r") as f:
                    for line in f:
                        if context_idx in wanted:
                            print(filename + " [line " + str(context_idx) + "] : " + line)
                        context_idx += 1

In [30]:
# Print the source lines recorded for the bigram ('consensus', 'protocol').
lookup_context(lookup_dict, ('consensus', 'protocol'))

Avalanche.txt [line 41] : Secure Avalanche is designed to be robust and achieve high security. Classical consensus protocols are

Avalanche.txt [line 79] : of the platform is called “$AVAX”. The family of consensus protocols used by the Avalanche platform is

Avalanche.txt [line 87] : of machines. Therefore, consensus protocols, which enable a group of nodes to achieve agreement, lie at the

Avalanche.txt [line 96] : static deployments. Nakamoto consensus protocols [5,7,4], on the other hand, are robust, but suﬀer from

Avalanche.txt [line 125] : consensus protocols and therefore require full membership knowledge. Knowing the entire set of par-

Avalanche.txt [line 180] : of consensus protocols through a set of 8 critical axes.

Avalanche.txt [line 191] : Table 1. Comparative chart between the three known families of consensus protocols. Avalanche, Snowman, and

Avalanche.txt [line 275] : Consensus protocols provide their security guarantees under the assumption that up to a threshold 

In [31]:
# Print the source lines recorded for the token 'sha256'.
lookup_context(lookup_dict, 'sha256')

Bitcoin.txt [line 82] : The proof-of-work involves scanning for a value that when hashed, such as with SHA-256, the 

Ethereum.txt [line 119] : preventing sybil attackers from remaking the entire blockchain in their favor. Because SHA256 is designed

Ethereum.txt [line 865] : The Bitcoin mining algorithm works by having miners compute SHA256 on slightly modified versions of

Solana.txt [line 97] : sha256, ripemd, etc.), run the function from some random starting value

Solana.txt [line 154] : sha256 of the photograph. The index and the sha256 of the photograph are

Tron.txt [line 625] : The raw data then undergoes SHA-256 hashing. The private key corresponding to the contract 

Tron.txt [line 626] : address then signs the result of the SHA256 hash. The signature result is then added to the 



In [32]:
# Print the source lines recorded for the bigram ('proof', 'stake'); the recorded
# output shows it matching the un-hyphenated phrase "proof of stake".
lookup_context(lookup_dict, ('proof', 'stake'))

Cardano.txt [line 4] : We present “Ouroboros,” the ﬁrst blockchain protocol based on proof of stake with rig-

Cardano.txt [line 6] : those achieved by the bitcoin blockchain protocol. As the protocol provides a “proof of stake”

Cardano.txt [line 9] : centivizing proof of stake protocols and we prove that, given this mechanism, honest behavior

Cardano.txt [line 27] : A natural alternative mechanism relies on the notion of “proof of stake” (PoS). Rather than

Cardano.txt [line 43] : ideal; however, realizing such a proof of stake protocol appears to involve a number of deﬁnitional,

Cardano.txt [line 45] : Previous work. The concept of PoS has been discussed extensively in the bitcoin forum.1 Proof

Cardano.txt [line 50] : Heuristic proof of stake based blockchain protocols have been proposed (and implemented) for a

Cardano.txt [line 73] : Our Results. We present “Ouroboros,” a provably secure proof of stake system. To the best of

Cardano.txt [line 80] : 1See “Proof of stake instead

In [26]:
# Raw lines stored for 'proofofstake' in the global context_dict, which is filled as a
# side effect of every extract_and_clean() call (all papers, in the order they were read).
print(context_dict['proofofstake'])

['of the system. In almost every proof-of-stake protocol that attempts to scale to a large participant set,\n', 'of Sybil deterrence mechanisms that span proof-of-work (PoW), proof-of-stake (PoS), proof-of-elapsed-time\n', 'proof-of-stake, because it is green, accessible, and open to all. We note, however, that while the $AVAX uses\n', 'Ouroboros: A Provably Secure Proof-of-Stake Blockchain Protocol\n', 'Ouroboros genesis: Composable proof-of-stake blockchains with dynamic availability. In\n', 'praos: An adaptively-secure, semi-synchronous proof-of-stake protocol. IACR Cryptology\n', 'blockchain, e.g., in Proof-of-Stake systems in which committees are selected to execute\n', 'require the majority of hash power to be honest, Proof-of-Stake systems typically re-\n', 'of-work, proof-of-stake, and permissioned systems are susceptible to prospective\n', '[204] Tezos. Proof-of-Stake in Tezos. https://tezos.gitlab.io/whitedoc/proof_of_stake.\n', 'ble in and can impact the security of proof-of

In [28]:
# Raw lines stored for 'proofofwork' in the global context_dict. Bitcoin.txt is read both
# by the single-paper test cell and by creat_word_bank(), so its lines are recorded twice.
print(context_dict['proofofwork'])

['hash-based proof-of-work, forming a record that cannot be changed without redoing \n', 'the proof-of-work.  The longest chain not only serves as proof of the sequence of \n', 'proof-of-work chain as proof of what happened while they were gone.\n', '4. Proof-of-Work\n', 'The proof-of-work involves scanning for a value that when hashed, such as with SHA-256, the \n', 'For our timestamp network, we implement the proof-of-work by incrementing a nonce in the \n', 'effort has been expended to make it satisfy the proof-of-work, the block cannot be changed \n', 'The proof-of-work also solves the problem of determining representation in majority decision \n', 'able to allocate many IPs.  Proof-of-work is essentially one-CPU-one-vote.  The majority \n', 'decision is represented by the longest chain, which has the greatest proof-of-work effort invested \n', 'redo the proof-of-work of the block and all blocks after it and then catch up with and surpass the \n', 'the proof-of-work difficulty is d