### Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [4]:
lemmatizer = WordNetLemmatizer()

In [5]:
lemmatizer.lemmatize("requires", pos='v')

'require'

### Whitepaper datasource

In [6]:
stopwords_set = set(stopwords.words('english'))

In [7]:
context_dict = {}
def enrich_context_dict(context_dict, words, line):
    for w in words:
        context_dict.setdefault(w,[]).append(line)

In [8]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    words = [lemmatizer.lemmatize(w, pos='s') for w in words]
    words = [lemmatizer.lemmatize(w, pos='n') for w in words]
    words = [lemmatizer.lemmatize(w, pos='v') for w in words]
    words = [lemmatizer.lemmatize(w, pos='a') for w in words]
    enrich_context_dict(context_dict, words, line)
    return words

In [9]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # words_list.extend(line.split())
                    # words_list.extend([x.strip() for x in line.split()])
                    words_list.extend(extract_and_clean(line, stopwords))
    return words_list

In [10]:
bitcoin_filename="Bitcoin.txt"
whitepapers = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:50]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash',
 'would',
 'allow',
 'online',
 'payment',
 'send',
 'directly',
 'one',
 'party',
 'another',
 'without',
 'go',
 'financial',
 'institution',
 'digital',
 'signature',
 'provide',
 'part',
 'solution',
 'main',
 'benefit',
 'lose',
 'trust',
 'third',
 'party',
 'still',
 'require',
 'prevent',
 'doublespending',
 'propose',
 'solution',
 'doublespending',
 'problem',
 'use',
 'peertopeer',
 'network']

In [11]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [12]:
compiled_r = re.compile('(?<!\S)\d+(?!\S)')

In [13]:
def filter_less_important_words(words):
    tagged = pos_tag(words)
    # Only keep verb, noun, adj
    return [w[0] for w in tagged if w[1].startswith('N') or w[1].startswith('J') or w[1].startswith('V')]
    # return [w for w in words if not compiled_reg.match(w)]

In [14]:
def creat_word_bank():
    word_bank = {}
    for i in filenames:
        #deduplicate in each paper
        deduped_words = list(set(read_whitepapers(i, stopwords_set)))
        word_bank[i] = filter_less_important_words(deduped_words)
    return word_bank

In [15]:
word_bank = creat_word_bank()

In [16]:
lookup_dict = {}

def enrich_lookup_dict(lookup_dict, words, paper_name):
    for w in words:
        lookup_dict.setdefault(w,[]).append(paper_name)
        # lookup_dict[w] = paper_name

In [17]:
# Single words
agg_words = []
for coin in word_bank:
    agg_words.extend(word_bank[coin])
    enrich_lookup_dict(lookup_dict, word_bank[coin], coin)

single_counter = Counter(agg_words)
print(single_counter.most_common(300))
# print([x[0] for x in single_counter.most_common(2000)])

[('system', 18), ('many', 18), ('amount', 18), ('give', 18), ('new', 18), ('number', 18), ('take', 18), ('require', 18), ('make', 18), ('paper', 18), ('exist', 17), ('start', 17), ('key', 17), ('transaction', 17), ('receive', 17), ('time', 17), ('public', 17), ('become', 17), ('follow', 17), ('use', 17), ('network', 17), ('work', 17), ('keep', 17), ('order', 17), ('include', 17), ('increase', 17), ('high', 16), ('hold', 16), ('change', 16), ('generate', 16), ('see', 16), ('send', 16), ('reference', 16), ('mean', 16), ('set', 16), ('able', 16), ('second', 16), ('base', 16), ('possible', 16), ('know', 16), ('access', 16), ('control', 16), ('simple', 16), ('case', 16), ('value', 16), ('fee', 16), ('problem', 16), ('reduce', 16), ('large', 16), ('result', 16), ('current', 16), ('share', 16), ('create', 16), ('fund', 16), ('way', 16), ('full', 15), ('update', 15), ('particular', 15), ('protocol', 15), ('future', 15), ('function', 15), ('point', 15), ('potential', 15), ('need', 15), ('privat

In [18]:
#2-gram counter
agg_words_2gram = []
for coin in word_bank:
    words = list(ngrams(word_bank[coin], 2))
    agg_words_2gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

two_gram_counter = Counter(agg_words_2gram)
print(two_gram_counter.most_common(250))
# print([x[0] for x in two_gram_counter.most_common(300)])

[(('require', 'move'), 13), (('transfer', 'high'), 10), (('state', 'size'), 10), (('many', 'future'), 10), (('open', 'incentive'), 9), (('requirement', 'additional'), 9), (('make', 'share'), 9), (('base', 'prove'), 8), (('example', 'account'), 8), (('time', 'result'), 8), (('important', 'currency'), 8), (('product', 'represent'), 8), (('refer', 'guarantee'), 7), (('mean', 'v'), 7), (('necessary', 'able'), 7), (('v', 'provide'), 7), (('provide', 'trade'), 7), (('owner', 'valid'), 7), (('track', 'space'), 7), (('le', 'change'), 7), (('work', 'main'), 7), (('day', 'requirement'), 7), (('access', 'role'), 7), (('algorithm', 'keep'), 7), (('describe', 'order'), 6), (('development', 'environment'), 6), (('storage', 'limit'), 6), (('time', 'payment'), 6), (('throughput', 'chain'), 6), (('begin', 'several'), 6), (('result', 'look'), 6), (('ensure', 'sell'), 6), (('identify', 'key'), 6), (('cost', 'mine'), 6), (('order', 'describe'), 6), (('key', 'recent'), 6), (('evaluate', 'current'), 6), (('

In [19]:
# print(lookup_dict)

In [20]:
#3-gram counter
agg_words_3gram = []
for coin in word_bank:
    words = list(ngrams(word_bank[coin], 3))
    agg_words_3gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('day', 'requirement', 'additional'), 6), (('run', 'example', 'account'), 5), (('capability', 'important', 'currency'), 5), (('time', 'payment', 'result'), 4), (('precise', 'fork', 'matter'), 4), (('mean', 'v', 'provide'), 4), (('implement', 'second', 'join'), 4), (('time', 'result', 'look'), 4), (('make', 'share', 'security'), 4), (('channel', 'point', 'enough'), 4), (('maintain', 'customer', 'owner'), 4), (('refer', 'guarantee', 'notice'), 4), (('identify', 'key', 'recent'), 4), (('liquidity', 'necessary', 'able'), 4), (('impossible', 'algorithm', 'keep'), 4), (('result', 'look', 'turn'), 4), (('complexity', 'diﬀerent', 'exist'), 3), (('update', 'introduce', 'generate'), 3), (('able', 'implement', 'second'), 3), (('cryptocurrencies', 'cryptographic', 'secure'), 3), (('transfer', 'high', 'minute'), 3), (('focus', 'policy', 'refer'), 3), (('v', 'provide', 'trade'), 3), (('decentralize', 'blockchains', 'large'), 3), (('additional', 'mechanism', 'follow'), 3), (('list', 'robust', 'cont

In [21]:
#4-gram counter
agg_words_4gram = []
for coin in word_bank:
    words = list(ngrams(word_bank[coin], 4))
    agg_words_4gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

four_gram_counter = Counter(agg_words_4gram)
print(four_gram_counter.most_common(150))
# print([x[0] for x in four_gram_counter.most_common(100)])

[(('last', 'binary', 'fair', 'work'), 3), (('time', 'result', 'look', 'turn'), 3), (('ownership', 'transfer', 'high', 'minute'), 2), (('focus', 'policy', 'refer', 'guarantee'), 2), (('mean', 'forwardlooking', 'v', 'provide'), 2), (('able', 'poa', 'second', 'first'), 2), (('care', 'history', 'expect', 'base'), 2), (('ethereum', 'compensation', 'growth', 'response'), 2), (('lock', 'requirement', 'additional', 'mechanism'), 2), (('requirement', 'additional', 'mechanism', 'follow'), 2), (('use', 'enter', 'hash', 'group'), 2), (('introduction', 'much', 'market', 'avoid'), 2), (('able', 'implement', 'second', 'join'), 2), (('competition', 'cofounder', 'standard', 'plan'), 2), (('take', 'slow', 'maintain', 'customer'), 2), (('scenario', 'counterparty', 'request', 'distribution'), 2), (('consult', 'core', 'par', 'settlement'), 2), (('facilitate', 'do', 'get', 'new'), 2), (('prototype', 'refer', 'guarantee', 'notice'), 2), (('censorship', 'avoid', 'game', 'contain'), 2), (('mean', 'v', 'provide

In [29]:
print(lookup_dict[('store', 'ie', 'creation', 'lead')])

KeyError: ('store', 'ie', 'creation', 'lead')

In [30]:
print(lookup_dict[('secure', 'vision', 'ensure', 'state')])

KeyError: ('secure', 'vision', 'ensure', 'state')

In [31]:
print(lookup_dict[('uniswap', 'mandatory')])

KeyError: ('uniswap', 'mandatory')

In [23]:
print(lookup_dict['sha256'])

['Bitcoin.txt', 'Ethereum.txt', 'Solana.txt', 'Tron.txt']


In [24]:
print(lookup_dict['proofofwork'])

['Avalanche.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt', 'Ethereum.txt', 'PolkaDot.txt']


In [25]:
print(lookup_dict['proofofstake'])

['Avalanche.txt', 'Cardano.txt', 'Chainlink.txt', 'Ethereum.txt', 'PolkaDot.txt', 'Polygon.txt']


In [26]:
print(context_dict['proofofstake'])

['of the system. In almost every proof-of-stake protocol that attempts to scale to a large participant set,\n', 'of Sybil deterrence mechanisms that span proof-of-work (PoW), proof-of-stake (PoS), proof-of-elapsed-time\n', 'proof-of-stake, because it is green, accessible, and open to all. We note, however, that while the $AVAX uses\n', 'Ouroboros: A Provably Secure Proof-of-Stake Blockchain Protocol\n', 'Ouroboros genesis: Composable proof-of-stake blockchains with dynamic availability. In\n', 'praos: An adaptively-secure, semi-synchronous proof-of-stake protocol. IACR Cryptology\n', 'blockchain, e.g., in Proof-of-Stake systems in which committees are selected to execute\n', 'require the majority of hash power to be honest, Proof-of-Stake systems typically re-\n', 'of-work, proof-of-stake, and permissioned systems are susceptible to prospective\n', '[204] Tezos. Proof-of-Stake in Tezos. https://tezos.gitlab.io/whitedoc/proof_of_stake.\n', 'ble in and can impact the security of proof-of

In [28]:
print(context_dict['proofofwork'])

['hash-based proof-of-work, forming a record that cannot be changed without redoing \n', 'the proof-of-work.  The longest chain not only serves as proof of the sequence of \n', 'proof-of-work chain as proof of what happened while they were gone.\n', '4. Proof-of-Work\n', 'The proof-of-work involves scanning for a value that when hashed, such as with SHA-256, the \n', 'For our timestamp network, we implement the proof-of-work by incrementing a nonce in the \n', 'effort has been expended to make it satisfy the proof-of-work, the block cannot be changed \n', 'The proof-of-work also solves the problem of determining representation in majority decision \n', 'able to allocate many IPs.  Proof-of-work is essentially one-CPU-one-vote.  The majority \n', 'decision is represented by the longest chain, which has the greatest proof-of-work effort invested \n', 'redo the proof-of-work of the block and all blocks after it and then catch up with and surpass the \n', 'the proof-of-work difficulty is d