### Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [4]:
lemmatizer = WordNetLemmatizer()

In [5]:
lemmatizer.lemmatize("requires", pos='v')

'require'

### Whitepaper datasource

In [6]:
stopwords_set = set(stopwords.words('english'))

In [7]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    words = [lemmatizer.lemmatize(w, pos='s') for w in words]
    words = [lemmatizer.lemmatize(w, pos='n') for w in words]
    words = [lemmatizer.lemmatize(w, pos='v') for w in words]
    words = [lemmatizer.lemmatize(w, pos='a') for w in words]
    return words

In [8]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # words_list.extend(line.split())
                    # words_list.extend([x.strip() for x in line.split()])
                    words_list.extend(extract_and_clean(line, stopwords))
    return words_list

In [9]:
bitcoin_filename="Bitcoin.txt"
whitepapers = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:50]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash',
 'would',
 'allow',
 'online',
 'payment',
 'send',
 'directly',
 'one',
 'party',
 'another',
 'without',
 'go',
 'financial',
 'institution',
 'digital',
 'signature',
 'provide',
 'part',
 'solution',
 'main',
 'benefit',
 'lose',
 'trust',
 'third',
 'party',
 'still',
 'require',
 'prevent',
 'doublespending',
 'propose',
 'solution',
 'doublespending',
 'problem',
 'use',
 'peertopeer',
 'network']

In [10]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [11]:
compiled_r = re.compile('(?<!\S)\d+(?!\S)')

In [12]:
def filter_less_important_words(words):
    tagged = pos_tag(words)
    # Only keep verb, noun, adj
    return [w[0] for w in tagged if w[1].startswith('N') or w[1].startswith('J') or w[1].startswith('V')]
    # return [w for w in words if not compiled_reg.match(w)]

In [13]:
def creat_word_bank():
    word_bank = {}
    for i in filenames:
        word_bank[i] = read_whitepapers(i, stopwords_set)
        #deduplicate in each paper
        # deduped_words = list(set(read_whitepapers(i, stopwords_set)))
        # word_bank[i] = filter_less_important_words(deduped_words)
    return word_bank

In [14]:
def dedupe(words):
    return list(set(words))

In [15]:
word_bank = creat_word_bank()

In [16]:
lookup_dict = {}

In [17]:
def enrich_lookup_dict(lookup_dict, words, paper_name):
    for w in words:
        lookup_dict.setdefault(w,[]).append(paper_name)
        # lookup_dict[w] = paper_name

In [18]:
# Single words
agg_words = []
for coin in word_bank:
    agg_words.extend(dedupe(word_bank[coin]))
    enrich_lookup_dict(lookup_dict, word_bank[coin], coin)

single_counter = Counter(agg_words)
print(single_counter.most_common(300))
# print([x[0] for x in single_counter.most_common(2000)])

[('system', 18), ('require', 18), ('3', 18), ('user', 18), ('6', 18), ('make', 18), ('take', 18), ('one', 18), ('since', 18), ('number', 18), ('two', 18), ('amount', 18), ('give', 18), ('many', 18), ('use', 18), ('2', 18), ('also', 18), ('5', 18), ('paper', 18), ('4', 18), ('new', 18), ('1', 18), ('7', 18), ('follow', 17), ('8', 17), ('become', 17), ('within', 17), ('key', 17), ('order', 17), ('receive', 17), ('start', 17), ('time', 17), ('every', 17), ('may', 17), ('network', 17), ('transaction', 17), ('well', 17), ('public', 17), ('need', 17), ('work', 17), ('keep', 17), ('exist', 17), ('include', 17), ('increase', 17), ('provide', 17), ('generate', 16), ('simple', 16), ('able', 16), ('reference', 16), ('protocol', 16), ('access', 16), ('possible', 16), ('allow', 16), ('case', 16), ('high', 16), ('set', 16), ('base', 16), ('change', 16), ('even', 16), ('10', 16), ('mean', 16), ('know', 16), ('second', 16), ('control', 16), ('fast', 16), ('value', 16), ('see', 16), ('9', 16), ('hold',

In [19]:
#2-gram counter
agg_words_2gram = []
for coin in word_bank:
    words = dedupe(ngrams(word_bank[coin], 2))
    agg_words_2gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

two_gram_counter = Counter(agg_words_2gram)
print(two_gram_counter.most_common(250))
# print([x[0] for x in two_gram_counter.most_common(300)])

[(('reference', '1'), 10), (('1', 'introduction'), 10), (('smart', 'contract'), 10), (('private', 'key'), 10), (('transaction', 'fee'), 10), (('allow', 'user'), 9), (('white', 'paper'), 9), (('1', '2'), 8), (('use', 'case'), 8), (('decentralize', 'exchange'), 8), (('period', 'time'), 8), (('create', 'new'), 8), (('transaction', 'per'), 8), (('large', 'number'), 8), (('third', 'party'), 8), (('also', 'provide'), 8), (('public', 'key'), 7), (('token', 'holder'), 7), (('per', 'second'), 7), (('also', 'use'), 7), (('consensus', 'protocol'), 7), (('high', 'level'), 7), (('transaction', 'cost'), 7), (('merkle', 'tree'), 7), (('transaction', 'transaction'), 7), (('block', 'block'), 7), (('data', 'structure'), 7), (('buy', 'sell'), 7), (('would', 'need'), 7), (('1', '1'), 6), (('block', 'transaction'), 6), (('step', '1'), 6), (('block', 'hash'), 6), (('digital', 'signature'), 6), (('amount', 'time'), 6), (('block', 'time'), 6), (('give', 'time'), 6), (('stake', 'system'), 6), (('protocol', 'pr

In [20]:
# print(lookup_dict)

In [21]:
#3-gram counter
agg_words_3gram = []
for coin in word_bank:
    words = dedupe(ngrams(word_bank[coin], 3))
    agg_words_3gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('transaction', 'per', 'second'), 7), (('ethereum', 'virtual', 'machine'), 4), (('block', 'header', 'block'), 4), (('proof', 'stake', 'po'), 4), (('1', '2', '1'), 4), (('use', 'smart', 'contract'), 4), (('high', 'transaction', 'fee'), 4), (('control', 'private', 'key'), 4), (('eprint', 'archive', 'report'), 3), (('digital', 'signature', 'scheme'), 3), (('2', '1', '1'), 3), (('cryptology', 'eprint', 'archive'), 3), (('bitcoin', 'peertopeer', 'electronic'), 3), (('decentralize', 'application', 'dapps'), 3), (('peertopeer', 'electronic', 'cash'), 3), (('electronic', 'cash', 'system'), 3), (('long', 'period', 'time'), 3), (('every', 'node', 'network'), 3), (('conference', 'computer', 'communication'), 3), (('computer', 'communication', 'security'), 3), (('symposium', 'security', 'privacy'), 3), (('key', 'public', 'key'), 3), (('2', '1', '2'), 3), (('double', 'spend', 'attack'), 3), (('least', 'one', 'honest'), 3), (('r', 'r', '1'), 3), (('two', 'reason', 'first'), 3), (('1', '2', '2'), 3

In [22]:
#4-gram counter
agg_words_4gram = []
for coin in word_bank:
    words = dedupe(ngrams(word_bank[coin], 4))
    agg_words_4gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

four_gram_counter = Counter(agg_words_4gram)
print(four_gram_counter.most_common(150))
# print([x[0] for x in four_gram_counter.most_common(100)])

[(('cryptology', 'eprint', 'archive', 'report'), 3), (('peertopeer', 'electronic', 'cash', 'system'), 3), (('bitcoin', 'peertopeer', 'electronic', 'cash'), 3), (('conference', 'computer', 'communication', 'security'), 3), (('cid16', 'cid17', 'cid16', 'cid17'), 2), (('micali', 'algorand', 'eﬃcient', 'democratic'), 2), (('veriﬁable', 'random', 'function', 'vrfs'), 2), (('cid17', 'cid16', 'cid17', 'cid16'), 2), (('algorand', 'eﬃcient', 'democratic', 'ledger'), 2), (('silvio', 'micali', 'algorand', 'eﬃcient'), 2), (('obligation', 'update', 'forwardlooking', 'statement'), 2), (('nakamoto', 'bitcoin', 'peertopeer', 'electronic'), 2), (('ethereum', 'secure', 'decentralise', 'generalise'), 2), (('undertake', 'obligation', 'update', 'forwardlooking'), 2), (('time', 'pass', 'since', 'last'), 2), (('acm', 'conference', 'computer', 'communication'), 2), (('party', 'transact', 'directly', 'without'), 2), (('proof', 'instead', 'trust', 'allow'), 2), (('system', 'base', 'cryptographic', 'proof'), 2),

In [23]:
print(lookup_dict['sha256'])

['Bitcoin.txt', 'Ethereum.txt', 'Ethereum.txt', 'Solana.txt', 'Solana.txt', 'Solana.txt', 'Tron.txt', 'Tron.txt']


In [24]:
print(lookup_dict['proofofwork'])

['Avalanche.txt', 'Avalanche.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt', 'Chainlink.txt', 'Chainlink.txt', 'Ethereum.txt', 'Ethereum.txt', 'Ethereum.txt', 'Ethereum.txt', 'Ethereum.txt', 'Ethereum.txt', 'Ethereum.txt', 'Ethereum.txt', 'Ethereum.txt', 'PolkaDot.txt']


In [25]:
print(lookup_dict['proofofstake'])

['Avalanche.txt', 'Avalanche.txt', 'Avalanche.txt', 'Cardano.txt', 'Cardano.txt', 'Cardano.txt', 'Chainlink.txt', 'Chainlink.txt', 'Chainlink.txt', 'Chainlink.txt', 'Chainlink.txt', 'Chainlink.txt', 'Ethereum.txt', 'Ethereum.txt', 'PolkaDot.txt', 'PolkaDot.txt', 'PolkaDot.txt', 'PolkaDot.txt', 'Polygon.txt']


In [26]:
print(lookup_dict[('consensus', 'protocol')])

['Avalanche.txt', 'Cardano.txt', 'Chainlink.txt', 'Ethereum.txt', 'PolkaDot.txt', 'Ripple.txt', 'Solana.txt']
