### Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [4]:
lemmatizer = WordNetLemmatizer()

In [5]:
lemmatizer.lemmatize("requires", pos='v')

'require'

### Whitepaper datasource

In [6]:
stopwords_set = set(stopwords.words('english'))

In [7]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    words = [lemmatizer.lemmatize(w, pos='s') for w in words]
    words = [lemmatizer.lemmatize(w, pos='n') for w in words]
    words = [lemmatizer.lemmatize(w, pos='v') for w in words]
    words = [lemmatizer.lemmatize(w, pos='a') for w in words]
    return words

In [8]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # words_list.extend(line.split())
                    # words_list.extend([x.strip() for x in line.split()])
                    words_list.extend(extract_and_clean(line, stopwords))
    return words_list

In [9]:
bitcoin_filename="Bitcoin.txt"
whitepapers = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:50]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash',
 'would',
 'allow',
 'online',
 'payment',
 'send',
 'directly',
 'one',
 'party',
 'another',
 'without',
 'go',
 'financial',
 'institution',
 'digital',
 'signature',
 'provide',
 'part',
 'solution',
 'main',
 'benefit',
 'lose',
 'trust',
 'third',
 'party',
 'still',
 'require',
 'prevent',
 'doublespending',
 'propose',
 'solution',
 'doublespending',
 'problem',
 'use',
 'peertopeer',
 'network']

In [10]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [11]:
compiled_r = re.compile('(?<!\S)\d+(?!\S)')

In [12]:
def filter_less_important_words(words):
    tagged = pos_tag(words)
    # Only keep verb, noun, adj
    return [w[0] for w in tagged if w[1].startswith('N') or w[1].startswith('J') or w[1].startswith('V')]
    # return [w for w in words if not compiled_reg.match(w)]

In [13]:
def creat_word_bank():
    word_bank = {}
    for i in filenames:
        #deduplicate in each paper
        deduped_words = list(set(read_whitepapers(i, stopwords_set)))
        word_bank[i] = filter_less_important_words(deduped_words)
    return word_bank

In [14]:
word_bank = creat_word_bank()

In [15]:
lookup_dict = {}

In [16]:
def enrich_lookup_dict(lookup_dict, words, paper_name):
    for w in words:
        lookup_dict.setdefault(w,[]).append(paper_name)
        # lookup_dict[w] = paper_name

In [17]:
# Single words
agg_words = []
for coin in word_bank:
    agg_words.extend(word_bank[coin])
    enrich_lookup_dict(lookup_dict, word_bank[coin], coin)

single_counter = Counter(agg_words)
print(single_counter.most_common(300))
# print([x[0] for x in single_counter.most_common(2000)])

[('give', 18), ('system', 18), ('new', 18), ('amount', 18), ('require', 18), ('many', 18), ('make', 18), ('take', 18), ('number', 18), ('paper', 18), ('use', 17), ('start', 17), ('transaction', 17), ('become', 17), ('network', 17), ('work', 17), ('receive', 17), ('follow', 17), ('key', 17), ('public', 17), ('order', 17), ('keep', 17), ('exist', 17), ('time', 17), ('increase', 17), ('include', 17), ('second', 16), ('control', 16), ('base', 16), ('simple', 16), ('send', 16), ('access', 16), ('know', 16), ('see', 16), ('value', 16), ('change', 16), ('generate', 16), ('protocol', 16), ('able', 16), ('case', 16), ('hold', 16), ('high', 16), ('mean', 16), ('set', 16), ('possible', 16), ('reference', 16), ('share', 16), ('fund', 16), ('create', 16), ('store', 16), ('current', 16), ('large', 16), ('result', 16), ('reduce', 16), ('problem', 16), ('fee', 16), ('way', 16), ('future', 15), ('update', 15), ('function', 15), ('need', 15), ('particular', 15), ('point', 15), ('full', 15), ('potential'

In [18]:
#2-gram counter
agg_words_2gram = []
for coin in word_bank:
    words = list(ngrams(word_bank[coin], 2))
    agg_words_2gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

two_gram_counter = Counter(agg_words_2gram)
print(two_gram_counter.most_common(250))
# print([x[0] for x in two_gram_counter.most_common(300)])

[(('low', 'transaction'), 14), (('receive', 'form'), 11), (('increase', 'implement'), 11), (('ensure', 'state'), 10), (('reach', 'market'), 10), (('new', 'amount'), 9), (('much', 'easy'), 9), (('estimate', 'new'), 9), (('use', 'give'), 8), (('participant', 'version'), 8), (('input', 'previous'), 8), (('creation', 'lead'), 8), (('pay', 'ability'), 8), (('key', 'many'), 8), (('success', 'potential'), 8), (('give', 'pay'), 8), (('interest', 'model'), 8), (('share', 'equal'), 8), (('write', 'work'), 8), (('see', 'step'), 7), (('signature', 'participant'), 7), (('particular', 'case'), 7), (('additional', 'validators'), 7), (('reduce', 'active'), 7), (('system', 'limit'), 7), (('distribute', 'history'), 7), (('server', 'consensus'), 7), (('malicious', 'network'), 7), (('second', 'solve'), 7), (('transaction', 'track'), 6), (('achieve', 'send'), 6), (('introduction', 'network'), 6), (('vision', 'ensure'), 6), (('user', 'independent'), 6), (('face', 'current'), 6), (('game', 'crypto'), 6), (('

In [27]:
# print(lookup_dict)

In [20]:
#3-gram counter
agg_words_3gram = []
for coin in word_bank:
    words = list(ngrams(word_bank[coin], 3))
    agg_words_3gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('signature', 'participant', 'version'), 6), (('vision', 'ensure', 'state'), 6), (('additional', 'validators', 'holder'), 6), (('provide', 'success', 'potential'), 5), (('give', 'pay', 'ability'), 5), (('space', 'setup', 'particular'), 5), (('estimate', 'new', 'amount'), 5), (('malicious', 'introduction', 'network'), 4), (('miner', 'cryptocurrencies', 'notation'), 4), (('incentive', 'software', 'security'), 4), (('reach', 'market', 'custodian'), 4), (('store', 'ie', 'creation'), 4), (('ie', 'creation', 'lead'), 4), (('face', 'current', 'wish'), 4), (('ownership', 'special', 'reduce'), 4), (('low', 'transaction', 'track'), 4), (('introduction', 'network', 'represent'), 4), (('place', 'interest', 'model'), 4), (('achieve', 'successful', 'send'), 4), (('see', 'step', 'contract'), 4), (('distribute', 'history', 'reward'), 4), (('secure', 'vision', 'ensure'), 3), (('maintain', 'par', 'create'), 3), (('create', 'come', 'perform'), 3), (('private', 'completion', 'reference'), 3), (('creatio

In [21]:
#4-gram counter
agg_words_4gram = []
for coin in word_bank:
    words = list(ngrams(word_bank[coin], 4))
    agg_words_4gram.extend(words)
    enrich_lookup_dict(lookup_dict, words, coin)

four_gram_counter = Counter(agg_words_4gram)
print(four_gram_counter.most_common(150))
# print([x[0] for x in four_gram_counter.most_common(100)])

[(('store', 'ie', 'creation', 'lead'), 4), (('secure', 'vision', 'ensure', 'state'), 3), (('ie', 'creation', 'lead', 'rich'), 3), (('goal', 'additional', 'validators', 'holder'), 3), (('space', 'setup', 'particular', 'case'), 3), (('use', 'give', 'pay', 'ability'), 3), (('long', 'malicious', 'introduction', 'network'), 3), (('require', 'enable', 'public', 'connect'), 2), (('create', 'come', 'perform', 'true'), 2), (('broad', 'private', 'completion', 'reference'), 2), (('private', 'completion', 'reference', 'select'), 2), (('performance', 'implication', 'store', 'ie'), 2), (('creation', 'lead', 'rich', 'happen'), 2), (('cryptography', 'cost', 'data', 'participate'), 2), (('propose', 'goal', 'additional', 'validators'), 2), (('neighbor', 'typical', 'agree', 'put'), 2), (('engine', 'line', 'theoretical', 'sell'), 2), (('phase', 'claim', 'election', 'endpoint'), 2), (('reduce', 'active', 'store', 'basic'), 2), (('abstract', 'introduction', 'network', 'represent'), 2), (('denial', 'exact', 

In [24]:
print(lookup_dict[('store', 'ie', 'creation', 'lead')])

['Avalanche.txt', 'Crypto_com.txt', 'PolkaDot.txt', 'Polygon.txt']


In [25]:
print(lookup_dict[('secure', 'vision', 'ensure', 'state')])

['Avalanche.txt', 'Ethereum.txt', 'Tron.txt']


In [26]:
print(lookup_dict[('uniswap', 'mandatory')])

['Uniswap.txt']


In [28]:
print(lookup_dict['sha256'])

['Bitcoin.txt', 'Ethereum.txt', 'Solana.txt', 'Tron.txt']


In [29]:
print(lookup_dict['proofofwork'])

['Avalanche.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt', 'Ethereum.txt', 'PolkaDot.txt']


In [30]:
print(lookup_dict['proofofstake'])

['Avalanche.txt', 'Cardano.txt', 'Chainlink.txt', 'Ethereum.txt', 'PolkaDot.txt', 'Polygon.txt']
