### Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [4]:
lemmatizer = WordNetLemmatizer()

In [5]:
lemmatizer.lemmatize("requires", pos='v')

'require'

### Whitepaper datasource

In [6]:
stopwords_set = set(stopwords.words('english'))

In [7]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    words = [lemmatizer.lemmatize(w, pos='s') for w in words]
    words = [lemmatizer.lemmatize(w, pos='n') for w in words]
    words = [lemmatizer.lemmatize(w, pos='v') for w in words]
    words = [lemmatizer.lemmatize(w, pos='a') for w in words]
    return words

In [8]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # words_list.extend(line.split())
                    # words_list.extend([x.strip() for x in line.split()])
                    words_list.extend(extract_and_clean(line, stopwords))
    return words_list

In [9]:
bitcoin_filename="Bitcoin.txt"
whitepapers = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:100]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash',
 'would',
 'allow',
 'online',
 'payment',
 'send',
 'directly',
 'one',
 'party',
 'another',
 'without',
 'go',
 'financial',
 'institution',
 'digital',
 'signature',
 'provide',
 'part',
 'solution',
 'main',
 'benefit',
 'lose',
 'trust',
 'third',
 'party',
 'still',
 'require',
 'prevent',
 'doublespending',
 'propose',
 'solution',
 'doublespending',
 'problem',
 'use',
 'peertopeer',
 'network',
 'network',
 'timestamps',
 'transaction',
 'hash',
 'ongoing',
 'chain',
 'hashbased',
 'proofofwork',
 'form',
 'record',
 'change',
 'without',
 'redo',
 'proofofwork',
 'long',
 'chain',
 'serf',
 'proof',
 'sequence',
 'event',
 'witness',
 'proof',
 'come',
 'large',
 'pool',
 'cpu',
 'power',
 'long',
 'majority',
 'cpu',
 'power',
 'control',
 'node',
 'cooperate',
 'attack',
 'netw

In [10]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [11]:
compiled_r = re.compile('(?<!\S)\d+(?!\S)')

In [12]:
def filter_less_important_words(words):
    tagged = pos_tag(words)
    # Only keep verb, noun, adj
    return [w[0] for w in tagged if w[1].startswith('N') or w[1].startswith('J') or w[1].startswith('V')]
    # return [w for w in words if not compiled_reg.match(w)]

In [13]:
def creat_word_bank():
    word_bank = {}
    for i in filenames:
        #deduplicate in each paper
        deduped_words = list(set(read_whitepapers(i, stopwords_set)))
        word_bank[i] = filter_less_important_words(deduped_words)
    return word_bank

In [14]:
word_bank = creat_word_bank()

In [15]:
# Single words
agg_words = []
for coin in word_bank:
    agg_words.extend(word_bank[coin])

single_counter = Counter(agg_words)
print(single_counter.most_common(300))
# print([x[0] for x in single_counter.most_common(2000)])

[('paper', 18), ('require', 18), ('amount', 18), ('take', 18), ('many', 18), ('make', 18), ('give', 18), ('use', 18), ('number', 18), ('system', 18), ('new', 18), ('exist', 17), ('receive', 17), ('follow', 17), ('key', 17), ('start', 17), ('transaction', 17), ('work', 17), ('order', 17), ('network', 17), ('public', 17), ('need', 17), ('time', 17), ('increase', 17), ('include', 17), ('base', 16), ('high', 16), ('become', 16), ('mean', 16), ('protocol', 16), ('change', 16), ('possible', 16), ('access', 16), ('hold', 16), ('second', 16), ('send', 16), ('set', 16), ('case', 16), ('simple', 16), ('reference', 16), ('see', 16), ('control', 16), ('able', 16), ('generate', 16), ('value', 16), ('keep', 16), ('create', 16), ('result', 16), ('fund', 16), ('fee', 16), ('large', 16), ('reduce', 16), ('current', 16), ('share', 16), ('problem', 16), ('way', 16), ('update', 15), ('future', 15), ('particular', 15), ('choose', 15), ('full', 15), ('potential', 15), ('function', 15), ('point', 15), ('mode

In [19]:
#2-gram counter
agg_words_2gram = []
for coin in word_bank:
    agg_words_2gram.extend(ngrams(word_bank[coin], 2))

two_gram_counter = Counter(agg_words_2gram)
print(two_gram_counter.most_common(250))
# print([x[0] for x in two_gram_counter.most_common(300)])

[(('paper', 'include'), 13), (('node', 'network'), 11), (('future', 'total'), 10), (('blockchain', 'client'), 10), (('come', 'participant'), 10), (('cryptographic', 'exist'), 9), (('consume', 'receive'), 9), (('prevent', 'remain'), 9), (('initial', 'large'), 9), (('let', 'currency'), 9), (('introduce', 'grow'), 8), (('take', 'individual'), 8), (('secure', 'identity'), 8), (('create', 'hash'), 8), (('balance', 'ecosystem'), 8), (('act', 'risk'), 8), (('agree', 'requirement'), 8), (('developer', 'hold'), 8), (('model', 'group'), 8), (('entire', 'successful'), 8), (('propose', 'change'), 8), (('step', 'add'), 7), (('order', 'access'), 7), (('represent', 'launch'), 7), (('price', 'new'), 7), (('access', 'creation'), 7), (('able', 'machine'), 7), (('design', 'attempt'), 7), (('robust', 'model'), 7), (('algorithm', 'know'), 7), (('client', 'traditional'), 7), (('today', 'give'), 7), (('current', 'rest'), 7), (('live', 'see'), 7), (('rate', 'dollar'), 7), (('separate', 'communicate'), 6), (('

In [17]:
#3-gram counter
agg_words_3gram = []
for coin in word_bank:
    agg_words_3gram.extend(ngrams(word_bank[coin], 3))

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('scalability', 'node', 'network'), 6), (('blockchain', 'client', 'traditional'), 6), (('robust', 'model', 'group'), 5), (('future', 'total', 'base'), 4), (('world', 'limit', 'basis'), 4), (('balance', 'ecosystem', 'enable'), 4), (('country', 'come', 'participant'), 4), (('see', 'earn', 'need'), 4), (('order', 'access', 'information'), 4), (('constant', 'create', 'hash'), 4), (('foundation', 'act', 'risk'), 4), (('much', 'ideal', 'facilitate'), 4), (('work', 'effect', 'allow'), 4), (('manage', 'price', 'new'), 4), (('spot', 'cryptographic', 'exist'), 4), (('improvement', 'let', 'currency'), 4), (('rate', 'dollar', 'settle'), 4), (('introduce', 'grow', 'depend'), 3), (('strategy', 'code', 'complex'), 3), (('computer', 'represent', 'launch'), 3), (('genesis', 'smart', 'support'), 3), (('transition', 'maximal', 'term'), 3), (('download', 'idea', 'object'), 3), (('contract', 'con', 'result'), 3), (('maintain', 'short', 'say'), 3), (('true', 'pay', 'address'), 3), (('primitive', 'avoid', 

In [18]:
#4-gram counter
agg_words_4gram = []
for coin in word_bank:
    agg_words_4gram.extend(ngrams(word_bank[coin], 4))

four_gram_counter = Counter(agg_words_4gram)
print(four_gram_counter.most_common(150))
# print([x[0] for x in four_gram_counter.most_common(100)])

[('mention', 'build', 'compromise', 'legal'), ('company', 'white', 'work', 'effect'), ('white', 'work', 'effect', 'allow'), ('dozen', 'strategy', 'code', 'complex'), ('statement', 'condition', 'hardcoded', 'r'), ('proportional', 'transition', 'maximal', 'term'), ('transition', 'maximal', 'term', 'discussion'), ('download', 'idea', 'object', 'governance'), ('instant', 'mode', 'maintain', 'short'), ('dns', 'custom', 'reader', 'account'), ('reduce', 'relate', 'rich', 'provide'), ('apis', 'content', 'ledger', 'type'), ('language', 'doesnt', 'late', 'deploy'), ('list', 'ﬁxed', 'able', 'machine'), ('advance', 'form', 'prevent', 'remain'), ('form', 'prevent', 'remain', 'serialize'), ('robust', 'model', 'group', 'store'), ('antonopoulos', 'entire', 'successful', 'system'), ('entire', 'successful', 'system', 'continuous'), ('live', 'see', 'earn', 'need'), ('receive', 'independent', 'payment', 'verification'), ('code', 'practical', 'propose', 'change'), ('security', 'order', 'access', 'informati