### Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup         

In [4]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize

### Whitepaper datasource

In [5]:
stopwords_set = set(stopwords.words('english'))

In [6]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    return words

In [7]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # words_list.extend(line.split())
                    # words_list.extend([x.strip() for x in line.split()])
                    words_list.extend(extract_and_clean(line, stopwords))
    return words_list

In [8]:
bitcoin_filename="Bitcoin.txt"
whitepapers = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:25]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash',
 'would',
 'allow',
 'online',
 'payments',
 'sent',
 'directly',
 'one',
 'party',
 'another',
 'without']

In [9]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [10]:
def creat_word_bank():
    word_bank = {}
    for i in filenames:
        #deduplicate in each paper
        word_bank[i] = list(set(read_whitepapers(i, stopwords_set)))
    return word_bank

In [11]:
word_bank = creat_word_bank()

In [12]:
# Single words
agg_words = []
for coin in word_bank:
    agg_words.extend(word_bank[coin])

single_counter = Counter(agg_words)
# print(single_counter.most_common(700))
print([x[0] for x in single_counter.most_common(2000)])

['7', 'many', '3', '6', '4', 'number', 'amount', 'user', 'since', '1', 'also', 'two', 'one', '2', 'given', 'use', 'new', 'used', 'users', '5', 'using', 'network', 'transactions', 'may', 'public', 'paper', 'make', 'well', 'key', '8', 'time', 'every', 'within', 'system', 'value', 'able', '10', 'even', 'access', 'simple', 'possible', 'order', 'based', 'need', '9', 'receive', 'like', 'current', '11', 'provide', 'required', 'following', 'full', 'case', 'private', 'second', 'first', 'without', 'potential', 'would', 'function', 'set', '25', 'future', 'protocol', 'particular', 'transaction', 'fees', 'work', 'fee', 'must', 'data', 'high', '15', 'could', 'funds', 'problem', '12', 'existing', 'per', 'example', 'less', 'size', 'process', 'large', 'multiple', 'way', 'still', 'small', 'control', 'digital', 'block', 'abstract', 'another', 'much', 'means', 'together', 'see', 'already', 'however', 'blockchain', 'fast', 'long', 'keep', '13', 'ensure', 'single', 'point', 'security', 'consensus', 'low', '

In [13]:
#2-gram counter
agg_words_2gram = []
for coin in word_bank:
    agg_words_2gram.extend(ngrams(word_bank[coin], 2))

two_gram_counter = Counter(agg_words_2gram)
# print(two_gram_counter.most_common(150))
print([x[0] for x in two_gram_counter.most_common(300)])

[('2', 'together'), ('used', 'introduction'), ('prevent', 'per'), ('several', 'based'), ('transactions', 'small'), ('pool', 'ability'), ('best', 'limited'), ('role', 'given'), ('per', 'support'), ('protocols', 'code'), ('14', 'supply'), ('know', 'problem'), ('users', 'etc'), ('necessary', 'randomly'), ('work', 'economic'), ('run', 'set'), ('like', 'build'), ('applications', 'create'), ('machine', 'user'), ('current', '15'), ('network', 'times'), ('associated', '1'), ('oracle', 'also'), ('easy', 'received'), ('period', 'work'), ('price', 'every'), ('management', 'holders'), ('holders', 'incentives'), ('entire', 'established'), ('software', 'move'), ('store', 'first'), ('become', '17'), ('17', 'two'), ('must', 'already'), ('version', 'mechanism'), ('7', 'reducing'), ('loss', 'around'), ('requirement', 'messages'), ('development', 'setup'), ('vote', 'paper'), ('selected', 'functions'), ('already', 'new'), ('f', 'attack'), ('growth', 'separate'), ('lower', 'rules'), ('blockchains', '50'), 

In [14]:
#3-gram counter
agg_words_3gram = []
for coin in word_bank:
    agg_words_3gram.extend(ngrams(word_bank[coin], 3))

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('prevent', 'per', 'support'), 7), (('period', 'work', 'economic'), 6), (('management', 'holders', 'incentives'), 5), (('become', '17', 'two'), 5), (('run', 'set', 'implementing'), 4), (('current', '15', '32'), 4), (('2', 'together', 'pending'), 4), (('easy', 'received', 'developed'), 4), (('independently', 'store', 'first'), 4), (('must', 'already', 'new'), 4), (('upon', 'pos', 'working'), 4), (('modify', 'users', 'etc'), 4), (('current', '32', '15'), 4), (('merkle', 'machine', 'user'), 4), (('thus', 'vote', 'paper'), 3), (('throughput', 'transaction', 'number'), 3), (('allows', 'actual', 'fee'), 3), (('phase', 'like', 'build'), 3), (('goal', 'queried', '2'), 3), (('predictable', 'blockchain', 'information'), 3), (('electronic', 'globally', 'ie'), 3), (('future', 'risks', 'including'), 3), (('others', 'currently', 'machines'), 3), (('mechanisms', 'satoshi', 'blocks'), 3), (('found', 'theory', 'become'), 3), (('adopted', 'retaining', 'previously'), 3), (('standard', 'requirement', 'm

In [15]:
#4-gram counter
agg_words_4gram = []
for coin in word_bank:
    agg_words_4gram.extend(ngrams(word_bank[coin], 4))

four_gram_counter = Counter(agg_words_4gram)
# print(four_gram_counter.most_common(150))
print([x[0] for x in four_gram_counter.most_common(100)])

[('delivery', 'thus', 'vote', 'paper'), ('proposed', 'used', 'introduction', 'cryptographic'), ('period', 'work', 'economic', 'cryptocurrencies'), ('line', 'computers', 'fully', 'state'), ('computers', 'fully', 'state', 'members'), ('fully', 'state', 'members', 'begins'), ('end', 'validating', 'reached', 'enforce'), ('13', 'type', 'stack', 'reduces'), ('lower', 'rules', 'gained', 'management'), ('rules', 'gained', 'management', 'holders'), ('member', 'lost', 'vision', 'statements'), ('validated', 'prevent', 'per', 'support'), ('prevent', 'per', 'support', 'repeated'), ('hence', 'exchange', 'prevents', 'fact'), ('transaction', 'private', 'authority', 'number'), ('hashes', 'mechanisms', 'satoshi', 'blocks'), ('must', 'already', 'new', '28'), ('calculated', 'denote', 'better', 'maintaining'), ('current', '15', '32', 'computed'), ('produces', 'framework', 'reduce', 'gain'), ('framework', 'reduce', 'gain', 'btc'), ('compared', 'length', 'produce', 'additionally'), ('concepts', 'providing', 