### Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup         

In [4]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag

### Whitepaper datasource

In [5]:
stopwords_set = set(stopwords.words('english'))

In [6]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    return words

In [7]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # words_list.extend(line.split())
                    # words_list.extend([x.strip() for x in line.split()])
                    words_list.extend(extract_and_clean(line, stopwords))
    return words_list

In [8]:
bitcoin_filename="Bitcoin.txt"
whitepapers = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:25]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash',
 'would',
 'allow',
 'online',
 'payments',
 'sent',
 'directly',
 'one',
 'party',
 'another',
 'without']

In [9]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [10]:
compiled_r = re.compile('(?<!\S)\d+(?!\S)')

In [11]:
def filter_less_important_words(words):
    tagged = pos_tag(words)
    return [w[0] for w in tagged if w[1].startswith('N') or w[1].startswith('J')]
    # return [w for w in words if not compiled_reg.match(w)]

In [12]:
def creat_word_bank():
    word_bank = {}
    for i in filenames:
        #deduplicate in each paper
        deduped_words = list(set(read_whitepapers(i, stopwords_set)))
        word_bank[i] = filter_less_important_words(deduped_words)
    return word_bank

In [13]:
word_bank = creat_word_bank()

In [14]:
# Single words
agg_words = []
for coin in word_bank:
    agg_words.extend(word_bank[coin])

single_counter = Counter(agg_words)
# print(single_counter.most_common(700))
print([x[0] for x in single_counter.most_common(2000)])

['users', 'new', 'many', 'number', 'network', 'amount', 'paper', 'key', 'system', 'time', 'transactions', 'public', 'able', 'order', 'access', 'value', 'possible', 'current', 'future', 'full', 'simple', 'case', 'potential', 'private', 'second', 'funds', 'large', 'process', 'fees', 'data', 'size', 'example', 'transaction', 'problem', 'way', 'abstract', 'small', 'digital', 'block', 'fee', 'additional', 'systems', 'information', 'point', 'single', 'low', 'work', 'security', 'high', 'consensus', 'space', 'history', 'entire', 'several', 'initial', 'multiple', 'currency', 'changes', 'control', 'period', 'function', 'previous', 'cryptocurrency', 'introduction', 'references', 'next', 'total', 'particular', 'risk', 'similar', 'common', 'implementation', 'section', 'part', 'source', 'costs', 'loss', 'assets', 'use', 'cost', 'exchange', 'due', 'available', 'operations', 'record', 'addition', 'stake', 'blocks', 'blockchain', 'signature', 'protocol', 'cases', 'complete', 'conditions', 'types', 'cre

In [15]:
#2-gram counter
agg_words_2gram = []
for coin in word_bank:
    agg_words_2gram.extend(ngrams(word_bank[coin], 2))

two_gram_counter = Counter(agg_words_2gram)
# print(two_gram_counter.most_common(150))
print([x[0] for x in two_gram_counter.most_common(300)])

[('cryptocurrency', 'record'), ('strong', 'value'), ('tokens', 'operations'), ('event', 'transaction'), ('amount', 'users'), ('long', 'access'), ('private', 'avoid'), ('ownership', 'safe'), ('public', 'transfer'), ('get', 'key'), ('previous', 'facilitate'), ('signature', 'proposal'), ('potential', 'start'), ('creation', 'virtual'), ('block', 'ownership'), ('correct', 'attack'), ('special', 'easy'), ('security', 'transition'), ('use', 'best'), ('details', 'many'), ('history', 'open'), ('gas', 'different'), ('key', 'timestamp'), ('point', 'software'), ('data', 'equivalent'), ('paper', 'malicious'), ('problem', 'maximum'), ('layer', 'second'), ('consensus', 'initial'), ('call', 'prevent'), ('random', 'stake'), ('balance', 'represent'), ('validator', 'active'), ('active', 'r'), ('blocks', 'release'), ('virtual', 'members'), ('users', 'parameters'), ('vote', 'features'), ('unit', 'source'), ('enable', 'byzantine'), ('node', 'completion'), ('best', 'proofofstake'), ('desirable', 'proofofwork

In [16]:
#3-gram counter
agg_words_3gram = []
for coin in word_bank:
    agg_words_3gram.extend(ngrams(word_bank[coin], 3))

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('creation', 'virtual', 'members'), 5), (('block', 'ownership', 'safe'), 5), (('open', 'former', 'see'), 4), (('transfers', 'individual', 'wallets'), 4), (('validator', 'active', 'r'), 3), (('true', 'correct', 'attack'), 3), (('special', 'easy', 'enjoy'), 3), (('use', 'best', 'proofofstake'), 3), (('main', 'period', 'generation'), 3), (('period', 'generation', 'space'), 3), (('term', 'tokens', 'operations'), 3), (('credit', 'technology', 'usd'), 3), (('strong', 'value', 'alternative'), 3), (('bitcoin', 'special', 'easy'), 3), (('holders', 'needs', 'attacks'), 3), (('accept', 'get', 'key'), 3), (('key', 'timestamp', 'state'), 3), (('public', 'transfer', 'scale'), 3), (('tokens', 'operations', 'request'), 3), (('success', 'monthly', 'conditions'), 3), (('sender', 'event', 'transaction'), 3), (('event', 'transaction', 'complexity'), 3), (('network', 'banking', 'capabilities'), 3), (('history', 'open', 'former'), 3), (('see', 'fund', 'year'), 3), (('point', 'software', 'general'), 3), ((

In [17]:
#4-gram counter
agg_words_4gram = []
for coin in word_bank:
    agg_words_4gram.extend(ngrams(word_bank[coin], 4))

four_gram_counter = Counter(agg_words_4gram)
# print(four_gram_counter.most_common(150))
print([x[0] for x in four_gram_counter.most_common(100)])

[('main', 'period', 'generation', 'space'), ('history', 'open', 'former', 'see'), ('tokens', 'operations', 'conversion', 'request'), ('validator', 'active', 'r', 'future'), ('supply', 'feature', 'cap', 'hold'), ('release', 'creation', 'virtual', 'members'), ('language', 'true', 'correct', 'attack'), ('terms', 'main', 'period', 'generation'), ('assumptions', 'term', 'tokens', 'operations'), ('conclusion', 'point', 'software', 'digital'), ('bitcoin', 'special', 'easy', 'democracy'), ('get', 'key', 'timestamp', 'state'), ('key', 'timestamp', 'state', 'larger'), ('quantity', 'assumptions', 'term', 'operations'), ('public', 'transfer', 'scale', 'allows'), ('millions', 'contributors', 'home', 'privacy'), ('sender', 'event', 'transaction', 'complexity'), ('development', 'pain', 'providing', 'rules'), ('consensus', 'initial', 'holders', 'attacks'), ('open', 'former', 'see', 'fund'), ('former', 'see', 'fund', 'year'), ('ownership', 'safe', 'servers', 'validity'), ('sale', 'works', 'problematic'