### Import library

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
# Import BeautifulSoup into your workspace
from bs4 import BeautifulSoup         

In [4]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize

### Whitepaper datasource

In [5]:
stopwords_set = set(stopwords.words('english'))

In [6]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    return words

In [7]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # words_list.extend(line.split())
                    # words_list.extend([x.strip() for x in line.split()])
                    words_list.extend(extract_and_clean(line, stopwords))
    return words_list

In [8]:
bitcoin_filename="Bitcoin.txt"
whitepapers = read_whitepapers(bitcoin_filename, stopwords_set)
# whitepapers.rename(columns={bitcoin_filename: "whitepapers"}, inplace=True)
whitepapers[:25]

['bitcoin',
 'peertopeer',
 'electronic',
 'cash',
 'system',
 'satoshi',
 'nakamoto',
 'satoshingmxcom',
 'wwwbitcoinorg',
 'abstract',
 'purely',
 'peertopeer',
 'version',
 'electronic',
 'cash',
 'would',
 'allow',
 'online',
 'payments',
 'sent',
 'directly',
 'one',
 'party',
 'another',
 'without']

In [9]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [10]:
def creat_word_bank():
    word_bank = {}
    for i in filenames:
        #deduplicate in each paper
        word_bank[i] = list(set(read_whitepapers(i, stopwords_set)))
    return word_bank

In [11]:
word_bank = creat_word_bank()

In [44]:
# Single words
agg_words = []
for coin in word_bank:
    agg_words.extend(word_bank[coin])

single_counter = Counter(agg_words)
# print(single_counter.most_common(700))
print([x[0] for x in single_counter.most_common(2000)])

['users', '4', 'number', 'use', '3', 'many', '5', 'used', '7', '6', 'new', 'one', '2', 'two', 'user', 'given', '1', 'since', 'amount', 'also', 'may', 'using', 'transactions', '8', 'within', 'system', 'well', 'key', 'make', 'time', 'public', 'paper', 'every', 'network', 'simple', '10', 'receive', '9', 'value', 'able', 'order', 'need', 'even', 'access', 'possible', 'based', 'provide', '11', 'required', 'like', 'current', 'future', 'without', 'protocol', 'would', 'second', 'full', 'function', 'set', 'private', 'particular', 'case', '25', 'first', 'potential', 'following', 'process', '15', 'existing', 'problem', 'size', 'fee', 'must', 'fees', 'transaction', 'per', 'could', 'data', 'less', 'work', 'high', '12', 'large', 'funds', 'example', 'multiple', 'way', 'ensure', 'means', 'however', 'small', 'together', 'digital', 'still', 'see', 'already', 'much', 'abstract', 'keep', 'block', 'long', 'another', 'fast', 'blockchain', 'control', '13', 'currently', 'prevent', 'get', 'entire', 'informatio

In [35]:
#2-gram counter
agg_words_2gram = []
for coin in word_bank:
    agg_words_2gram.extend(ngrams(word_bank[coin], 2))

two_gram_counter = Counter(agg_words_2gram)
# print(two_gram_counter.most_common(150))
print([x[0] for x in two_gram_counter.most_common(300)])

[('existing', '3'), ('become', 'consensus'), ('via', 'create'), ('fees', 'includes'), ('increase', '30'), ('useful', 'allows'), ('multiple', 'increase'), ('due', 'initial'), ('check', 'using'), ('included', 'make'), ('many', '5'), ('information', 'costs'), ('without', 'storage'), ('implemented', 'node'), ('following', 'world'), ('services', 'miners'), ('every', 'less'), ('use', 'holds'), ('time', 'particular'), ('amount', 'long'), ('follows', 'total'), ('requirements', 'eg'), ('required', 'ecosystem'), ('small', 'list'), ('rewards', 'cases'), ('8', 'c'), ('funds', 'together'), ('19', '11'), ('growing', 'key'), ('using', 'becomes'), ('2', 'system'), ('times', 'instant'), ('introduced', 'potential'), ('require', 'instead'), ('full', 'technology'), ('5', 'store'), ('ensure', 'work'), ('20', 'still'), ('payment', 'purposes'), ('take', 'receives'), ('within', 'verify'), ('10', 'record'), ('represents', 'current'), ('started', 'users'), ('describes', 'second'), ('guaranteed', 'message'), ('n

In [37]:
#3-gram counter
agg_words_3gram = []
for coin in word_bank:
    agg_words_3gram.extend(ngrams(word_bank[coin], 3))

three_gram_counter = Counter(agg_words_3gram)
print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

[(('multiple', 'increase', '30'), 7), (('difficult', 'without', 'storage'), 6), (('create', 'implemented', 'node'), 5), (('many', '5', 'store'), 5), (('check', 'using', 'becomes'), 5), (('without', 'storage', 'cost'), 4), (('via', 'create', 'implemented'), 4), (('special', 'rewards', 'cases'), 4), (('fees', 'includes', 'help'), 4), (('almost', 'information', 'costs'), 4), (('parties', 'private', 'successful'), 4), (('amount', 'long', 'ties'), 4), (('outputs', 'result', 'error'), 4), (('rest', 'useful', 'allows'), 4), (('given', 'functionality', 'analysis'), 4), (('names', 'build', 'evm'), 4), (('stake', 'supported', 'web'), 4), (('risk', 'true', 'trust'), 4), (('works', 'existing', '3'), 4), (('updated', 'hashes', 'enough'), 3), (('2', 'system', 'sign'), 3), (('100', 'times', 'instant'), 3), (('instance', 'called', 'economic'), 3), (('constant', 'become', 'consensus'), 3), (('two', 'adaptive', 'issues'), 3), (('abstract', 'duration', 'validate'), 3), (('required', 'ecosystem', 'operate

In [40]:
#4-gram counter
agg_words_4gram = []
for coin in word_bank:
    agg_words_4gram.extend(ngrams(word_bank[coin], 4))

four_gram_counter = Counter(agg_words_4gram)
# print(four_gram_counter.most_common(150))
print([x[0] for x in four_gram_counter.most_common(100)])

[('via', 'create', 'implemented', 'node'), ('contributions', 'many', '5', 'store'), ('introduced', 'potential', 'satisfy', 'remaining'), ('within', 'verify', 'payment', 'purposes'), ('difficult', 'without', 'storage', 'cost'), ('updated', 'hashes', 'enough', 'let'), ('properties', 'choose', 'users', 'collectively'), ('choose', 'users', 'collectively', 'established'), ('without', 'storage', 'cost', 'reserve'), ('lock', 'every', 'implementations', 'less'), ('every', 'implementations', 'less', 'proposals'), ('implementations', 'less', 'proposals', '13'), ('pow', 'exists', 'bitcoin', 'propose'), ('loss', 'via', 'create', 'implemented'), ('theoretical', 'anyone', 'frontier', 'time'), ('required', 'ecosystem', 'operates', 'mainnet'), ('important', 'following', 'world', 'large'), ('valid', 'operation', 'dependent', 'tps'), ('operation', 'dependent', 'tps', 'blockchains'), ('daniel', 'full', 'technology', 'equally'), ('paper', 'environment', 'model', 'running'), ('many', 'live', 'huobi', '5'),

In [14]:
df_whitepapers = create_dataframe()
df_whitepapers.columns = filenames
df_whitepapers

NameError: name 'create_dataframe' is not defined