# Import libraries

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
#!pip install BeautifulSoup4
# import nltk
# nltk.download()  # Download text data sets, including stop words

In [3]:
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer

In [4]:
# Shared WordNet lemmatizer instance; extract_and_clean() reads this module-level name.
lemmatizer = WordNetLemmatizer()

In [5]:
# Sanity check: lemmatizing "requires" as a verb (pos='v'); the cell output shows 'require'.
lemmatizer.lemmatize("requires", pos='v')

'require'

# Whitepaper data source

In [6]:
# NLTK's English stop-word list, held as a set so the per-token membership test is O(1).
stopwords_set = set(stopwords.words('english'))

In [7]:
def extract_and_clean(line, stopwords):
    """Turn one line of text into a list of normalised, lemmatised tokens.

    The line is lower-cased, every character that is neither a word character
    nor whitespace is deleted (so "SHA-256" becomes "sha256" and
    "proof-of-work" becomes "proofofwork"), and the result is tokenised with
    NLTK's ``word_tokenize``. Tokens found in ``stopwords`` are dropped and the
    remaining ones are lemmatised with the module-level ``lemmatizer``.

    Args:
        line: Raw text line.
        stopwords: Container of tokens to discard (tested with ``in``). Note
            that inside this function the name shadows the ``stopwords``
            corpus imported from ``nltk.corpus``.

    Returns:
        list[str]: Cleaned tokens in their original order.
    """
    normalized = re.sub(r'[^\w\s]', '', line.lower())

    cleaned = []
    for token in word_tokenize(normalized):
        if token in stopwords:
            continue
        # WordNet POS tags: ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'.
        # Each token goes through the lemmatizer once per tag, in this order,
        # feeding the result of one pass into the next.
        for tag in ('s', 'n', 'v', 'a'):
            token = lemmatizer.lemmatize(token, pos=tag)
        cleaned.append(token)
    return cleaned

In [8]:
def read_whitepapers(filename, stopwords,
                     directory="../whitepapers/top20_whitepapers/",
                     encoding="utf-8"):
    """Read one whitepaper and return its cleaned tokens plus a token->line map.

    Every regular file in ``directory`` whose path ends with ``filename`` is
    read line by line; each line is passed through ``extract_and_clean``.

    Args:
        filename: File name (path suffix) of the whitepaper text file.
        stopwords: Container of tokens to drop, forwarded to
            ``extract_and_clean``.
        directory: Folder that holds the whitepaper text files. Defaults to the
            location this notebook has always used.
        encoding: Text encoding used to open the files. It is passed
            explicitly because the whitepapers contain non-ASCII characters
            (curly quotes, ligatures) and the platform default is not UTF-8
            everywhere.

    Returns:
        tuple[list[str], dict[int, int]]: ``(words_list, words_context_dict)``
        where ``words_context_dict[i]`` is the 0-based index of the line that
        produced ``words_list[i]``.
    """
    words_list = []
    words_context_dict = {}
    context_idx = 0
    # The context manager closes the directory handle even if reading fails.
    with os.scandir(directory) as entries:
        for entry in entries:
            # NOTE: suffix match, kept as-is so the line indices stay in sync
            # with lookup_context(), which selects files the same way.
            if entry.path.endswith(filename) and entry.is_file():
                with open(entry.path, "r", encoding=encoding) as f:
                    for line in f:
                        temp_words = extract_and_clean(line, stopwords)
                        first_idx = len(words_list)
                        words_list.extend(temp_words)
                        # Every token produced by this line points back to it.
                        for word_idx in range(first_idx, len(words_list)):
                            words_context_dict[word_idx] = context_idx
                        context_idx += 1
    return words_list, words_context_dict

In [11]:
# Whitepaper text files to analyse (18 entries); creat_word_bank() iterates over this list
# and uses each file name as the key of word_bank / context_ref.
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [14]:
def creat_word_bank():
    """Load every whitepaper listed in the module-level ``filenames``.

    Returns:
        tuple[dict, dict]: ``(word_bank, context_ref)``. Both are keyed by file
        name; ``word_bank[name]`` is the token list and ``context_ref[name]``
        is the token-index -> line-index mapping, exactly as returned by
        ``read_whitepapers(name, stopwords_set)``.
    """
    word_bank, context_ref = {}, {}
    for paper in filenames:
        tokens, token_lines = read_whitepapers(paper, stopwords_set)
        word_bank[paper] = tokens
        context_ref[paper] = token_lines
    return word_bank, context_ref

In [15]:
def dedupe(words, context_ref):
    """Collapse a token (or n-gram) sequence to its unique items.

    Args:
        words: Sequence of hashable items (single tokens or n-gram tuples).
        context_ref: Mapping from position in ``words`` to a context (line)
            index. For n-grams this is the line of the n-gram's first token.

    Returns:
        tuple[list, dict]: ``(unique_items, appearance_dict)``.
        ``unique_items`` lists each distinct item once, in order of first
        appearance. ``appearance_dict[item]`` is the list of context indices
        for every occurrence of ``item`` (duplicates kept when an item occurs
        several times in the same context).

    The unique list used to be built with ``list(set(words))``, whose order
    changes with the interpreter's hash seed. That made Counter tie-breaking,
    and therefore the "most common" listings and the whitelist file order,
    differ between kernel sessions. First-appearance order is deterministic.
    """
    appearance_dict = {}
    for idx, word in enumerate(words):
        appearance_dict.setdefault(word, []).append(context_ref[idx])
    # dict preserves insertion order, so its keys are the unique items in
    # first-appearance order.
    return list(appearance_dict), appearance_dict

In [16]:
# Read every whitepaper once: word_bank maps file name -> token list,
# context_ref maps file name -> {token index: line index}.
word_bank, context_ref = creat_word_bank()

In [17]:
# Global index filled by the n-gram cells below: term -> [(paper file name, [line indices]), ...].
# NOTE: the n-gram cells append to this dict, so re-run this cell before re-running them,
# otherwise entries are duplicated.
lookup_dict = {}

In [18]:
def enrich_lookup_dict(lookup_dict, words, paper_name, appearance_dict):
    """Record, for each term, where it appears in one paper.

    Mutates ``lookup_dict`` in place: for every ``term`` in ``words`` the tuple
    ``(paper_name, appearance_dict[term])`` is appended to
    ``lookup_dict[term]`` (the list is created on first use).

    Args:
        lookup_dict: Index to update, ``term -> list of (paper, contexts)``.
        words: Terms (tokens or n-gram tuples) found in the paper.
        paper_name: Identifier of the paper the terms came from.
        appearance_dict: ``term -> list of context indices`` for this paper.

    Returns:
        None.
    """
    for term in words:
        bucket = lookup_dict.setdefault(term, [])
        bucket.append((paper_name, appearance_dict[term]))

# Construct Word Counter for n-grams

In [111]:
# Single words
# Each paper contributes every distinct token exactly once (dedupe), so
# single_counter[token] is the number of papers containing that token,
# not its total frequency.
# Side effect: lookup_dict gains (paper, line indices) entries for every token.
agg_words = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(word_bank[coin], context_ref[coin])
    agg_words.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

single_counter = Counter(agg_words)
# print(single_counter.most_common(300))
# print([x[0] for x in single_counter.most_common(2000)])

In [112]:
# 2-gram counter
# 2-grams are built over each paper's cleaned token list, so they can span
# removed stop words and line breaks. Each distinct 2-gram is counted once per
# paper; its recorded line index is the line of its first token.
# Side effect: lookup_dict gains an entry keyed by each 2-gram tuple.
agg_words_2gram = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(list(ngrams(word_bank[coin], 2)), context_ref[coin])
    agg_words_2gram.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

two_gram_counter = Counter(agg_words_2gram)
# print(two_gram_counter.most_common(250))
# print([x[0] for x in two_gram_counter.most_common(300)])

In [115]:
# 3-gram counter
# Same procedure as the 2-gram cell with n=3: each distinct 3-gram is counted
# once per paper and indexed in lookup_dict under its tuple.
agg_words_3gram = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(list(ngrams(word_bank[coin], 3)), context_ref[coin])
    agg_words_3gram.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

three_gram_counter = Counter(agg_words_3gram)
# print(three_gram_counter.most_common(150))
# print([x[0] for x in three_gram_counter.most_common(300)])

In [116]:
# 4-gram counter
# Same procedure as the 2-gram cell with n=4: each distinct 4-gram is counted
# once per paper and indexed in lookup_dict under its tuple.
agg_words_4gram = []
for coin in word_bank:
    deduped_words, appearance_dict = dedupe(list(ngrams(word_bank[coin], 4)), context_ref[coin])
    agg_words_4gram.extend(deduped_words)
    enrich_lookup_dict(lookup_dict, deduped_words, coin, appearance_dict)

four_gram_counter = Counter(agg_words_4gram)
# print(four_gram_counter.most_common(150))
# print([x[0] for x in four_gram_counter.most_common(100)])

# Show the most common word/words

In [131]:
def show_most_common(counter, top_n, min_count, max_count):
    """Print up to ``top_n`` entries of ``counter`` whose count is in range.

    Entries are visited in ``counter.most_common()`` order (highest count
    first) and printed as ``<term> ---  Count: <count>`` when
    ``min_count <= count <= max_count``.

    Args:
        counter: ``collections.Counter`` of terms.
        top_n: Maximum number of lines to print; ``0`` prints nothing.
        min_count: Smallest count to show (inclusive).
        max_count: Largest count to show (inclusive).
    """
    shown = 0
    for term, count in counter.most_common():
        if shown == top_n:
            break
        if min_count <= count <= max_count:
            print(str(term) + " ---  Count: " + str(count))
            shown += 1

In [132]:
# First 10 single tokens that occur in between 2 and 15 papers (inclusive).
show_most_common(single_counter, 10, 2, 15)

private ---  Count: 15
update ---  Count: 15
choose ---  Count: 15
full ---  Count: 15
model ---  Count: 15
without ---  Count: 15
future ---  Count: 15
point ---  Count: 15
25 ---  Count: 15
particular ---  Count: 15


In [133]:
# First 10 2-grams that occur in between 2 and 14 papers (inclusive).
show_most_common(two_gram_counter, 10, 2, 14)

('1', 'introduction') ---  Count: 10
('reference', '1') ---  Count: 10
('smart', 'contract') ---  Count: 10
('private', 'key') ---  Count: 10
('transaction', 'fee') ---  Count: 10
('allow', 'user') ---  Count: 9
('white', 'paper') ---  Count: 9
('1', '2') ---  Count: 8
('large', 'number') ---  Count: 8
('create', 'new') ---  Count: 8


In [25]:
# Each entry is (paper file name, [0-based line indices where the term occurs]).
print(lookup_dict['sha256'])

[('Bitcoin.txt', [82]), ('Ethereum.txt', [119, 865]), ('Solana.txt', [97, 154, 154]), ('Tron.txt', [625, 626])]


In [26]:
# Hyphenated terms are indexed without hyphens ("proof-of-work" -> "proofofwork")
# because extract_and_clean strips punctuation before tokenizing.
print(lookup_dict['proofofwork'])

[('Avalanche.txt', [280, 307]), ('Bitcoin.txt', [10, 11, 17, 79, 82, 85, 87, 93, 95, 96, 99, 103, 109, 110, 163, 168, 327, 332]), ('Cardano.txt', [3146]), ('Chainlink.txt', [3197, 5249, 5251]), ('Ethereum.txt', [40, 41, 103, 116, 146, 171, 177, 438, 674]), ('PolkaDot.txt', [76])]


In [27]:
# Same lookup for the de-hyphenated form of "proof-of-stake".
print(lookup_dict['proofofstake'])

[('Avalanche.txt', [139, 280, 311]), ('Cardano.txt', [0, 3479, 3534]), ('Chainlink.txt', [652, 3198, 3285, 4377, 5249, 5251]), ('Ethereum.txt', [49, 855]), ('PolkaDot.txt', [407, 1467, 1467, 1474]), ('Polygon.txt', [59])]


In [28]:
# n-grams are looked up by their tuple of lemmatized tokens.
print(lookup_dict[('consensus', 'protocol')])

[('Avalanche.txt', [41, 79, 87, 96, 125, 180, 191, 275, 289, 292, 295, 517, 555]), ('Cardano.txt', [3280, 3282]), ('Chainlink.txt', [208, 375, 593, 631, 679, 732, 970, 981, 1551, 1919, 1996, 2947, 4045, 4822, 5279]), ('Ethereum.txt', [199, 218]), ('PolkaDot.txt', [1398]), ('Ripple.txt', [3, 5]), ('Solana.txt', [546, 687])]


In [29]:
def lookup_context(lookup_dict, words,
                   directory="../whitepapers/top20_whitepapers/",
                   encoding="utf-8"):
    """Print the source lines in which a term (or n-gram) appears.

    For every ``(filename, line_indices)`` pair stored under
    ``lookup_dict[words]``, the matching whitepaper file is re-read and each
    line whose 0-based index is listed is printed as
    ``<filename> [line <idx>] : <line>``. The line keeps its trailing newline,
    so printed entries are separated by a blank line.

    Args:
        lookup_dict: Index of ``term -> list of (filename, line indices)``.
        words: Key to look up: a token string or an n-gram tuple.
        directory: Folder that holds the whitepaper text files. Defaults to the
            location this notebook has always used.
        encoding: Text encoding used to open the files. It is passed
            explicitly because the whitepapers contain non-ASCII characters and
            the platform default is not UTF-8 everywhere.

    Raises:
        KeyError: If ``words`` is not a key of ``lookup_dict``.
    """
    appearances = lookup_dict[words]
    # For each white paper, read and print relevant contents
    for filename, idxs in appearances:
        # Set membership keeps the per-line test O(1) for long index lists.
        wanted = set(idxs)
        context_idx = 0
        # The context manager closes the directory handle even if reading fails.
        with os.scandir(directory) as entries:
            for entry in entries:
                # NOTE: suffix match, kept as-is so line numbering agrees with
                # read_whitepapers(), which selects files the same way.
                if entry.path.endswith(filename) and entry.is_file():
                    with open(entry.path, "r", encoding=encoding) as f:
                        for line in f:
                            if context_idx in wanted:
                                print(filename + " [line " + str(context_idx) + "] : " + line)
                            context_idx += 1

# Look up word / n-gram appearances

In [30]:
# Print every source line recorded for the 2-gram ('consensus', 'protocol').
lookup_context(lookup_dict, ('consensus', 'protocol'))

Avalanche.txt [line 41] : Secure Avalanche is designed to be robust and achieve high security. Classical consensus protocols are

Avalanche.txt [line 79] : of the platform is called “$AVAX”. The family of consensus protocols used by the Avalanche platform is

Avalanche.txt [line 87] : of machines. Therefore, consensus protocols, which enable a group of nodes to achieve agreement, lie at the

Avalanche.txt [line 96] : static deployments. Nakamoto consensus protocols [5,7,4], on the other hand, are robust, but suﬀer from

Avalanche.txt [line 125] : consensus protocols and therefore require full membership knowledge. Knowing the entire set of par-

Avalanche.txt [line 180] : of consensus protocols through a set of 8 critical axes.

Avalanche.txt [line 191] : Table 1. Comparative chart between the three known families of consensus protocols. Avalanche, Snowman, and

Avalanche.txt [line 275] : Consensus protocols provide their security guarantees under the assumption that up to a threshold 

In [31]:
# "SHA-256", "SHA256" and "sha256" all map to the token 'sha256'
# (text is lower-cased and punctuation is stripped).
lookup_context(lookup_dict, 'sha256')

Bitcoin.txt [line 82] : The proof-of-work involves scanning for a value that when hashed, such as with SHA-256, the 

Ethereum.txt [line 119] : preventing sybil attackers from remaking the entire blockchain in their favor. Because SHA256 is designed

Ethereum.txt [line 865] : The Bitcoin mining algorithm works by having miners compute SHA256 on slightly modified versions of

Solana.txt [line 97] : sha256, ripemd, etc.), run the function from some random starting value

Solana.txt [line 154] : sha256 of the photograph. The index and the sha256 of the photograph are

Tron.txt [line 625] : The raw data then undergoes SHA-256 hashing. The private key corresponding to the contract 

Tron.txt [line 626] : address then signs the result of the SHA256 hash. The signature result is then added to the 



In [32]:
# The output shows "proof of stake" lines matching this 2-gram: "of" is dropped
# during cleaning, which leaves 'proof' and 'stake' adjacent.
lookup_context(lookup_dict, ('proof', 'stake'))

Cardano.txt [line 4] : We present “Ouroboros,” the ﬁrst blockchain protocol based on proof of stake with rig-

Cardano.txt [line 6] : those achieved by the bitcoin blockchain protocol. As the protocol provides a “proof of stake”

Cardano.txt [line 9] : centivizing proof of stake protocols and we prove that, given this mechanism, honest behavior

Cardano.txt [line 27] : A natural alternative mechanism relies on the notion of “proof of stake” (PoS). Rather than

Cardano.txt [line 43] : ideal; however, realizing such a proof of stake protocol appears to involve a number of deﬁnitional,

Cardano.txt [line 45] : Previous work. The concept of PoS has been discussed extensively in the bitcoin forum.1 Proof

Cardano.txt [line 50] : Heuristic proof of stake based blockchain protocols have been proposed (and implemented) for a

Cardano.txt [line 73] : Our Results. We present “Ouroboros,” a provably secure proof of stake system. To the best of

Cardano.txt [line 80] : 1See “Proof of stake instead

# Construct Whitelist

In [67]:
def filter_word_count(orig, min_c, max_c):
    """Return the entries of ``orig`` whose value lies in ``[min_c, max_c]``.

    Args:
        orig: Mapping of ``term -> count`` (e.g. a ``Counter``).
        min_c: Lower bound, inclusive.
        max_c: Upper bound, inclusive.

    Returns:
        dict: New plain ``dict`` with the retained items, in ``orig``'s
        iteration order. ``orig`` is not modified.
    """
    return {
        key: value
        for key, value in orig.items()
        if min_c <= value <= max_c
    }

In [68]:
def filter_numeric_or_short_words(orig):
    """Drop n-gram entries that contain a numeric or one-character token.

    Intended for n-gram tuples. If ``pair[0]`` is a plain string it is iterated
    character by character, so any non-empty string is rejected; use
    ``filter_numeric_or_short_word`` for single words.

    Args:
        orig: Iterable of ``(ngram, count)`` pairs, e.g.
            ``Counter.most_common()`` output where ``ngram`` is a tuple of str.

    Returns:
        list: The pairs, in original order, in which no token is numeric
        (``str.isnumeric``) or shorter than two characters.
    """
    return [
        pair
        for pair in orig
        if not any(tok.isnumeric() or len(tok) < 2 for tok in pair[0])
    ]

In [106]:
def filter_numeric_or_short_word(orig):
    """Drop single-word entries that are numeric or one character long.

    Args:
        orig: Iterable of ``(word, count)`` pairs, e.g.
            ``Counter.most_common()`` output where ``word`` is a str.

    Returns:
        list: The pairs, in original order, whose word is not numeric
        (``str.isnumeric``) and has at least two characters.
    """
    kept = []
    for entry in orig:
        word = entry[0]
        if word.isnumeric() or len(word) <= 1:
            continue
        kept.append(entry)
    return kept

In [107]:
def print_results_with_filter_singleword(arr, min_c, max_c, outfile):
    """Write single words whose count is in ``[min_c, max_c]`` to a text file.

    One word per line, in the order given by ``arr``. The file is written as
    UTF-8 explicitly: tokens can contain non-ASCII characters (the corpus has
    ligatures such as U+FB00), which cannot be encoded by every
    platform-default codec.

    Args:
        arr: Iterable of ``(word, count)`` pairs.
        min_c: Lower count bound, inclusive.
        max_c: Upper count bound, inclusive.
        outfile: Path of the file to create or overwrite. Its parent directory
            must already exist.
    """
    with open(outfile, 'w', encoding='utf-8') as f:
        for pair in arr:
            if pair[1] >= min_c and pair[1] <= max_c:
                print(pair[0], file=f)
    print("Output to filename " + outfile + " - [Done]")

In [100]:
def print_results_with_filter(arr, min_c, max_c, outfile):
    """Write n-grams whose count is in ``[min_c, max_c]`` to a text file.

    One n-gram per line with its tokens joined by commas, in the order given
    by ``arr``. The file is written as UTF-8 explicitly: tokens can contain
    non-ASCII characters (the corpus has ligatures such as U+FB00), which
    cannot be encoded by every platform-default codec.

    Args:
        arr: Iterable of ``(ngram, count)`` pairs where ``ngram`` is a tuple
            (or other iterable) of str.
        min_c: Lower count bound, inclusive.
        max_c: Upper count bound, inclusive.
        outfile: Path of the file to create or overwrite. Its parent directory
            must already exist.
    """
    with open(outfile, 'w', encoding='utf-8') as f:
        for pair in arr:
            if pair[1] >= min_c and pair[1] <= max_c:
                print(','.join(pair[0]), file=f)
    print("Output to filename " + outfile + " - [Done]")

In [101]:
# Export 2-grams without numeric or one-character tokens that occur in 2 to 18 papers.
print_results_with_filter(filter_numeric_or_short_words(two_gram_counter.most_common()), 2, 18, "whitelist/word2_w.txt")

Output to filename whitelist/word2_w.txt - [Done]


In [102]:
# Export 3-grams without numeric or one-character tokens that occur in 2 to 18 papers.
print_results_with_filter(filter_numeric_or_short_words(three_gram_counter.most_common()), 2, 18, "whitelist/word3_w.txt")

Output to filename whitelist/word3_w.txt - [Done]


In [103]:
# Export 4-grams without numeric or one-character tokens that occur in 2 to 18 papers.
print_results_with_filter(filter_numeric_or_short_words(four_gram_counter.most_common()), 2, 18, "whitelist/word4_w.txt")

Output to filename whitelist/word4_w.txt - [Done]


In [110]:
# Export single words (non-numeric, at least two characters) that occur in 2 to 12 papers.
# Note the upper bound here is 12, whereas the n-gram exports use 18.
print_results_with_filter_singleword(filter_numeric_or_short_word(single_counter.most_common()), 2, 12, "whitelist/word1_w.txt")
# print(filter_numeric_or_short_word(single_counter.most_common())[:10])

Output to filename whitelist/word1_w.txt - [Done]
