# Import

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [2]:
def read_words_list(filenames):
    result_ls = set()
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                words = line.strip().split(',')
                if len(words) == 1:
                    result_ls.add(words[0])
                else:
                    result_ls.add(tuple(words))
    return result_ls

# Parse access list and block list

In [3]:
wl = read_words_list(["whitelist/word1_w.txt", "whitelist/word2_w.txt", "whitelist/word3_w.txt", "whitelist/word4_w.txt"])
bl = read_words_list(["blacklist/word1_b.txt", "blacklist/word2_b.txt", "blacklist/word3_b.txt", "blacklist/word4_b.txt"])

# Parse incoming whitepapers

In [4]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    words_context_dict = {}
    lines = []
    # context_tuples_ref = []
    word_idx = 0
    context_idx = 0
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # context_tuples_ref += [line]
                    temp_words = extract_and_clean(line, stopwords)
                    words_list.extend(temp_words)
                    for i in range(len(temp_words)):
                        words_context_dict[word_idx] = context_idx
                        word_idx += 1
                    context_idx += 1
                    
                    if len(line) > 100:
                        lines.append(line)
                        
    return words_list, words_context_dict, lines #, context_tuples_ref

In [5]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    words = [lemmatizer.lemmatize(w, pos='s') for w in words]
    words = [lemmatizer.lemmatize(w, pos='n') for w in words]
    words = [lemmatizer.lemmatize(w, pos='v') for w in words]
    words = [lemmatizer.lemmatize(w, pos='a') for w in words]
    return words

In [6]:
def creat_word_bank():
    word_bank = {}
    context_ref = {}
    lines_bank = {}
    for i in filenames:
        word_bank[i], context_ref[i], lines_bank[i] = read_whitepapers(i, stopwords_set)
    return word_bank, context_ref, lines_bank

In [7]:
def dedupe(words, context_ref):
    appearance_dict = {}
    for i in range(len(words)):
        appearance_dict.setdefault(words[i],[]).append(context_ref[i])
    return list(set(words)), appearance_dict

In [8]:
def filter_wl_bl_words(words, context_ref, wl, bl):
    unfiltered_words = []
    filtered_words = []
    filtered_context = {}
    for w in words:
        unfiltered_words.append(w)
        if w in wl and w not in bl:
            filtered_words.append(w)
            filtered_context[w] = context_ref[w]
    return unfiltered_words, filtered_words, filtered_context

In [9]:
stopwords_set = set(stopwords.words('english'))

In [10]:
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [11]:
words_bank, context_ref, lines_bank = creat_word_bank()

In [12]:
def enrich_lookup_dict(lookup_dict, words, paper_name, appearance_dict):
    for w in words:
        lookup_dict.setdefault(w,[]).append((paper_name, appearance_dict[w]))

In [13]:
lookup_dict = {}

# Stats

## Single Word

In [14]:
agg_words = []
agg_wl_words = []

for coin in words_bank:
    words_ls = words_bank[coin]
    words_context_dict = context_ref[coin]
    deduped_words, deduped_appearance_dict = dedupe(words_ls, words_context_dict)
    raw_words, wl_words, wl_appearance_dict = filter_wl_bl_words(deduped_words, deduped_appearance_dict, wl, bl)
    agg_words.extend(raw_words)
    agg_wl_words.extend(wl_words)
    enrich_lookup_dict(lookup_dict, wl_words, coin, wl_appearance_dict)

In [15]:
# The number of raw single word extracted from 20 whitepapers
print(len(agg_words))

28871


In [16]:
# The number of filtered single word from 20 whitepapers
print(len(agg_wl_words))

1108


## Two Gram

In [17]:
agg_words_2gram = []
agg_wl_words_2 = []
stats_2 = {}
stats_wl_2 = {}

for coin in words_bank:
    words_ls = words_bank[coin]
    words_context_dict = context_ref[coin]
    deduped_words_2, deduped_appearance_dict_2 = dedupe(list(ngrams(words_ls, 2)), words_context_dict)
    raw_words_2gram, wl_words_2, wl_appearance_dict_2 = filter_wl_bl_words(deduped_words_2, deduped_appearance_dict_2, wl, bl)
    stats_2[coin] = len(raw_words_2gram)
    stats_wl_2[coin] = len(wl_words_2)
    agg_words_2gram.extend(raw_words_2gram)
    agg_wl_words_2.extend(wl_words_2)
    enrich_lookup_dict(lookup_dict, wl_words_2, coin, wl_appearance_dict_2)

In [18]:
# The number of raw 2-gram extracted from 20 whitepapers
print(len(agg_words_2gram))

95618


In [19]:
# The number of filtered 2-gram from 20 whitepapers
print(len(agg_wl_words_2))

4835


In [20]:
stats_2

{'Algorand.txt': 1372,
 'Avalanche.txt': 2999,
 'Binance.txt': 1809,
 'Bitcoin.txt': 1793,
 'Cardano.txt': 15418,
 'Chainlink.txt': 26506,
 'Crypto_com.txt': 6418,
 'Ethereum.txt': 6807,
 'FTX_token.txt': 1677,
 'PolkaDot.txt': 8469,
 'Polygon.txt': 4088,
 'Ripple.txt': 1428,
 'Solana.txt': 3551,
 'Terra.txt': 2370,
 'Tether.txt': 2864,
 'Tron.txt': 4162,
 'Uniswap.txt': 2006,
 'Wrapped.txt': 1881}

In [21]:
stats_wl_2

{'Algorand.txt': 70,
 'Avalanche.txt': 250,
 'Binance.txt': 114,
 'Bitcoin.txt': 169,
 'Cardano.txt': 483,
 'Chainlink.txt': 964,
 'Crypto_com.txt': 367,
 'Ethereum.txt': 476,
 'FTX_token.txt': 77,
 'PolkaDot.txt': 330,
 'Polygon.txt': 392,
 'Ripple.txt': 45,
 'Solana.txt': 206,
 'Terra.txt': 139,
 'Tether.txt': 162,
 'Tron.txt': 299,
 'Uniswap.txt': 106,
 'Wrapped.txt': 186}

## Three Gram

In [22]:
agg_words_3gram = []
agg_wl_words_3 = []
stats_3 = {}
stats_wl_3 = {}

for coin in words_bank:
    words_ls = words_bank[coin]
    words_context_dict = context_ref[coin]
    deduped_words_3, deduped_appearance_dict_3 = dedupe(list(ngrams(words_ls, 3)), words_context_dict)
    raw_words_3gram, wl_words_3, wl_appearance_dict_3 = filter_wl_bl_words(deduped_words_3, deduped_appearance_dict_3, wl, bl)
    stats_3[coin] = len(raw_words_3gram)
    stats_wl_3[coin] = len(wl_words_3)
    agg_words_3gram.extend(raw_words_3gram)
    agg_wl_words_3.extend(wl_words_3)
    enrich_lookup_dict(lookup_dict, wl_words_3, coin, wl_appearance_dict_3)

In [23]:
# The number of raw 3-gram extracted from 20 whitepapers
print(len(agg_words_3gram))

111328


In [24]:
# The number of filtered 3-gram from 20 whitepapers
print(len(agg_wl_words_3))

491


In [25]:
stats_3

{'Algorand.txt': 1783,
 'Avalanche.txt': 3255,
 'Binance.txt': 1966,
 'Bitcoin.txt': 1972,
 'Cardano.txt': 19204,
 'Chainlink.txt': 31599,
 'Crypto_com.txt': 7405,
 'Ethereum.txt': 7695,
 'FTX_token.txt': 1870,
 'PolkaDot.txt': 8983,
 'Polygon.txt': 4780,
 'Ripple.txt': 1486,
 'Solana.txt': 4078,
 'Terra.txt': 2645,
 'Tether.txt': 3261,
 'Tron.txt': 4941,
 'Uniswap.txt': 2218,
 'Wrapped.txt': 2187}

In [26]:
stats_wl_3

{'Algorand.txt': 11,
 'Avalanche.txt': 25,
 'Binance.txt': 9,
 'Bitcoin.txt': 29,
 'Cardano.txt': 56,
 'Chainlink.txt': 92,
 'Crypto_com.txt': 43,
 'Ethereum.txt': 45,
 'FTX_token.txt': 4,
 'PolkaDot.txt': 17,
 'Polygon.txt': 48,
 'Ripple.txt': 0,
 'Solana.txt': 14,
 'Terra.txt': 6,
 'Tether.txt': 14,
 'Tron.txt': 53,
 'Uniswap.txt': 4,
 'Wrapped.txt': 21}

## Four Gram

In [27]:
agg_words_4gram = []
agg_wl_words_4 = []
stats_4 = {}
stats_wl_4 = {}
for coin in words_bank:
    words_ls = words_bank[coin]
    words_context_dict = context_ref[coin]
    deduped_words_4, deduped_appearance_dict_4 = dedupe(list(ngrams(words_ls, 4)), words_context_dict)
    raw_words_4gram, wl_words_4, wl_appearance_dict_4 = filter_wl_bl_words(deduped_words_4, deduped_appearance_dict_4, wl, bl)
    stats_4[coin] = len(raw_words_4gram)
    stats_wl_4[coin] = len(wl_words_4)
    agg_words_4gram.extend(raw_words_4gram)
    agg_wl_words_4.extend(wl_words_4)
    enrich_lookup_dict(lookup_dict, wl_words_4, coin, wl_appearance_dict_4)


In [28]:
# The number of raw 4-gram extracted from 20 whitepapers
print(len(agg_words_4gram))

114807


In [29]:
# The number of filtered 4-gram from 20 whitepapers
print(len(agg_wl_words_4))

98


In [30]:
stats_4

{'Algorand.txt': 1991,
 'Avalanche.txt': 3295,
 'Binance.txt': 1984,
 'Bitcoin.txt': 2016,
 'Cardano.txt': 20184,
 'Chainlink.txt': 32486,
 'Crypto_com.txt': 7674,
 'Ethereum.txt': 7831,
 'FTX_token.txt': 1931,
 'PolkaDot.txt': 9039,
 'Polygon.txt': 4979,
 'Ripple.txt': 1495,
 'Solana.txt': 4183,
 'Terra.txt': 2720,
 'Tether.txt': 3356,
 'Tron.txt': 5143,
 'Uniswap.txt': 2255,
 'Wrapped.txt': 2245}

In [31]:
stats_wl_4

{'Algorand.txt': 5,
 'Avalanche.txt': 6,
 'Binance.txt': 0,
 'Bitcoin.txt': 7,
 'Cardano.txt': 21,
 'Chainlink.txt': 22,
 'Crypto_com.txt': 15,
 'Ethereum.txt': 1,
 'FTX_token.txt': 0,
 'PolkaDot.txt': 3,
 'Polygon.txt': 10,
 'Ripple.txt': 0,
 'Solana.txt': 0,
 'Terra.txt': 0,
 'Tether.txt': 3,
 'Tron.txt': 3,
 'Uniswap.txt': 0,
 'Wrapped.txt': 2}

# Evaluate
Steps: 
1. Sample 8 paragraphs from 3 whitepapers
2. Manually extract technical terms, named "X"
3. Run through models to extract lists "Y"
4. Calculate accuracy, recall, f1

### Sample
Randomly chose 8 paragraphs from 5 whitepapers  
Store them for manually labelling 

In [32]:
import random
random.seed(1)

In [33]:
sample_paper = ['Avalanche.txt', 'Algorand.txt', 'Bitcoin.txt', 'Ethereum.txt', 'Chainlink.txt']
sample = {}
for i in sample_paper:
    sample[i] = random.sample(lines_bank[i], 8)

In [34]:
df = pd.DataFrame(sample)
df.to_csv('manual/sample.csv')

In [35]:
test_set = {}
for i in sample_paper:
    test_lines = []
    
    for line in sample[i]:
        test_words = extract_and_clean(line, stopwords_set)
        test_lines.extend(test_words)
    test_set[i] = test_lines

### Test bigram

In [36]:
agg_test_words_2gram = []
agg_test_wl_words_2 = []
test_2grams = {}
for coin in sample_paper:
    test_ls = test_set[coin]
    test_context_dict = context_ref[coin]
    test_deduped_words_2, test_deduped_appearance_dict_2 = dedupe(list(ngrams(test_ls, 2)), test_context_dict)
    test_words_2gram, test_wl_words_2, test_wl_appearance_dict_2 = filter_wl_bl_words(test_deduped_words_2, test_deduped_appearance_dict_2, wl, bl)
    agg_test_words_2gram.extend(test_words_2gram)
    agg_test_wl_words_2.extend(test_wl_words_2)
    test_2grams[coin] = test_wl_words_2

In [37]:
len(agg_test_words_2gram)

391

In [38]:
len(agg_test_wl_words_2)

40

In [39]:
agg_test_wl_words_2

[('global', 'network'),
 ('sharding', 'process'),
 ('proofofstake', 'protocol'),
 ('resource', 'order'),
 ('native', 'token'),
 ('view', 'network'),
 ('stable', 'view'),
 ('support', 'global'),
 ('veriﬁable', 'random'),
 ('function', 'vrfs'),
 ('future', 'period'),
 ('random', 'function'),
 ('block', 'protocol'),
 ('network', 'partition'),
 ('special', 'transaction'),
 ('transaction', 'block'),
 ('value', 'transaction'),
 ('network', 'node'),
 ('incentive', 'value'),
 ('merkle', 'tree'),
 ('node', 'receive'),
 ('transaction', 'fee'),
 ('transaction', 'hash'),
 ('place', 'chain'),
 ('financial', 'institution'),
 ('input', 'value'),
 ('hash', 'merkle'),
 ('acm', 'conference'),
 ('block', 'hash'),
 ('sybil', 'attack'),
 ('satoshi', 'nakamoto'),
 ('assume', 'transaction'),
 ('merkle', 'tree'),
 ('source', 'randomness'),
 ('computational', 'resource'),
 ('full', 'node'),
 ('financial', 'instrument'),
 ('support', 'generalize'),
 ('oracle', 'functionality'),
 ('sus', 'protocol')]

In [40]:
test_2grams

{'Algorand.txt': [('veriﬁable', 'random'),
  ('function', 'vrfs'),
  ('future', 'period'),
  ('random', 'function'),
  ('block', 'protocol'),
  ('network', 'partition')],
 'Avalanche.txt': [('global', 'network'),
  ('sharding', 'process'),
  ('proofofstake', 'protocol'),
  ('resource', 'order'),
  ('native', 'token'),
  ('view', 'network'),
  ('stable', 'view'),
  ('support', 'global')],
 'Bitcoin.txt': [('special', 'transaction'),
  ('transaction', 'block'),
  ('value', 'transaction'),
  ('network', 'node'),
  ('incentive', 'value'),
  ('merkle', 'tree'),
  ('node', 'receive'),
  ('transaction', 'fee'),
  ('transaction', 'hash'),
  ('place', 'chain'),
  ('financial', 'institution'),
  ('input', 'value'),
  ('hash', 'merkle'),
  ('acm', 'conference'),
  ('block', 'hash')],
 'Chainlink.txt': [('support', 'generalize'),
  ('oracle', 'functionality'),
  ('sus', 'protocol')],
 'Ethereum.txt': [('sybil', 'attack'),
  ('satoshi', 'nakamoto'),
  ('assume', 'transaction'),
  ('merkle', 'tree')

### Test trigram

In [41]:
agg_test_words_3gram = []
agg_test_wl_words_3 = []
test_3grams = {}

for coin in sample_paper:
    test_ls = test_set[coin]
    test_context_dict = context_ref[coin]
    test_deduped_words_3, test_deduped_appearance_dict_3 = dedupe(list(ngrams(test_ls, 3)), test_context_dict)
    test_words_3gram, test_wl_words_3, test_wl_appearance_dict_3 = filter_wl_bl_words(test_deduped_words_3, test_deduped_appearance_dict_3, wl, bl)
    agg_test_words_3gram.extend(test_words_3gram)
    agg_test_wl_words_3.extend(test_wl_words_3)
    test_3grams[coin] = test_wl_words_3

In [42]:
len(agg_test_words_3gram)

389

In [43]:
len(agg_test_wl_words_3)

4

In [44]:
agg_test_wl_words_3

[('veriﬁable', 'random', 'function'),
 ('random', 'function', 'vrfs'),
 ('root', 'include', 'block'),
 ('transaction', 'hash', 'merkle')]

In [45]:
test_3grams

{'Algorand.txt': [('veriﬁable', 'random', 'function'),
  ('random', 'function', 'vrfs')],
 'Avalanche.txt': [],
 'Bitcoin.txt': [('root', 'include', 'block'),
  ('transaction', 'hash', 'merkle')],
 'Chainlink.txt': [],
 'Ethereum.txt': []}

### Test four gram

In [46]:
agg_test_words_4gram = []
agg_test_wl_words_4 = []
test_4grams = {}

for coin in sample_paper:
    test_ls = test_set[coin]
    test_context_dict = context_ref[coin]
    test_deduped_words_4, test_deduped_appearance_dict_4 = dedupe(list(ngrams(test_ls, 4)), test_context_dict)
    test_words_4gram, test_wl_words_4, test_wl_appearance_dict_4 = filter_wl_bl_words(test_deduped_words_4, test_deduped_appearance_dict_4, wl, bl)
    agg_test_words_4gram.extend(test_words_4gram)
    agg_test_wl_words_4.extend(test_wl_words_4)
    test_4grams[coin] = test_wl_words_4

In [47]:
len(agg_test_words_4gram)

384

In [48]:
len(agg_test_wl_words_4)

1

In [49]:
agg_test_wl_words_4

[('veriﬁable', 'random', 'function', 'vrfs')]

In [50]:
test_4grams

{'Algorand.txt': [('veriﬁable', 'random', 'function', 'vrfs')],
 'Avalanche.txt': [],
 'Bitcoin.txt': [],
 'Chainlink.txt': [],
 'Ethereum.txt': []}

In [51]:
# True Positive: model labeled correctly
bigram_tp = 25
trigram_tp = 2
fourgram_tp = 1

# False Positive: model labeled incorrectly
bigram_fp = 15
trigram_fp = 2
fourgram_fp = 0

# True Negative: model unlabeled correctly
bigram_tn = 351
trigram_tn = 385
fourgram_tn = 383

# False Negative: model unlabeled incorrectly
bigram_fn = 37
trigram_fn = 18
fourgram_fn = 2

### Accuracy
Accuracy = TP / (TP + TN)

In [52]:
# Bigram
bigram_acc = bigram_tp / (bigram_tp + bigram_tn)

# Trigram
trigram_acc = trigram_tp / (trigram_tp + trigram_tn)

# Fourgram
fourgram_acc = fourgram_tp / (fourgram_tp + fourgram_tn)

print("Accuracy of Bigram: {:.3f}".format(bigram_acc))

print("Accuracy of Trigram: {:.3f}".format(trigram_acc))

print("Accuracy of Fourgram: {:.3f}".format(fourgram_acc))

Accuracy of Bigram: 0.066
Accuracy of Trigram: 0.005
Accuracy of Fourgram: 0.003


### Precision
Precision = TP / (TP + FP)

In [53]:
# Bigram
bigram_pre = bigram_tp / (bigram_tp + bigram_fp)

# Trigram
trigram_pre = trigram_tp / (trigram_tp + trigram_fp)

# Fourgram
fourgram_pre = fourgram_tp / (fourgram_tp + fourgram_fp)

print("Precision of Bigram: {:.3f}".format(bigram_pre))

print("Precision of Trigram: {:.3f}".format(trigram_pre))

print("Precision of Fourgram: {:.3f}".format(fourgram_pre))

Precision of Bigram: 0.625
Precision of Trigram: 0.500
Precision of Fourgram: 1.000


### Recall
Recall = TP / (TP + FN)

In [54]:
# Bigram
bigram_rec = bigram_tp / (bigram_tp + bigram_fn)

# Trigram
trigram_rec = trigram_tp / (trigram_tp + trigram_fn)

# Fourgram
fourgram_rec = fourgram_tp / (fourgram_tp + fourgram_fn)

print("Recall of Bigram: {:.3f}".format(bigram_rec))

print("Recall of Trigram: {:.3f}".format(trigram_rec))

print("Recall of Fourgram: {:.3f}".format(fourgram_rec))

Recall of Bigram: 0.403
Recall of Trigram: 0.100
Recall of Fourgram: 0.333


### F1
F1 = 2 * Precision * Recall / (Precision + Recall)

In [55]:
# Bigram
bigram_f1 = 2 * bigram_pre * bigram_rec / (bigram_pre + bigram_rec)

# Trigram
trigram_f1 = 2 * trigram_pre * trigram_rec / (trigram_pre + trigram_rec)

# Fourgram
fourgram_f1 = 2 * fourgram_pre * fourgram_rec / (fourgram_pre + fourgram_rec)

print("F1 score of Bigram: {:.3f}".format(bigram_f1))

print("F1 score of Trigram: {:.3f}".format(trigram_f1))

print("F1 score of Fourgram: {:.3f}".format(fourgram_f1))

F1 score of Bigram: 0.490
F1 score of Trigram: 0.167
F1 score of Fourgram: 0.500
