# Import

In [1]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [2]:
def read_words_list(filenames):
    """Load single words and multi-word phrases from comma-separated list files.

    Every non-blank line holds one entry. A line with a single term is stored
    as a plain string; a line with several comma-separated terms is stored as
    a tuple of strings.

    Whitespace around each term is removed and empty terms (blank lines,
    trailing commas) are dropped, so ``"smart, contract"`` is stored as
    ``("smart", "contract")`` instead of ``("smart", " contract")``, and a
    blank line no longer adds ``""`` to the result.

    Parameters
    ----------
    filenames : iterable of str
        Paths of the word-list files to read.

    Returns
    -------
    set
        Strings for single-term lines, tuples of strings for multi-term lines.
    """
    entries = set()
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                # Normalize every term and discard the empty ones.
                terms = [term.strip() for term in line.split(',')]
                terms = [term for term in terms if term]
                if not terms:
                    continue  # blank line
                if len(terms) == 1:
                    entries.add(terms[0])
                else:
                    entries.add(tuple(terms))
    return entries

# Parse access list and block list

In [3]:
# Allowed terms (1- to 4-word entries), one file per entry length.
wl = read_words_list([
    "whitelist/word1_w.txt",
    "whitelist/word2_w.txt",
    "whitelist/word3_w.txt",
    "whitelist/word4_w.txt",
])
# Blocked terms, laid out the same way.
bl = read_words_list([
    "blacklist/word1_b.txt",
    "blacklist/word2_b.txt",
    "blacklist/word3_b.txt",
    "blacklist/word4_b.txt",
])

# Parse incoming whitepapers

In [4]:
def read_whitepapers(filename, stopwords, directory="../whitepapers/top20_whitepapers/"):
    """Read the whitepaper file(s) whose path ends with ``filename`` and tokenize them.

    Every line of a matching file counts as one "context". Each line is cleaned
    with ``extract_and_clean`` and the resulting words are appended to one flat
    list; a companion dict records, for every position in that list, the
    0-based index of the line the word came from. Lines that yield no words
    still consume a context index.

    Parameters
    ----------
    filename : str
        Suffix a path inside ``directory`` must end with to be read.
    stopwords : collection of str
        Stop words, passed through to ``extract_and_clean``. Note that inside
        this function the name shadows the ``nltk.corpus.stopwords`` import.
    directory : str, optional
        Folder scanned for whitepapers. Defaults to the location that used to
        be hard-coded, so existing two-argument calls behave as before.

    Returns
    -------
    words_list : list of str
        Cleaned words of all matching files, in reading order.
    words_context_dict : dict of int to int
        Maps each index of ``words_list`` to the line (context) index of that word.
    """
    words_list = []
    words_context_dict = {}
    context_idx = 0
    # The context manager releases the directory handle even if reading fails.
    with os.scandir(directory) as entries:
        for entry in entries:
            if not (entry.path.endswith(filename) and entry.is_file()):
                continue
            with open(entry.path, "r") as f:
                for line in f:
                    for word in extract_and_clean(line, stopwords):
                        # The next free position in words_list is the word's index.
                        words_context_dict[len(words_list)] = context_idx
                        words_list.append(word)
                    context_idx += 1
    return words_list, words_context_dict

In [5]:
def extract_and_clean(line, stopwords):
    """Turn one line of text into a list of cleaned, lemmatized tokens.

    The line is lower-cased and every character that is neither a word
    character nor whitespace is deleted (so hyphenated terms are fused into a
    single token). The remainder is tokenized with ``word_tokenize``, tokens
    found in ``stopwords`` are dropped, and the survivors are run through the
    WordNet lemmatizer once per part-of-speech tag.

    Parameters
    ----------
    line : str
        Raw text line.
    stopwords : collection of str
        Tokens to discard.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    no_punct = re.sub(r'[^\w\s]', '', line.lower())
    tokens = [tok for tok in word_tokenize(no_punct) if tok not in stopwords]
    # WordNet tags, applied in this order: 's' satellite adjective, 'n' noun,
    # 'v' verb, 'a' adjective. Each pass works on the output of the previous one.
    for tag in ('s', 'n', 'v', 'a'):
        tokens = [lemmatizer.lemmatize(tok, pos=tag) for tok in tokens]
    return tokens

In [6]:
def creat_word_bank():
    """Tokenize every whitepaper named in the module-level ``filenames`` list.

    Relies on the globals ``filenames`` and ``stopwords_set``, which must be
    defined before this function is called.

    Returns
    -------
    word_bank : dict
        File name -> word list returned by ``read_whitepapers``.
    context_ref : dict
        File name -> word-position-to-line-index dict returned by ``read_whitepapers``.
    """
    word_bank, context_ref = {}, {}
    for paper in filenames:
        tokens, contexts = read_whitepapers(paper, stopwords_set)
        word_bank[paper] = tokens
        context_ref[paper] = contexts
    return word_bank, context_ref

In [7]:
def dedupe(words, context_ref):
    """Collapse repeated terms and remember where each one occurred.

    Parameters
    ----------
    words : iterable of hashable
        Terms (strings or n-gram tuples) in document order.
    context_ref : mapping of int
        ``context_ref[i]`` is the context (line) index of the term at position
        ``i`` of ``words``.

    Returns
    -------
    unique_words : list
        Each distinct term once, in order of first appearance. The order is
        therefore reproducible between runs, unlike the order of a ``set``.
    appearance_dict : dict
        Term -> list of context indices, one entry per occurrence (a term that
        occurs twice in the same context contributes that index twice).
    """
    appearance_dict = {}
    for position, word in enumerate(words):
        appearance_dict.setdefault(word, []).append(context_ref[position])
    # dict keys are unique and keep insertion order, so no separate set is needed.
    return list(appearance_dict), appearance_dict

In [8]:
def filter_wl_bl_words(words, context_ref, wl, bl):
    """Split terms into "all" and "whitelisted but not blacklisted".

    Parameters
    ----------
    words : iterable
        Candidate terms.
    context_ref : mapping
        Term -> appearance information; looked up for every accepted term.
    wl, bl : container
        Whitelist and blacklist; a term is accepted when it is in ``wl`` and
        not in ``bl``.

    Returns
    -------
    unfiltered_words : list
        A copy of ``words``.
    filtered_words : list
        The accepted terms, in input order.
    filtered_context : dict
        Accepted term -> ``context_ref[term]``.
    """
    everything = list(words)
    accepted = [term for term in everything if term in wl and term not in bl]
    accepted_context = {term: context_ref[term] for term in accepted}
    return everything, accepted, accepted_context

In [9]:
# NLTK's English stop-word list as a set; creat_word_bank passes it to read_whitepapers.
stopwords_set = set(stopwords.words('english'))

In [10]:
# Whitepapers to process. read_whitepapers selects a file when its path ends with one of these names.
# NOTE(review): 18 names are listed, while the directory name ("top20_whitepapers") and the
# summary comments further down mention 20 - confirm whether two papers are missing.
filenames = ['Algorand.txt', 'Avalanche.txt', 'Binance.txt', 'Bitcoin.txt', 'Cardano.txt', 'Chainlink.txt',
            'Crypto_com.txt', 'Ethereum.txt', 'FTX_token.txt', 'PolkaDot.txt', 'Polygon.txt', 'Ripple.txt', 
            'Solana.txt', 'Terra.txt', 'Tether.txt', 'Tron.txt', 'Uniswap.txt', 'Wrapped.txt']

In [11]:
# Both dicts are keyed by file name: words_bank holds the cleaned word list of each paper,
# context_ref the matching word-position -> line-index map.
words_bank, context_ref = creat_word_bank()

In [12]:
def enrich_lookup_dict(lookup_dict, words, paper_name, appearance_dict):
    """Record, in place, where each term occurs in one paper.

    For every term in ``words`` a ``(paper_name, appearance_dict[term])`` pair
    is appended to ``lookup_dict[term]``; the list is created on first use.

    Parameters
    ----------
    lookup_dict : dict
        Term -> list of ``(paper_name, appearances)`` pairs; mutated in place.
    words : iterable
        Terms to record.
    paper_name : str
        Identifier of the paper the terms come from.
    appearance_dict : mapping
        Term -> appearance information for this paper.

    Returns
    -------
    None
    """
    for term in words:
        if term not in lookup_dict:
            lookup_dict[term] = []
        hits = lookup_dict[term]
        hits.append((paper_name, appearance_dict[term]))

In [13]:
# term -> list of (paper name, line indices) pairs; filled in place by enrich_lookup_dict
# in the single-word and n-gram cells below, so re-running one of those cells appends again.
lookup_dict = {}

# Stats

## Single Word

In [14]:
# Walk through the papers one by one: de-duplicate the single words of each paper,
# keep those accepted by the whitelist/blacklist filter, and index them in lookup_dict.
agg_words, agg_wl_words = [], []
for coin, words_ls in words_bank.items():
    words_context_dict = context_ref[coin]
    deduped_words, deduped_appearance_dict = dedupe(words_ls, words_context_dict)
    raw_words, wl_words, wl_appearance_dict = filter_wl_bl_words(
        deduped_words, deduped_appearance_dict, wl, bl
    )
    agg_words += raw_words
    agg_wl_words += wl_words
    enrich_lookup_dict(lookup_dict, wl_words, coin, wl_appearance_dict)

In [15]:
# Number of distinct single words, summed over all whitepapers in `filenames`
# (a word is counted once per paper it appears in)
print(len(agg_words))

28871


In [16]:
# Number of single words that are whitelisted and not blacklisted, summed over all
# whitepapers in `filenames` (once per paper)
print(len(agg_wl_words))

1108


## Two Gram

In [17]:
# Same pipeline for 2-grams. The n-grams are built over a paper's whole cleaned word
# stream, so one may span a line break; dedupe files it under the line of its first word.
agg_words_2gram, agg_wl_words_2 = [], []
for coin, words_ls in words_bank.items():
    words_context_dict = context_ref[coin]
    deduped_words_2, deduped_appearance_dict_2 = dedupe(
        list(ngrams(words_ls, 2)), words_context_dict
    )
    raw_words_2gram, wl_words_2, wl_appearance_dict_2 = filter_wl_bl_words(
        deduped_words_2, deduped_appearance_dict_2, wl, bl
    )
    agg_words_2gram += raw_words_2gram
    agg_wl_words_2 += wl_words_2
    enrich_lookup_dict(lookup_dict, wl_words_2, coin, wl_appearance_dict_2)

In [18]:
# Number of distinct 2-grams, summed over all whitepapers in `filenames`
# (a 2-gram is counted once per paper it appears in)
print(len(agg_words_2gram))

95618


In [19]:
# Number of 2-grams that are whitelisted and not blacklisted, summed over all
# whitepapers in `filenames` (once per paper)
print(len(agg_wl_words_2))

4835


## Three Gram

In [20]:
# Same pipeline for 3-grams. The n-grams are built over a paper's whole cleaned word
# stream, so one may span a line break; dedupe files it under the line of its first word.
agg_words_3gram, agg_wl_words_3 = [], []
for coin, words_ls in words_bank.items():
    words_context_dict = context_ref[coin]
    deduped_words_3, deduped_appearance_dict_3 = dedupe(
        list(ngrams(words_ls, 3)), words_context_dict
    )
    raw_words_3gram, wl_words_3, wl_appearance_dict_3 = filter_wl_bl_words(
        deduped_words_3, deduped_appearance_dict_3, wl, bl
    )
    agg_words_3gram += raw_words_3gram
    agg_wl_words_3 += wl_words_3
    enrich_lookup_dict(lookup_dict, wl_words_3, coin, wl_appearance_dict_3)

In [21]:
# Number of distinct 3-grams, summed over all whitepapers in `filenames`
# (a 3-gram is counted once per paper it appears in)
print(len(agg_words_3gram))

111328


In [22]:
# Number of 3-grams that are whitelisted and not blacklisted, summed over all
# whitepapers in `filenames` (once per paper)
print(len(agg_wl_words_3))

491


## Four Gram

In [23]:
# Same pipeline for 4-grams. The n-grams are built over a paper's whole cleaned word
# stream, so one may span a line break; dedupe files it under the line of its first word.
agg_words_4gram, agg_wl_words_4 = [], []
for coin, words_ls in words_bank.items():
    words_context_dict = context_ref[coin]
    deduped_words_4, deduped_appearance_dict_4 = dedupe(
        list(ngrams(words_ls, 4)), words_context_dict
    )
    raw_words_4gram, wl_words_4, wl_appearance_dict_4 = filter_wl_bl_words(
        deduped_words_4, deduped_appearance_dict_4, wl, bl
    )
    agg_words_4gram += raw_words_4gram
    agg_wl_words_4 += wl_words_4
    enrich_lookup_dict(lookup_dict, wl_words_4, coin, wl_appearance_dict_4)


In [24]:
# Number of distinct 4-grams, summed over all whitepapers in `filenames`
# (a 4-gram is counted once per paper it appears in)
print(len(agg_words_4gram))

114807


In [25]:
# Number of 4-grams that are whitelisted and not blacklisted, summed over all
# whitepapers in `filenames` (once per paper)
print(len(agg_wl_words_4))

98
