# Import library

In [1]:
# import nltk
# nltk.download()

In [2]:
import warnings
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import collections
import re
import os
import string
pd.set_option('display.max_colwidth', 200)
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords # Import the stop word list
from collections import Counter
from nltk.util import ngrams
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# Parse Whitelist Words

In [3]:
def read_words_list(filenames):
    result_ls = set()
    for filename in filenames:
        with open(filename, 'r') as f:
            for line in f:
                words = line.strip().split(',')
                if len(words) == 1:
                    result_ls.add(words[0])
                else:
                    result_ls.add(tuple(words))
    return result_ls

In [4]:
wl = read_words_list(["whitelist/word1_w.txt", "whitelist/word2_w.txt", "whitelist/word3_w.txt", "whitelist/word4_w.txt"])

# Parse Blacklist Words

In [5]:
bl = read_words_list(["blacklist/word1_b.txt", "blacklist/word2_b.txt", "blacklist/word3_b.txt", "blacklist/word4_b.txt"])

# Parse incoming whitepapers

In [6]:
def read_whitepapers(filename, stopwords):
    directory = "../whitepapers/top20_whitepapers/"
    words_list = []
    words_context_dict = {}
    # context_tuples_ref = []
    word_idx = 0
    context_idx = 0
    for entry in os.scandir(directory):
        if (entry.path.endswith(filename) and entry.is_file()):
            with open(entry.path, "r") as f:
                for line in f:
                    # context_tuples_ref += [line]
                    temp_words = extract_and_clean(line, stopwords)
                    words_list.extend(temp_words)
                    for i in range(len(temp_words)):
                        words_context_dict[word_idx] = context_idx
                        word_idx += 1
                    context_idx += 1
    return words_list, words_context_dict #, context_tuples_ref

In [7]:
def extract_and_clean(line, stopwords):
    # words = [x.strip() for x in re.split(',| |\. |\: ', line) if x]
    # words = map(str.lower, words)
    # words = [x.replace('-', '') for x in words]
    words = word_tokenize(re.sub(r'[^\w\s]', '', line.lower()))
    # words = [x.replace('-', '') for x in words]
    words = [word for word in words if not word in stopwords]
    # ADJ, ADJ_SAT, ADV, NOUN, VERB = 'a', 's', 'r', 'n', 'v'
    words = [lemmatizer.lemmatize(w, pos='s') for w in words]
    words = [lemmatizer.lemmatize(w, pos='n') for w in words]
    words = [lemmatizer.lemmatize(w, pos='v') for w in words]
    words = [lemmatizer.lemmatize(w, pos='a') for w in words]
    return words

In [8]:
def dedupe(words, context_ref):
    appearance_dict = {}
    for i in range(len(words)):
        appearance_dict.setdefault(words[i],[]).append(context_ref[i])
    return list(set(words)), appearance_dict

In [9]:
def filter_wl_bl_words(words, context_ref, wl, bl):
    filtered_words = []
    filtered_context = {}
    for w in words:
        if w in wl and w not in bl:
            filtered_words.append(w)
            filtered_context[w] = context_ref[w]
    return filtered_words, filtered_context

In [10]:
stopwords_set = set(stopwords.words('english'))

In [11]:
filename = 'Bitcoin.txt'

In [12]:
words_ls, words_context_dict = read_whitepapers(filename, stopwords_set)

In [13]:
def enrich_lookup_dict(lookup_dict, words, paper_name, appearance_dict):
    for w in words:
        lookup_dict.setdefault(w,[]).append((paper_name, appearance_dict[w]))

In [14]:
lookup_dict = {}

## Single word white list

In [15]:
deduped_words, deduped_appearance_dict = dedupe(words_ls, words_context_dict)
wl_words, wl_appearance_dict = filter_wl_bl_words(deduped_words, deduped_appearance_dict, wl, bl)
enrich_lookup_dict(lookup_dict, wl_words, filename, wl_appearance_dict)

In [16]:
wl_words[:10]

['serve',
 'great',
 'essentially',
 'decision',
 'vulnerable',
 'doesnt',
 '4th',
 'representation',
 'minimal',
 'ii']

## 2-gram

In [17]:
deduped_words_2, deduped_appearance_dict_2 = dedupe(list(ngrams(words_ls, 2)), words_context_dict)
wl_words_2, wl_appearance_dict_2 = filter_wl_bl_words(deduped_words_2, deduped_appearance_dict_2, wl, bl)
enrich_lookup_dict(lookup_dict, wl_words_2, filename, wl_appearance_dict_2)

In [18]:
wl_words_2[:10]

[('transaction', 'block'),
 ('chain', 'fast'),
 ('history', 'transaction'),
 ('block', 'ahead'),
 ('two', 'node'),
 ('block', 'header'),
 ('node', 'receive'),
 ('payment', 'verification'),
 ('propose', 'solution'),
 ('party', 'involve')]

## 3-gram

In [19]:
deduped_words_3, deduped_appearance_dict_3 = dedupe(list(ngrams(words_ls, 3)), words_context_dict)
wl_words_3, wl_appearance_dict_3 = filter_wl_bl_words(deduped_words_3, deduped_appearance_dict_3, wl, bl)
enrich_lookup_dict(lookup_dict, wl_words_3, filename, wl_appearance_dict_3)

In [20]:
wl_words_3[:10]

[('transaction', 'hash', 'merkle'),
 ('proof', 'instead', 'trust'),
 ('symposium', 'security', 'privacy'),
 ('simplify', 'payment', 'verification'),
 ('root', 'include', 'block'),
 ('block', 'header', 'transaction'),
 ('directly', 'without', 'need'),
 ('without', 'need', 'trust'),
 ('ieee', 'computer', 'society'),
 ('key', 'private', 'key')]

## 4-gram

In [21]:
deduped_words_4, deduped_appearance_dict_4 = dedupe(list(ngrams(words_ls, 4)), words_context_dict)
wl_words_4, wl_appearance_dict_4 = filter_wl_bl_words(deduped_words_4, deduped_appearance_dict_4, wl, bl)
enrich_lookup_dict(lookup_dict, wl_words_4, filename, wl_appearance_dict_4)

In [22]:
wl_words_4[:10]

[('trust', 'allow', 'two', 'will'),
 ('peertopeer', 'electronic', 'cash', 'system'),
 ('bitcoin', 'peertopeer', 'electronic', 'cash'),
 ('allow', 'two', 'will', 'party'),
 ('conference', 'computer', 'communication', 'security'),
 ('cryptographic', 'proof', 'instead', 'trust'),
 ('merkle', 'root', 'merkle', 'root'),
 ('party', 'transact', 'directly', 'without'),
 ('acm', 'conference', 'computer', 'communication'),
 ('base', 'cryptographic', 'proof', 'instead')]

In [23]:
lookup_dict[('verify', 'one')]

[('Bitcoin.txt', [55])]

# Aggregate & Look up context

In [24]:
def lookup_context(lookup_dict, words, filename):
    directory = "../whitepapers/top20_whitepapers/"
    appearances = lookup_dict[words]
    # For each white paper, read and print relevant contents
    for appear in appearances:
        filename = appear[0]
        idxs = appear[1]
        context_idx = 0
        for entry in os.scandir(directory):
            if (entry.path.endswith(filename) and entry.is_file()):
                with open(entry.path, "r") as f:
                    for line in f:
                        if context_idx in idxs:
                            print(filename + " [line " + str(context_idx) + "] : " + line)
                        context_idx += 1

In [25]:
lookup_context(lookup_dict, ('computer', 'communication', 'security'), filename)

Bitcoin.txt [line 346] : on Computer and Communications Security, pages 28-35, April 1997.



In [28]:
lookup_context(lookup_dict, 'sha256', filename)

Bitcoin.txt [line 82] : The proof-of-work involves scanning for a value that when hashed, such as with SHA-256, the 



In [29]:
lookup_context(lookup_dict, 'anonymous', filename)

Bitcoin.txt [line 202] : another place: by keeping public keys anonymous.  The public can see that someone is sending 



In [30]:
agg_extracted_word = wl_words + wl_words_2 + wl_words_3 + wl_words_4
agg_extracted_word[:10]

['serve',
 'great',
 'essentially',
 'decision',
 'vulnerable',
 'doesnt',
 '4th',
 'representation',
 'minimal',
 'ii']

In [31]:
agg_extracted_word[600:610]

[('ieee', 'computer'),
 ('contain', 'multiple'),
 ('third', 'party'),
 ('check', 'transaction'),
 ('gambler', 'ruin'),
 ('number', 'coin'),
 ('rely', 'trust'),
 ('main', 'benefit'),
 ('solve', 'problem'),
 ('one', 'block')]