In [1]:
import gensim
import json
import os
import nltk
import string
import re
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk.data

In [2]:
whitepaper_names = ['avalanche.txt', 'binance_coin.txt', 'bitcoin.txt', 'cardano.txt', 
               'ethereum.txt', 'polkadot.txt', 'ripple.txt', 'shiba_inu.txt', 
               'solana.txt', 'terra.txt', 'tether.txt', 'usdcoin.txt', 
               'wrapped_tokens.txt']

In [3]:
# Load in most frequent tokens
with open('word_freq.json') as f:
    word_freq_dict = json.load(f)

In [4]:
stopword = nltk.corpus.stopwords.words('english')

def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

def tokenization(text):
    text = text.strip()
    text = re.split('\W+', text)
    return text

def remove_stopwords(text):
    text = [word for word in text if word not in stopword]
    return text

ps = nltk.PorterStemmer()

def stemming(text):
    text = [ps.stem(word) for word in text]
    return text

wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text

In [5]:
def split_sentences(text):
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return tokenizer.tokenize(text)

def tokenize_sentences(sentences):
    tokens = []
    for sentence in sentences:
        tokens.append(lemmatizer(remove_stopwords(tokenization(remove_punct(sentence).lower()))))
    return tokens

res = []
for paper in whitepaper_names: 
    with open(paper) as f:
        data = f.read()
        sentences = split_sentences(data)
        res += tokenize_sentences(sentences)
        
print(len(res))

5366


In [6]:
gensim_model = gensim.models.Word2Vec(res, window=10, min_count=5, seed=42)

In [7]:
gensim_model.wv.most_similar('bitcoin', topn=10)

[('ethereum', 0.9995549917221069),
 ('blockchain', 0.9993343353271484),
 ('would', 0.9993300437927246),
 ('account', 0.9993246793746948),
 ('system', 0.9993075132369995),
 ('miner', 0.9993074536323547),
 ('provide', 0.9992964863777161),
 ('currency', 0.9992947578430176),
 ('fee', 0.9992919564247131),
 ('decentralized', 0.9992455244064331)]

In [8]:
# note this is manually created
# current issue: two word phrases aren't captured 
ontology_classes = ['cryptocurrency', 'person', 'protocol', 'electroniccoin', 'distributionscheme', 
                    'repository', 'organization', 'version', 'tradingplatform', 'wallet', 'algorithm', 
                    'cryptographichashfunction', 'protectionscheme', 'hashfunction', 'posscheme',
                    'poascheme', 'powscheme']

In [9]:
in_vocab = []
oo_vocab = []
for class_name in ontology_classes:
    if class_name in gensim_model.wv:
        in_vocab.append(class_name)
    else:
        oo_vocab.append(class_name)
print(in_vocab)
print(oo_vocab)

['cryptocurrency', 'person', 'protocol', 'organization', 'version', 'wallet', 'algorithm']
['electroniccoin', 'distributionscheme', 'repository', 'tradingplatform', 'cryptographichashfunction', 'protectionscheme', 'hashfunction', 'posscheme', 'poascheme', 'powscheme']


In [10]:
# now we find alternative names for the ontology:
# electroniccoin -> coin
print('coin' in gensim_model.wv)

# distributionscheme -> distribution
print('distribution' in gensim_model.wv)

# repository? -> drop from ontology for now

# tradingplatform -> platform
print('platform' in gensim_model.wv)

# cryptographichashfunction -> hash 
print('hash' in gensim_model.wv)

# protectionscheme -> protection
print('protection' in gensim_model.wv)

# posscheme
# poascheme
# powscheme
# convert all to -> po_ (ex. poa, pos, pow)
print('poa' in gensim_model.wv) #drop poa
print('state' in gensim_model.wv) #drop pos
print('pow' in gensim_model.wv)

True
True
True
True
True
False
True
True


In [11]:
new_ontology_classes = in_vocab + ['coin', 'distribution', 'platform', 'hash', 'protection', 'pow']
print(new_ontology_classes)

['cryptocurrency', 'person', 'protocol', 'organization', 'version', 'wallet', 'algorithm', 'coin', 'distribution', 'platform', 'hash', 'protection', 'pow']


In [12]:
print(word_freq_dict['currency'])
print(gensim_model.wv.most_similar('currency', topn=10))

148
[('bitcoin', 0.9992946982383728), ('token', 0.9991058707237244), ('fiat', 0.9990580677986145), ('exchange', 0.9990465641021729), ('asset', 0.9990149140357971), ('unit', 0.9989375472068787), ('account', 0.9989220499992371), ('decentralized', 0.9989050030708313), ('fee', 0.9988842606544495), ('ethereum', 0.9988497495651245)]


In [13]:
def match_finder(token, remaining_depth, neighbor_thresh=10):
    if token in new_ontology_classes:   
        return True
    else:
        if remaining_depth >= 0:
            neighbors = gensim_model.wv.most_similar(token, topn=neighbor_thresh)
            status = False
            for neighbor, _ in neighbors:
                if match_finder(neighbor, remaining_depth-1, neighbor_thresh):
                    status = True
            return status
        else:
            return False
        


def count_matches(freq_thresh=30, neighbor_thresh = 10, depth=3):
    total = 0.0
    found = 0
    for token, freq in word_freq_dict.items():
        if freq >= freq_thresh and token in gensim_model.wv:
            if match_finder(token, depth, neighbor_thresh):    
                found += 1
            total += 1
    print(found)
    print(total)
    return found/total

count_matches()

429
458.0


0.9366812227074236

In [14]:
count_matches(100, 10, 3)

72
82.0


0.8780487804878049