In [2]:
import string
from collections import Counter
import re
import nltk
from unidecode import unidecode

In [4]:
# Read the whitespace-separated stopword list (from http://www.ranks.nl/stopwords)
# and keep it as a set for membership tests.
with open('stopwords') as stopword_file:
    sw = stopword_file.read()
stopwords = {word for word in sw.split()}

In [5]:
def clean_text(_txt):
    """Normalise raw text into lowercase words separated by single spaces.

    Steps, in order:
      1. pass the text through ``unidecode`` (presumably ASCII
         transliteration -- see the unidecode package),
      2. delete apostrophes and turn every other ``string.punctuation``
         character into a space,
      3. delete all runs of digits,
      4. lowercase and collapse any whitespace to single spaces.

    Returns the cleaned string (empty if nothing survives).
    """
    # string.punctuation => '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    table = dict.fromkeys(map(ord, string.punctuation), ' ')
    # Apostrophes are removed outright rather than replaced, so "don't" -> "dont".
    table[ord("'")] = None

    ascii_text = unidecode(_txt)
    without_punct = ascii_text.translate(table)
    without_digits = re.sub(r'\d+', '', without_punct)
    words = without_digits.lower().split()
    return ' '.join(words)

def remove_stopwords(_txt):
    """Drop every whitespace-separated word of ``_txt`` found in ``stopwords``.

    ``stopwords`` is the module-level set loaded from the stopword file.
    The surviving words are returned joined by single spaces.
    """
    kept = [word for word in _txt.split() if word not in stopwords]
    return ' '.join(kept)

In [6]:
# Load the raw interview notes. The encoding is passed explicitly so the text
# decodes the same way regardless of the platform's default locale.
# NOTE(review): assumes the file is UTF-8 -- confirm against the original export.
with open('bounties_project_overview_for_research.txt', encoding='utf-8') as f:
    txt = f.read()

In [7]:
# Interview questions, one per line. They are cleaned with clean_text so that
# remove_questions can strip them out of text cleaned the same way.
questions_raw = '''What type of apps do you build:
Eg: Are they primarily blockchain-based? Blockchain backend with a "regular" web front-end? Front-end that uses the blockchain directly via Metamask et al?
How do you test things which interact with the blockchain:
Automated testing? How do the facilitate manual testing? (to get an idea how sophisticated their dev process is)
Brief walk through of the tools you use daily
If not already mentioned, are there any libraries you rely on regularly?
What isn't possible with current tools, but would be nice
Try to remember the last time you developed a feature for ethereum and you ran into some unexpected obstacle. What was it? How did you feel?
This Q used to be: what is most frustrating about developing on ETH? (changed so we’re not asking a leading question).
Are you running into problems with any of the following:
What tool’s / libraries are most frustrating
How could they be improved
What surprised you on the other end, what was way easier than expected?
What process do you use for validating the security of your smart contracts?
What was the hardest part about learning to develop with Ethereum
What was the first thing you built related to Ethereum, and when did you build it? (NEW)
What applications are you most excited about in the near term (ie. what do you think will be working soon)
Who are other people you think we should talk to (alternative:  Who is the best developer you know)
What other questions should we be asking?
What really pisses you off about ETH development
'''
# Note: the literal ends with a newline, so the split yields a trailing empty
# entry, which is cleaned and stored like any other line.
questions = [clean_text(line) for line in questions_raw.split('\n')]

def remove_questions(_txt):
    """Strip every known interview question out of ``_txt``.

    Each entry of the module-level ``questions`` list is removed wherever it
    occurs, as a literal substring. Literal replacement is used instead of
    ``re.sub`` so that a question containing regex metacharacters can never be
    misread as a pattern. Empty entries are skipped because they have nothing
    to remove.

    Returns the text with the questions deleted; surrounding whitespace is
    left untouched.
    """
    for q in questions:
        if q:
            _txt = _txt.replace(q, '')
    return _txt

In [8]:
with open('contacts.txt') as f:
    contacts_raw = f.read()

# Collect contact names from lines that start with digits, a literal dot and a
# space; the captured name is the following run of ASCII letters and spaces.
# The pattern is a raw string so that `\.` reaches the regex engine unchanged
# instead of being treated as an (invalid) string escape.
contacts = set()
for c in contacts_raw.split('\n'):
    m = re.match(r'[0-9]+\. ([a-zA-Z ]+).*', c)
    if m is not None:
        # Names are stored lowercased and trimmed.
        contacts.add(m.group(1).lower().strip())

In [9]:
# Full cleaning pipeline, innermost call first: normalise the text,
# strip the interview questions, then drop the stopwords.
txt2 = remove_stopwords(
    remove_questions(
        clean_text(txt)
    )
)

# Bigrams

In [10]:
# Count every pair of consecutive tokens in the cleaned text.
bigrams = nltk.bigrams(nltk.word_tokenize(txt2))
counter = Counter(bigrams)

In [11]:
# Show the 20 most frequent bigrams together with their counts.
counter.most_common(20)

[(('e', 'g'), 96),
 (('smart', 'contracts'), 73),
 (('smart', 'contract'), 58),
 (('right', 'now'), 45),
 (('state', 'channels'), 40),
 (('gas', 'limit'), 34),
 (('open', 'source'), 32),
 (('unit', 'tests'), 24),
 (('best', 'practices'), 24),
 (('code', 'coverage'), 21),
 (('front', 'end'), 21),
 (('dont', 'know'), 20),
 (('ui', 'issues'), 20),
 (('web', 'js'), 19),
 (('developing', 'eth'), 18),
 (('make', 'sure'), 18),
 (('frustrating', 'developing'), 17),
 (('geth', 'parity'), 17),
 (('chain', 'computation'), 17),
 (('json', 'rpc'), 17)]

# Trigrams

In [12]:
# Count every run of three consecutive tokens in the cleaned text.
# NOTE: this rebinds `counter`, replacing the bigram counts built earlier.
trigrams = nltk.trigrams(nltk.word_tokenize(txt2))
counter = Counter(trigrams)

In [13]:
# Show the 20 most frequent trigrams together with their counts.
counter.most_common(20)

[(('frustrating', 'developing', 'eth'), 17),
 (('specifically', 'talk', 'ui'), 15),
 (('talk', 'ui', 'issues'), 15),
 (('ui', 'issues', 'scaling'), 15),
 (('developing', 'eth', 'specifically'), 14),
 (('eth', 'specifically', 'talk'), 14),
 (('https', 'github', 'com'), 14),
 (('hardest', 'part', 'teaching'), 9),
 (('code', 'coverage', 'tool'), 7),
 (('chain', 'computation', 'state'), 6),
 (('computation', 'state', 'channels'), 6),
 (('people', 'think', 'talk'), 6),
 (('writing', 'smart', 'contracts'), 6),
 (('tools', 'libraries', 'frameworks'), 6),
 (('static', 'analysis', 'tools'), 5),
 (('ui', 'issues', 'struggled'), 5),
 (('exist', 'right', 'now'), 5),
 (('x', 'code', 'coverage'), 4),
 (('questions', 'frustrating', 'developing'), 4),
 (('gas', 'limit', 'chain'), 4)]

# Topic modelling (LSI and LDA)

In [14]:
# Separate the interview notes into one document per interview (needs more work?).
# A line consisting solely of '%%%%' opens a new interview; blank lines are dropped.
# The marker line itself is kept as the first line of each document, so a
# document must hold more than one line to count as non-empty.
docs_raw = []
current = None  # lines of the interview being collected; None until the first marker

for line in txt.split('\n'):
    if line == '%%%%':
        # A new marker closes the interview collected so far.
        if current is not None and len(current) > 1:
            docs_raw.append(current)
        current = []
    if current is not None and line != '':
        current.append(line)

# Flush the last interview, which is not followed by another marker.
if current is not None and len(current) > 1:
    docs_raw.append(current)

# Clean each interview the same way as the full text, then split it into tokens.
docs = [
    remove_stopwords(remove_questions(clean_text('\n'.join(doc)))).split()
    for doc in docs_raw
]

In [23]:
from gensim import corpora, models, similarities

In [24]:
# Build a gensim Dictionary from the tokenised interview documents.
gdict = corpora.Dictionary(docs)

In [25]:
# Convert every document with the dictionary's doc2bow (bag-of-words) method.
corpus = [gdict.doc2bow(doc) for doc in docs]

In [27]:
# Fit a TF-IDF model on the bag-of-words corpus.
tfidf = models.TfidfModel(corpus)

In [28]:
# Apply the fitted TF-IDF model to the whole corpus.
corpus_tfidf = tfidf[corpus]

In [29]:
# Fit a 10-topic LSI model on the TF-IDF corpus; gdict supplies the id-to-word mapping.
lsi = models.LsiModel(corpus_tfidf, id2word=gdict, num_topics=10)

In [30]:
# Display the topics of the fitted LSI model.
lsi.show_topics()

[(0,
  '0.076*"bounty" + 0.074*"geth" + 0.072*"remix" + 0.072*"data" + 0.071*"parity" + 0.067*"really" + 0.066*"tools" + 0.063*"metamask" + 0.063*"need" + 0.060*"chain"'),
 (1,
  '-0.406*"wasm" + -0.135*"ewasm" + 0.115*"john" + -0.108*"opcodes" + 0.102*"marcus" + -0.081*"stu" + -0.081*"stack" + 0.081*"metamask" + -0.080*"greg" + -0.080*"easily"'),
 (2,
  '-0.227*"marcus" + -0.187*"john" + -0.143*"ricardo" + -0.118*"david" + -0.110*"remix" + -0.110*"limit" + 0.101*"erc" + -0.092*"computation" + -0.089*"ethers" + -0.081*"frustrating"'),
 (3,
  '-0.185*"wasm" + -0.153*"david" + 0.138*"zeppelin" + -0.115*"chainsafe" + -0.111*"geth" + -0.103*"stu" + -0.094*"greg" + -0.093*"metamask" + -0.093*"phishing" + -0.088*"parity"'),
 (4,
  '-0.141*"john" + -0.133*"bounty" + -0.112*"phishing" + -0.097*"gitcoin" + 0.092*"auction" + -0.091*"open" + 0.085*"matt" + 0.083*"ricardo" + -0.082*"mew" + 0.081*"whisper"'),
 (5,
  '-0.215*"wasm" + 0.146*"david" + 0.131*"land" + 0.131*"chainsafe" + -0.128*"ewasm" 

In [None]:
# Fit a 10-topic LDA model on the bag-of-words corpus (not the TF-IDF one).
# random_state pins the seed so that re-running the cell is repeatable.
# NOTE(review): with several worker processes the result may still vary
# slightly between runs -- confirm against the gensim documentation.
lda = models.ldamulticore.LdaMulticore(corpus, id2word=gdict, num_topics=10, random_state=42)