In [1]:
import datetime as dt
from collections import Counter

import gensim as gs
import matplotlib.pyplot as plt
import pandas as pd

import google_bigquery_access as gbq

# Ways of characterising what GitHub projects are actually about

How do we know what repositories are about? Can we even know whether a repository has any actual software in it?

In [2]:
# Sample size for this exploration. It is used both in the SQL LIMIT clause
# and in the query_table call, so it is defined once here instead of being
# repeated as a bare literal in two places.
ROW_LIMIT = 5000

# Pull name, description and language for a sample of repositories from the
# public GitHub timeline table.
query = """select repository_name, repository_description, repository_language 
from [publicdata:samples.github_timeline]
limit {row_limit};""".format(row_limit=ROW_LIMIT)

# NOTE(review): the second argument is presumably the maximum number of rows
# to fetch -- confirm against google_bigquery_access.query_table.
repo_df = gbq.query_table(query, ROW_LIMIT)

executing query:
select repository_name, repository_description, repository_language 
from [publicdata:samples.github_timeline]
limit 5000;
has a rows attribute
5000 of   5000 (5000, 3)


In [13]:
# Replace missing descriptions with a single space so that every entry is a
# string and can be lower-cased and split below.
repo_df.repository_description = repo_df.repository_description.fillna(' ')

# Common English filler words (plus a bare '-') to drop before modelling.
stoplist = set('for from but an on is or that a of the and to in with this that be using -'.split())

# Tokenise: lower-case each description, split on whitespace, drop stop words.
texts = [[word for word in document.lower().split() if word not in stoplist]
         for document in repo_df.repository_description.tolist()]

# remove words that appear only once
# Flatten with a single comprehension and tally with Counter so the whole
# step is linear in the number of tokens (sum(texts, []) followed by
# list.count() for every distinct word is quadratic).
all_tokens = [word for text in texts for word in text]
token_counts = Counter(all_tokens)
tokens_once = set(word for word, count in token_counts.items() if count == 1)
texts = [[word for word in text if word not in tokens_once]
         for text in texts]
print(texts[0:10])

[[u'foo'], [u'php', u'bindings'], [u'playground', u'redis', u'c.'], [u'simple', u'powerful', u'android'], [], [u'personal', u'configuration', u'files.'], [], [u'python', u'api', u'getting', u'zipcodes'], [], [u'automated', u'checker']]


In [14]:
# Build the vocabulary (token <-> integer id) from the cleaned descriptions.
dictionary = gs.corpora.Dictionary(texts)
print(dictionary)

# Encode each description as a bag-of-words vector using that vocabulary.
git_corpus = list(map(dictionary.doc2bow, texts))

# Write the encoded corpus to disk via gensim's BleiCorpus serializer.
gs.corpora.BleiCorpus.serialize('data/github_desc.lda_c', git_corpus)

Dictionary(2915 unique tokens: [u'zendesk', u'assembler', u'dynamic', u'monte', u'breakout']...)


In [15]:
# Fit an LDA topic model on the bag-of-words corpus: 15 topics, 50 passes
# over the data. NOTE(review): update_every=0 should select batch rather than
# online updates according to the gensim docs -- confirm for the installed
# version.
lda_model = gs.models.ldamodel.LdaModel(
    corpus=git_corpus,
    id2word=dictionary,
    num_topics=15,
    update_every=0,
    passes=50
)


In [17]:
# Show every topic alongside its index. The topic count is read from the
# fitted model rather than repeated as a literal, so this listing stays in
# step with whatever num_topics the model was trained with.
[[i, lda_model.print_topic(i)] for i in range(lda_model.num_topics)]

[[0,
  u'0.080*plugin + 0.052*interface + 0.043*mirror + 0.038*email + 0.036*adding + 0.035*git-svn + 0.035*trac + 0.035*email2trac, + 0.032*jquery + 0.016*website'],
 [1,
  u'0.019*your + 0.016*test + 0.013*extension + 0.012*password + 0.010*ios + 0.010*node + 0.010*like + 0.009*estimation + 0.009*realistic + 0.009*strength'],
 [2,
  u'0.026*server + 0.021*data + 0.020*use + 0.014*easy + 0.013*api + 0.011*engine + 0.011*project + 0.011*source + 0.011*open + 0.010*node.js'],
 [3,
  u'0.034*javascript + 0.028*web + 0.023*application + 0.022*framework + 0.021*python + 0.017*testing + 0.014*into + 0.011*open + 0.011*node.js + 0.010*xml'],
 [4,
  u'0.021*as + 0.017*blog + 0.015*rails + 0.014*library + 0.013*your + 0.012*personal + 0.010*java + 0.010*tool + 0.009*simple + 0.009*data'],
 [5,
  u'0.031*framework + 0.016*web + 0.015*building + 0.014*management + 0.014*your + 0.011*php + 0.011*html5 + 0.011*work + 0.010*applications + 0.010*simple,'],
 [6,
  u'0.032*see + 0.030*here + 0.030*not