## Introduction

In [0]:
# Mount Google Drive at /content/drive; the input and output paths used by the
# later cells are all located under this mount point.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
import numpy as np
import pandas as pd
from gensim.models import LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.mmcorpus import MmCorpus
from gensim.test.utils import datapath
from gensim.sklearn_api import LdaTransformer
import csv
from itertools import chain
from collections import OrderedDict
import shelve

# Fit all 4 LDA models

In [0]:
path = '/content/drive/My Drive/sample_data/'
out_path = "/content/drive/My Drive/Word regression/word_regr_objects/"


def _load_and_fit_lda(dataset, num_topics, eta):
  """Load one dataset's dictionary, corpus and texts, then fit its LDA model.

  Args:
    dataset: File-name prefix of the dataset, e.g. 'SO_recent' or 'GH_past'.
    num_topics: Number of LDA topics for this dataset.
    eta: Value passed as the LDA ``eta`` hyperparameter for this dataset.

  Returns:
    Tuple ``(dictionary, corpus, texts, lda)`` where ``texts`` is the list of
    rows read from the dataset's CSV file and ``lda`` is the fitted
    LdaTransformer.
  """
  dictionary = Dictionary.load(path + dataset + '_full_activity_gensimDictionary.dict')
  # NOTE(review): datapath is imported from gensim.test.utils; it is kept here
  # exactly as before. It presumably leaves an absolute path unchanged -- confirm
  # and drop the call if so.
  corpus = MmCorpus(datapath(path + 'corpus_' + dataset + '.mm'))
  # newline='' is what the csv module documentation asks for when opening files.
  with open(path + 'texts_' + dataset + '.csv', 'r', newline='') as f:
    texts = list(csv.reader(f))
  # alpha, iterations and the seed are shared by all four models; only the
  # topic count and eta differ per dataset.
  lda = LdaTransformer(id2word=dictionary, num_topics=num_topics, alpha='auto', eta=eta,
                       iterations=100, random_state=2019).fit(corpus)
  return dictionary, corpus, texts, lda


SO_recent_dict, SO_recent_corpus, SO_recent_texts, SO_recent_lda = _load_and_fit_lda(
    'SO_recent', num_topics=4, eta=1)

SO_past_dict, SO_past_corpus, SO_past_texts, SO_past_lda = _load_and_fit_lda(
    'SO_past', num_topics=20, eta=0.05)

GH_recent_dict, GH_recent_corpus, GH_recent_texts, GH_recent_lda = _load_and_fit_lda(
    'GH_recent', num_topics=25, eta=0.001)

GH_past_dict, GH_past_corpus, GH_past_texts, GH_past_lda = _load_and_fit_lda(
    'GH_past', num_topics=25, eta=0.005)


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
  diff = np.log(self.expElogbeta)


# RQ2: SO - GH comparison

## Model trained on GH_past data; infer the SO_past distribution

In [0]:
# Encode every SO_past text with the GH_past dictionary so that the model
# trained on GH_past can run inference on it.
unseen_corpus = list(map(GH_past_dict.doc2bow, SO_past_texts))

GH_past_word_regression, SO_past_word_regression = [], []

# 83550 is hard-coded; presumably the number of users in each dataset -- confirm.
for user_i in range(83550):
  if not user_i % 10000:
    print(user_i)  # progress marker every 10,000 users

  # Both calls use the GH_past-trained model: first on the user's GH_past
  # document, then on the same index of the re-encoded SO_past texts.
  GH_past_regr = get_word_regression(GH_past_lda.gensim_model, GH_past_corpus[user_i], coeff_threshold=0)
  SO_past_regr = get_word_regression(GH_past_lda.gensim_model, unseen_corpus[user_i], coeff_threshold=0)

  # Merge repeated words into a single {word: coefficient} dict per user.
  GH_past_word_regression.append(remove_duplicate(GH_past_regr))
  SO_past_word_regression.append(remove_duplicate(SO_past_regr))

In [0]:
# Directory the word-regression shelves are written to and read from
# (same value as the out_path assigned in the model-fitting cell).
out_path = "/content/drive/My Drive/Word regression/word_regr_objects/"

In [0]:
# Persist both RQ2 "past" result lists as shelve files under out_path.
store_word_regression_data(out_path + "RQ2_GH_past_word_regression.shlf", GH_past_word_regression)
store_word_regression_data(out_path + "RQ2_SO_past_word_regression.shlf", SO_past_word_regression)

In [0]:
# Spot-check: display the stored entry for user index 83549 from the RQ2 SO_past shelf.
load_word_regression_data(out_path + "RQ2_SO_past_word_regression.shlf", 83549)
# Alternative spot-check of the same user index in the RQ2 GH_past shelf:
#load_word_regression_data(out_path + "RQ2_GH_past_word_regression.shlf", 83549)

{'': 0.0237885322,
 'buildpack': 0.0094098756,
 'client': 0.0035514417,
 'elixir': 0.0276337732,
 'erlang': 0.0246416386,
 'functional': 0.005820177,
 'haskell': 0.0491768643,
 'heroku': 0.0126538109,
 'implementation': 0.0023452812,
 'language': 0.0060202638,
 'library': 0.0080864532,
 'nix': 0.0045355731,
 'package': 0.002946109,
 'parser': 0.0027511988,
 'phoenix': 0.0032930241,
 'programming': 0.0054384912,
 'purescript': 0.0044085486,
 'simple': 0.0023033172,
 'type': 0.0052554081,
 'web': 0.0031257907}

## Model trained on GH_recent data; infer the SO_recent distribution

In [0]:
# Encode every SO_recent text with the GH_recent dictionary so that the model
# trained on GH_recent can run inference on it.
unseen_corpus = list(map(GH_recent_dict.doc2bow, SO_recent_texts))

GH_recent_word_regression, SO_recent_word_regression = [], []

# 83550 is hard-coded; presumably the number of users in each dataset -- confirm.
for user_i in range(83550):
  if not user_i % 10000:
    print(user_i)  # progress marker every 10,000 users

  # Both calls use the GH_recent-trained model: first on the user's GH_recent
  # document, then on the same index of the re-encoded SO_recent texts.
  GH_recent_regr = get_word_regression(GH_recent_lda.gensim_model, GH_recent_corpus[user_i], coeff_threshold=0)
  SO_recent_regr = get_word_regression(GH_recent_lda.gensim_model, unseen_corpus[user_i], coeff_threshold=0)

  # Merge repeated words into a single {word: coefficient} dict per user.
  GH_recent_word_regression.append(remove_duplicate(GH_recent_regr))
  SO_recent_word_regression.append(remove_duplicate(SO_recent_regr))

# Persist both RQ2 "recent" result lists as shelve files under out_path.
store_word_regression_data(out_path + "RQ2_GH_recent_word_regression.shlf", GH_recent_word_regression)
store_word_regression_data(out_path + "RQ2_SO_recent_word_regression.shlf", SO_recent_word_regression)

In [0]:
# Spot-check: display the stored entry for user index 125 from the RQ2 GH_recent shelf.
load_word_regression_data(out_path + "RQ2_GH_recent_word_regression.shlf", 125)
# Alternative spot-check against the RQ2 SO_recent shelf (user index 859):
#load_word_regression_data(out_path + "RQ2_SO_recent_word_regression.shlf", 859)

{'amazon': 0.0012867418,
 'angular': 0.0050027617,
 'angularjs': 0.0009826729,
 'api': 0.0021960936,
 'app': 0.005749191,
 'assistant': 0.0007266592,
 'atom': 0.0052818246,
 'aws': 0.0047304109,
 'bootstrap': 0.0018724048,
 'browser': 0.0019516614,
 'build': 0.0029866964,
 'color': 0.0029396568,
 'common': 0.003246625,
 'component': 0.0060926754,
 'conda': 0.000893589,
 'config': 0.0038704758,
 'configuration': 0.0030196421,
 'create': 0.0021638863,
 'css': 0.0026767738,
 'dart': 0.0009208519,
 'demo': 0.0009453199,
 'dotfile': 0.0080754384,
 'drive': 0.0006759015,
 'elasticsearch': 0.0007920112,
 'electron': 0.001746046,
 'elixir': 0.0307881534,
 'emac': 0.022376338,
 'erlang': 0.0085338484,
 'file': 0.004122281,
 'graphql': 0.0017851229,
 'home': 0.0012457537,
 'javascript': 0.053678025,
 'js': 0.014670685,
 'lambda': 0.0018765685,
 'language': 0.0036205119,
 'lisp': 0.0130438348,
 'material': 0.0009862843,
 'mode': 0.0067171496,
 'module': 0.0021978107,
 'node': 0.0096710576,
 'ocam

# RQ4: past - recent comparison

## Model trained on GH_past data; infer the GH_recent distribution

In [0]:
# Encode every GH_recent text with the GH_past dictionary so that the model
# trained on GH_past can run inference on it.
unseen_corpus = list(map(GH_past_dict.doc2bow, GH_recent_texts))

GH_past_word_regression, GH_recent_word_regression = [], []

# 83550 is hard-coded; presumably the number of users in each dataset -- confirm.
for user_i in range(83550):
  if not user_i % 10000:
    print(user_i)  # progress marker every 10,000 users

  # Both calls use the GH_past-trained model: first on the user's GH_past
  # document, then on the same index of the re-encoded GH_recent texts.
  GH_past_regr = get_word_regression(GH_past_lda.gensim_model, GH_past_corpus[user_i], coeff_threshold=0)
  GH_recent_regr = get_word_regression(GH_past_lda.gensim_model, unseen_corpus[user_i], coeff_threshold=0)

  # Merge repeated words into a single {word: coefficient} dict per user.
  GH_past_word_regression.append(remove_duplicate(GH_past_regr))
  GH_recent_word_regression.append(remove_duplicate(GH_recent_regr))

# Persist both RQ4 GitHub result lists as shelve files under out_path.
store_word_regression_data(out_path + "RQ4_GH_past_word_regression.shlf", GH_past_word_regression)
store_word_regression_data(out_path + "RQ4_GH_recent_word_regression.shlf", GH_recent_word_regression)

0


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


10000
20000
30000
40000
50000
60000
70000
80000


In [0]:
# Alternative spot-check against the RQ4 GH_past shelf (user index 6):
#load_word_regression_data(out_path + "RQ4_GH_past_word_regression.shlf", 6)
# Spot-check: display the stored entry for user index 6 from the RQ4 GH_recent shelf.
load_word_regression_data(out_path + "RQ4_GH_recent_word_regression.shlf", 6)

{'action': 0.0008827072,
 'add': 0.0009525315,
 'api': 0.0071389731,
 'attribute': 0.0008851025,
 'awesome': 0.0141964955,
 'base': 0.0053869509,
 'class': 0.0014162188,
 'code': 0.0034181331,
 'controller': 0.0008385227,
 'create': 0.0010460034,
 'curate': 0.0049053538,
 'datum': 0.0063079167,
 'email': 0.0008852978,
 'end': 0.0008841671,
 'flask': 0.0092511885,
 'form': 0.0013869156,
 'framework': 0.0052413559,
 'hash': 0.0010632548,
 'helper': 0.0010530005,
 'learn': 0.0032631389,
 'library': 0.0088921441,
 'list': 0.0094571495,
 'method': 0.0017355529,
 'model': 0.0029195445,
 'network': 0.0042063603,
 'object': 0.0007920472,
 'project': 0.0044240803,
 'py': 0.0094305091,
 'python': 0.2335200757,
 'ruby': 0.0011501622,
 'script': 0.0033761661,
 'simple': 0.0063847443,
 'software': 0.0037529266,
 'test': 0.0030670527,
 'tool': 0.005122175,
 'url': 0.0008164259,
 'user': 0.0024629948,
 'validation': 0.0008718641,
 'view': 0.000803977,
 'web': 0.0064281155}

## Model trained on SO_past data; infer the SO_recent distribution

In [0]:
# Encode every SO_recent text with the SO_past dictionary so that the model
# trained on SO_past can run inference on it.
unseen_corpus = list(map(SO_past_dict.doc2bow, SO_recent_texts))

SO_past_word_regression, SO_recent_word_regression = [], []

# 83550 is hard-coded; presumably the number of users in each dataset -- confirm.
for user_i in range(83550):
  if not user_i % 10000:
    print(user_i)  # progress marker every 10,000 users

  # Both calls use the SO_past-trained model: first on the user's SO_past
  # document, then on the same index of the re-encoded SO_recent texts.
  SO_past_regr = get_word_regression(SO_past_lda.gensim_model, SO_past_corpus[user_i], coeff_threshold=0)
  SO_recent_regr = get_word_regression(SO_past_lda.gensim_model, unseen_corpus[user_i], coeff_threshold=0)

  # Merge repeated words into a single {word: coefficient} dict per user.
  SO_past_word_regression.append(remove_duplicate(SO_past_regr))
  SO_recent_word_regression.append(remove_duplicate(SO_recent_regr))

# Persist both RQ4 Stack Overflow result lists as shelve files under out_path.
store_word_regression_data(out_path + "RQ4_SO_past_word_regression.shlf", SO_past_word_regression)
store_word_regression_data(out_path + "RQ4_SO_recent_word_regression.shlf", SO_recent_word_regression)

0


  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


10000
20000
30000
40000
50000
60000
70000
80000


In [0]:
# Spot-check: display the stored entry for user index 83549 from the RQ4 SO_past shelf.
load_word_regression_data(out_path + "RQ4_SO_past_word_regression.shlf", 83549)
# Alternative spot-check against the RQ4 SO_recent shelf (user index 6):
#load_word_regression_data(out_path + "RQ4_SO_recent_word_regression.shlf", 6)

{'': 0.0400844216,
 'array': 0.0093632219,
 'autobiographer': 0.0120928707,
 'caucus': 0.0157425925,
 'citizen': 0.0044998135,
 'code': 0.0273610223,
 'codefunction': 0.00337094,
 'codevar': 0.0043504983,
 'commentator': 0.0134238461,
 'console': 0.004614945,
 'constituent': 0.0036912041,
 'critic': 0.0115582095,
 'curious': 0.0076107634,
 'custodian': 0.0078194896,
 'datum': 0.0035620942,
 'element': 0.0048173727,
 'enthusiast': 0.0051418482,
 'event': 0.006072463,
 'excavator': 0.0036483933,
 'famous': 0.0078850919,
 'function': 0.0267341528,
 'inform': 0.0038490444,
 'javascript': 0.0115806554,
 'jquery': 0.005289976,
 'js': 0.0034958138,
 'log': 0.0033767393,
 'necromancer': 0.0111162681,
 'nice': 0.0072422293,
 'notable': 0.0382184796,
 'object': 0.0072750384,
 'organizer': 0.0035382025,
 'patrol': 0.0044980538,
 'property': 0.0024744666,
 'prototype': 0.002803324,
 'return': 0.0081084212,
 'revival': 0.010696589,
 'test': 0.0025157311,
 'tumbleweed': 0.0105090374,
 'var': 0.01751

# Get each user's topic-word "regression model"

In [0]:
def get_word_regression(lda_model, corpus_data, coeff_threshold, min_topic_prob=0.1, topn=20):
  """Build a document's topic-word "regression" as (word, coefficient) pairs.

  For every topic the model assigns to the document, the topic's top words are
  fetched and each word's coefficient is computed as
  ``topic probability * word probability within that topic``.

  Args:
    lda_model: Object exposing ``get_document_topics(bow, minimum_probability=...)``
      and ``show_topic(topicid=..., topn=...)`` (a gensim LdaModel in this notebook).
    corpus_data: The document to score, passed straight to ``get_document_topics``.
    coeff_threshold: Only pairs whose coefficient is strictly greater than this
      value are kept.
    min_topic_prob: Passed as ``minimum_probability`` to ``get_document_topics``.
      Defaults to 0.1, the value that was previously hard-coded.
    topn: Number of top words requested per topic. Defaults to 20, the value
      that was previously hard-coded.

  Returns:
    List of ``(word, coefficient)`` tuples in topic order, then word order. A
    word that appears in several topics occurs once per topic; the entries are
    not merged here.
  """
  word_regression = []
  users_topic_distr = lda_model.get_document_topics(corpus_data, minimum_probability=min_topic_prob)

  for topic_num, topic_prob in users_topic_distr:
    for t_word, t_word_prob in lda_model.show_topic(topicid=topic_num, topn=topn):
      # Weight the word's within-topic probability by the topic's share of the document.
      t_word_coeff = topic_prob * t_word_prob
      if t_word_coeff > coeff_threshold:
        word_regression.append((t_word, t_word_coeff))
  return word_regression

In [0]:
def remove_duplicate(word_regression):
  """Merge repeated words by summing their coefficients.

  Args:
    word_regression: Iterable of ``(word, coefficient)`` pairs; a word may
      appear more than once.

  Returns:
    Dict mapping each word to the sum of its coefficients. The running total is
    rounded to 10 decimal places after every addition, and keys keep the order
    in which each word first appeared.
  """
  totals = {}
  for word, coeff in word_regression:
    # A word not seen yet starts from 0, then accumulates like any other.
    totals[word] = round(totals.get(word, 0) + coeff, 10)
  return totals

In [0]:
def load_word_regression_data(path, user_id):
  """Read one user's stored word regression from a shelve file.

  Args:
    path: Path of the shelf, as given to ``store_word_regression_data``.
    user_id: Index of the user; it is converted with ``str()`` to form the key.

  Returns:
    The object stored under ``str(user_id)``.

  Raises:
    KeyError: If the shelf has no entry for ``str(user_id)``.
    dbm.error: If no shelf exists at ``path``.
  """
  # Open read-only: the default flag ('c') would silently create an empty shelf
  # when the path is mistyped, and only then fail with a KeyError.
  with shelve.open(path, flag='r') as db:
    return db[str(user_id)]

In [0]:
def store_word_regression_data(path, word_regression):
  """Write per-user word regressions to a shelve file, keyed by list position.

  Args:
    path: Path of the shelf to create or update.
    word_regression: Sequence of per-user objects; the item at position ``i``
      is stored under the key ``str(i)``.

  Every item of ``word_regression`` is stored, whatever its length. The count
  was previously hard-coded to 83550, which raised IndexError for shorter lists
  and dropped the tail of longer ones.
  """
  with shelve.open(path) as db:
    for user_id, user_regression in enumerate(word_regression):
      db[str(user_id)] = user_regression