In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!pip install pyLDAvis

In [0]:
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models.keyedvectors import KeyedVectors
from gensim.parsing.preprocessing import remove_stopwords, strip_numeric, preprocess_string
import numpy as np
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from gensim.matutils import jaccard_distance
import pickle
import pyLDAvis.gensim
from joblib import Parallel, delayed, cpu_count
from gensim.sklearn_api import LdaTransformer
from gensim.models import CoherenceModel, LdaModel
from gensim.corpora.dictionary import Dictionary
from gensim.corpora.mmcorpus import MmCorpus
from gensim.test.utils import datapath
import csv
import time
import statistics
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity


# Fit LDA model and create pyLDAvis prepared object's topicInfo

In [0]:
path = '/content/drive/My Drive/sample_data/'

dictionary = Dictionary.load(path + 'SO_full_processed_Dictionary.dict')
corpus = MmCorpus(datapath(path + 'corpus_processed_SO_full.mm'))

texts = []
with open(path + 'new_SO_full_processed_corpus.csv', 'r') as f:
    reader = csv.reader(f)
    texts = list(reader)

terms = []
for (key, value) in dictionary.iteritems():
  terms.append(value)

In [0]:
k = [17, 14, 21, 28, 19, 11]
lda_models = {}
term_emb = {}
#preparedObj = {}

for topic_num in k:
  print(topic_num)
  model = LdaTransformer(id2word=dictionary, num_topics=topic_num, alpha='auto', eta='auto', iterations=400, random_state=2019)
  lda = model.fit(corpus)
  lda = lda.gensim_model
  term_topic_matrix = lda.get_topics()

  lda_models[topic_num] = lda
  term_emb[topic_num] = pd.DataFrame(term_topic_matrix, columns=terms)

  #t0 = time.time()
  #topic_info = get_topic_info(lda, corpus, dictionary)
  #preparedObj[topic_num] = topic_info
  #print(time.time() - t0, " seconds for an iteration")

17
14
21
28
19
11


In [0]:
import shelve

def load_data(path, num_topics):
  with shelve.open(path) as db:
    my_dict = db[str(num_topics)]
  return my_dict

def store_data(path, k):
  with shelve.open(path) as db:  
    for num_topics in k:
      db[str(num_topics)] = preparedObj[num_topics]

#store_data("/content/drive/My Drive/Expertise Experiments/topicInfo.shlf", k = [17, 14, 21, 28, 19, 11])

In [0]:
k = [17, 14, 21, 28, 19, 11]
preparedObj = {}
for i in k:
  preparedObj[i] = load_data("/content/drive/My Drive/Expertise Experiments/topicInfo.shlf", i)

# Load in human annotations

In [0]:
def intersect(a, b):
    """ return the intersection of two lists """
    return list(set(a) & set(b))

def union(a, b):
    """ return the union of two lists """
    return list(set(a) | set(b))

def load_annotations():
  SO_annotation = pd.read_csv('/content/drive/My Drive/Expertise Experiments/SO_annotations_processed.csv', header = 0,
                        names = ["sample_ID","profile_url","unified_Id","internal_ID","Annotator_1","Annotator_2", "Processed_Annotator_1", "Processed_Annotator_2"])

  GH_annotation = pd.read_csv('/content/drive/My Drive/Expertise Experiments/GH_annotations_processed.csv', header = 0, 
                        names = ["sample_ID","profile_url","unified_Id","internal_ID","Annotator_1","Annotator_2", "Processed_Annotator_1", "Processed_Annotator_2"])
  
  GH_IDs = GH_annotation["internal_ID"]
  SO_IDs = SO_annotation["internal_ID"]

  GH_annotation_intersect = {}
  GH_annotation_union = {}
  SO_annotation_intersect = {}
  SO_annotation_union = {}

  for index, row in SO_annotation.iterrows():
    a1 = row['Processed_Annotator_1'].split(";")
    a2 = row['Processed_Annotator_2'].split(";")

    if '' in a1:
      a1.remove('')
    if '' in a2:
      a2.remove('')

    SO_annotation_intersect[row['internal_ID']] = intersect(a1, a2)
    SO_annotation_union[row['internal_ID']] = union(a1, a2)

  for index, row in GH_annotation.iterrows():
    a1 = row['Processed_Annotator_1'].split(";")
    a2 = row['Processed_Annotator_2'].split(";")

    if '' in a1:
      a1.remove('')
    if '' in a2:
      a2.remove('')

    GH_annotation_intersect[row['internal_ID']] = intersect(a1, a2)
    GH_annotation_union[row['internal_ID']] = union(a1, a2)

  return GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union

In [0]:
GH_IDs, SO_IDs, GH_annotation_intersect, GH_annotation_union, SO_annotation_intersect, SO_annotation_union = load_annotations()

In [0]:
union_length = []
for k, v in SO_annotation_union.items():
  union_length.append(len(v))

intersect_length = []
for k, v in SO_annotation_intersect.items():
  intersect_length.append(len(v))

intersect_np = np.array(intersect_length)
print("intersect_avg = ", np.average(intersect_np))

union_np = np.array(union_length)
print("union_avg = ", np.average(union_np))

intersect_avg =  7.28
union_avg =  32.99


# Create user and topic embeddings using SO_Word2Vec_200

In [0]:
word_vectors = KeyedVectors.load("/content/drive/My Drive/embeddings/SO_pre-trained_vectors.kv", mmap='r')

CUSTOM_FILTERS = [lambda x: strip_numeric, remove_stopwords]

def word2vec_embedding_lookup(words):
  vectors = []
  for w in words:
    try:
      vec = word_vectors[w]
      vectors.append(vec)
    except:
      try:
        w_transformed = w.replace(".", "").replace("=", "").replace("-", "").replace("*", "").replace("'", "").replace("`", "").replace("|", "").replace('\\', "").replace("/", "").replace("$", "").replace("^", "").replace("#", "").replace("&", "").replace("@", "")
        vec = word_vectors[w_transformed]
        vectors.append(vec)
      except:
        try:
          w_stripped = preprocess_string(w_transformed, CUSTOM_FILTERS)
          vec = word_vectors[w_stripped]
          vectors.append(vec)
        except:
          continue
  return np.array(vectors)

In [0]:
def get_user_emb():
  user_embeddings = []
  for i in range(0,len(texts)):
    word_vectors = word2vec_embedding_lookup(list(set(texts[i]).intersection(terms)))
    try:
      feature_vector = np.max(word_vectors, axis=0)
      user_embeddings.append(feature_vector)
    except ValueError:
      user_embeddings.append(np.zeros((200,)))  # 200 x 1 vector of 0's, since the word2vec model is 200 dimensional
  return np.array(user_embeddings)

def get_topic_emb(lda):
  avg_topic_emb = []
  max_topic_emb = []
  number_of_topicWords = 20
  topic_num = lda.num_topics 

  for topic in range(0, topic_num):  # for each topic inside a specific model
    results = lda.show_topic(topic, topn=number_of_topicWords)

    topic_words = []
    for i in range(0,number_of_topicWords): # for each topic word inside a topic
      topic_words.append(results[i][0])

    word_vectors = word2vec_embedding_lookup(topic_words)
    avg_feature_vector = np.average(word_vectors, axis=0)
    max_feature_vector = np.max(word_vectors, axis=0)

    avg_topic_emb.append(avg_feature_vector)
    max_topic_emb.append(max_feature_vector)
  return np.asarray(avg_topic_emb), np.asarray(max_topic_emb)

# Create topic info data frame by calling get_topic_info(lda_model, corpus, dictionary)

In [0]:
from __future__ import absolute_import
import funcy as fp
import numpy as np
from scipy.sparse import issparse

def get_topic_info(topic_model, corpus, dictionary, doc_topic_dist=None):
  opts = fp.merge(pyLDAvis_prepare(topic_model, corpus, dictionary, doc_topic_dist))
  return my_prepare(**opts)
  
def _chunks(l, n):
    """ Yield successive n-sized chunks from l.
    """
    for i in range(0, len(l), n):
        yield l[i:i + n]


def _job_chunks(l, n_jobs):
    n_chunks = n_jobs
    if n_jobs < 0:
        # so, have n chunks if we are using all n cores/cpus
        n_chunks = cpu_count() + 1 - n_jobs

    return _chunks(l, n_chunks)


def _find_relevance(log_ttd, log_lift, R, lambda_):
    relevance = lambda_ * log_ttd + (1 - lambda_) * log_lift
    return relevance.T.apply(lambda s: s.sort_values(ascending=False).index).head(R)


def _find_relevance_chunks(log_ttd, log_lift, R, lambda_seq):
    return pd.concat([_find_relevance(log_ttd, log_lift, R, l) for l in lambda_seq])



def _df_with_names(data, index_name, columns_name):
    if type(data) == pd.DataFrame:
        # we want our index to be numbered
        df = pd.DataFrame(data.values)
    else:
        df = pd.DataFrame(data)
    df.index.name = index_name
    df.columns.name = columns_name
    return df


def _series_with_name(data, name):
    if type(data) == pd.Series:
        data.name = name
        # ensures a numeric index
        return data.reset_index()[name]
    else:
        return pd.Series(data, name=name)


def _topic_info(topic_term_dists, topic_proportion, term_frequency, term_topic_freq,
                vocab, lambda_step, R, n_jobs):
    # marginal distribution over terms (width of blue bars)
    term_proportion = term_frequency / term_frequency.sum()

    # compute the distinctiveness and saliency of the terms:
    # this determines the R terms that are displayed when no topic is selected
    topic_given_term = topic_term_dists / topic_term_dists.sum()
    kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
    distinctiveness = kernel.sum()
    saliency = term_proportion * distinctiveness
    # Order the terms for the "default" view by decreasing saliency:
    default_term_info = pd.DataFrame({
        'saliency': saliency,
        'Term': vocab,
        'Freq': term_frequency,
        'Total': term_frequency,
        'Category': 'Default'})
    default_term_info = default_term_info.sort_values(
        by='saliency', ascending=False).head(R).drop('saliency', 1)
    # Rounding Freq and Total to integer values to match LDAvis code:
    default_term_info['Freq'] = np.floor(default_term_info['Freq'])
    default_term_info['Total'] = np.floor(default_term_info['Total'])
    ranks = np.arange(R, 0, -1)
    default_term_info['logprob'] = default_term_info['loglift'] = ranks

    # compute relevance and top terms for each topic
    log_lift = np.log(topic_term_dists / term_proportion)
    log_ttd = np.log(topic_term_dists)
    lambda_seq = np.arange(0, 1 + lambda_step, lambda_step)

    def topic_top_term_df(tup):
        new_topic_id, (original_topic_id, topic_terms) = tup
        term_ix = topic_terms.unique()
        return pd.DataFrame({'Term': vocab[term_ix],
                             'Freq': term_topic_freq.loc[original_topic_id, term_ix],
                             'Total': term_frequency[term_ix],
                             'logprob': log_ttd.loc[original_topic_id, term_ix].round(4),
                             'loglift': log_lift.loc[original_topic_id, term_ix].round(4),
                             'Category': 'Topic%d' % new_topic_id})

    top_terms = pd.concat(Parallel(n_jobs=n_jobs)
                          (delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls)
                          for ls in _job_chunks(lambda_seq, n_jobs)))
    topic_dfs = map(topic_top_term_df, enumerate(top_terms.T.iterrows(), 1))
    return pd.concat([default_term_info] + list(topic_dfs), sort=True)

def pyLDAvis_prepare(topic_model, corpus, dictionary, doc_topic_dists=None):
    import gensim
    if not gensim.matutils.ismatrix(corpus):
        corpus_csc = gensim.matutils.corpus2csc(corpus, num_terms=len(dictionary))
    else:
        corpus_csc = corpus
        # Need corpus to be a streaming gensim list corpus for len and inference functions below:
        corpus = gensim.matutils.Sparse2Corpus(corpus_csc)

    vocab = list(dictionary.token2id.keys())
    # TODO: add the hyperparam to smooth it out? no beta in online LDA impl.. hmm..
    # for now, I'll just make sure we don't ever get zeros...
    beta = 0.01
    fnames_argsort = np.asarray(list(dictionary.token2id.values()), dtype=np.int_)
    term_freqs = corpus_csc.sum(axis=1).A.ravel()[fnames_argsort]
    term_freqs[term_freqs == 0] = beta
    doc_lengths = corpus_csc.sum(axis=0).A.ravel()

    if hasattr(topic_model, 'lda_alpha'):
        num_topics = len(topic_model.lda_alpha)
    else:
        num_topics = topic_model.num_topics

    if doc_topic_dists is None:
        # If its an HDP model.
        if hasattr(topic_model, 'lda_beta'):
            gamma = topic_model.inference(corpus)
        else:
            gamma, _ = topic_model.inference(corpus)
        doc_topic_dists = gamma / gamma.sum(axis=1)[:, None]
    else:
        if isinstance(doc_topic_dists, list):
            #doc_topic_dists = np.matrix(gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T)
            doc_topic_dists = gensim.matutils.corpus2dense(doc_topic_dists, num_topics).T
        elif issparse(doc_topic_dists):
            doc_topic_dists = doc_topic_dists.T.todense()
        doc_topic_dists = doc_topic_dists / doc_topic_dists.sum(axis=1)

    # get the topic-term distribution straight from gensim without
    # iterating over tuples
    if hasattr(topic_model, 'lda_beta'):
        topic = topic_model.lda_beta
    else:
        topic = topic_model.state.get_lambda()
    topic = topic / topic.sum(axis=1)[:, None]
    topic_term_dists = topic[:, fnames_argsort]

    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
    topic_proportion = (topic_freq / topic_freq.sum())

    term_topic_freq = (topic_term_dists.T * topic_freq).T

    return {'topic_term_dists': topic_term_dists, 'doc_topic_dists': doc_topic_dists, 
            'doc_lengths': doc_lengths, 'vocab': vocab, 'term_frequency': term_freqs}

def my_prepare(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency,
            R=30, lambda_step=0.01, n_jobs=-1, sort_topics=True):

    topic_term_dists = _df_with_names(topic_term_dists, 'topic', 'term')
    doc_topic_dists = _df_with_names(doc_topic_dists, 'doc', 'topic')
    term_frequency = _series_with_name(term_frequency, 'term_frequency')
    doc_lengths = _series_with_name(doc_lengths, 'doc_length')
    vocab = _series_with_name(vocab, 'vocab')

    topic_freq = (doc_topic_dists.T * doc_lengths).T.sum()
    if (sort_topics):
        topic_proportion = (topic_freq / topic_freq.sum()).sort_values(ascending=False)
    else:
        topic_proportion = (topic_freq / topic_freq.sum())

    topic_order = topic_proportion.index
    # reorder all data based on new ordering of topics
    topic_freq = topic_freq[topic_order]
    topic_term_dists = topic_term_dists.iloc[topic_order]
    doc_topic_dists = doc_topic_dists[topic_order]

    # token counts for each term-topic combination (widths of red bars)
    term_topic_freq = (topic_term_dists.T * topic_freq).T
    term_frequency = np.sum(term_topic_freq, axis=0)

    topic_info = _topic_info(topic_term_dists, topic_proportion,
                             term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs)
    return topic_info

# Define techniques

## Word2Vec user and topic Embeddings using Max and Avg pooling

In [0]:
def run_Word2Vec_emb(lda, threshold, maxPool):
  user_vectors = get_user_emb()
  avg_topic_vectors, max_topic_vectors = get_topic_emb(lda)
  
  if maxPool:
    topic_vectors = max_topic_vectors
  else:
    topic_vectors = avg_topic_vectors
  
  cos_sim = cosine_similarity(user_vectors, topic_vectors)
  #np.savetxt("/content/drive/My Drive/Expertise Experiments/SO_Full_word2vec_k=17_Manual_CosSim_user_vectors_topic_vectors.csv", cos_sim, delimiter=",")
  user_topic_mapping = create_user_topic_mapping(cos_sim, threshold)
  return user_topic_mapping

## LDA_topicEmbedding using Max-pooling and Avg-pooling



In [0]:
def embedding_lookup(word, k):
  term_embeddings = term_emb[k]
  return np.array(term_embeddings[word])

def get_LDA_user_emb(topic_num):
  user_embeddings = []
  for i in range(0,len(texts)):
    word_vectors = embedding_lookup(list(set(texts[i]).intersection(terms)), topic_num)
    try:
      feature_vector = np.max(word_vectors, axis=1)
      user_embeddings.append(feature_vector)
    except ValueError:
      user_embeddings.append(np.zeros((topic_num,)))
  return np.array(user_embeddings)

def get_LDA_topic_emb(topic_num):
  avg_topic_emb = []
  max_topic_emb = []
  number_of_topicWords = 20

  for topic in range(0, topic_num):  # for each topic inside a specific model
    lda = lda_models[topic_num]
    results = lda.show_topic(topic, topn=number_of_topicWords)

    topic_words = []
    for i in range(0,number_of_topicWords):
      topic_words.append(results[i][0])

    word_vectors = embedding_lookup(topic_words, topic_num)
    avg_feature_vector = np.average(word_vectors, axis=1)
    max_feature_vector = np.max(word_vectors, axis=1)

    avg_topic_emb.append(avg_feature_vector)
    max_topic_emb.append(max_feature_vector)

  avg_topic_vectors = np.array(avg_topic_emb)
  max_topic_vectors = np.array(max_topic_emb)
  return avg_topic_vectors, max_topic_vectors

In [0]:
def manual_cos_sim(a, b):
	"""Takes 2 vectors a, b and returns the cosine similarity according 
	to the definition of the dot product
	"""
	dot_product = np.dot(a, b)
	norm_a = np.linalg.norm(a)
	norm_b = np.linalg.norm(b)
	return dot_product / (norm_a * norm_b)

In [0]:
def jaccard_sim(A, B):
  return 1 - jaccard_distance(set(A), set(B))     # jaccard sim = 1 - jaccard distance

In [0]:
def run_LDA_emb(topic_num, threshold, maxPool = True):
  user_vectors = get_LDA_user_emb(topic_num)
  avg_topic_vectors, max_topic_vectors = get_LDA_topic_emb(topic_num)

  if maxPool:
    topic_vectors = max_topic_vectors
  else:
    topic_vectors = avg_topic_vectors

  cos_sim = cosine_similarity(user_vectors, topic_vectors)
  np.savetxt("/content/drive/My Drive/Expertise Experiments/LDA_k=17_cos_sim_user_vectors_topic_vectors.csv", cos_sim, delimiter=",")
  user_topic_mapping = create_user_topic_mapping(cos_sim, threshold)
  return user_topic_mapping

In [0]:
def create_user_topic_mapping(cos_sims, threshold):
  user_topic_mapping = {}
  for user_i in range(0, 83550):   # counting for users 0 --> 83549
    user_topic_mapping[user_i] = ['Topic' + str(index+1) for index, value in enumerate(cos_sims[user_i]) if value > threshold]
  return user_topic_mapping

In [0]:
def get_user_expertise(topicInfo, user_i, topic_terms):
  optimal_lambda_val = 1.0
  expertise = get_relevant_terms(topicInfo, user_i, optimal_lambda_val, topic_terms)
  return expertise

In [0]:
def get_relevant_terms(topic_info, topics, _lambda, term_num):
  """Retuns a list of top-n keywords (where n = term_num) that have the highest relevance score for the topics the the user is in."""

  tdf = pd.DataFrame(topic_info[topic_info.Category.isin(topics)])
  stdf = tdf.assign(relevance=_lambda * tdf['logprob'] + (1 - _lambda) * tdf['loglift'])
  new_df = stdf.sort_values('relevance', ascending=False)

  term_list = new_df['Term'].tolist()
  if '-PRON-' in term_list:
    term_list.remove('-PRON-')
  if ' ' in term_list:
    term_list.remove(' ')
  return term_list[0:term_num]

## LDA Topic Distribution based Expertise

In [0]:
def LDA_topicDistr(lda_model, topicInfo, user_i, threshold, topic_terms):
  user_topic_membership = create_user_topic_thresholding(lda_model, user_i, threshold)
  user_expertise = get_expertise_for_user_i(topicInfo, user_topic_membership, topic_terms)
  return user_expertise

In [0]:
def create_user_topic_thresholding(lda_model, user_i, threshold):
  user_i_topic_distr = lda_model.get_document_topics(bow = corpus[user_i], minimum_probability = threshold)

  topic_memberships = []
  for topic in user_i_topic_distr:
    topic_memberships.append('Topic' + str(topic[0]+1))   # topics are 0 to k-1, so offset by 1, since pyLDAvis indexes from 1 to k

  return topic_memberships

In [0]:
def get_expertise_for_user_i(topicInfo, user_topic_membership, topic_terms):
  optimal_lambda_val = 1.0
  expertise = get_relevant_terms(topicInfo, user_topic_membership, optimal_lambda_val, topic_terms)
  return expertise

# Create Evaluation functions

In [0]:
def getExistingWordsFromModel(words):
  """ Checks if a list of words are in the dictionary of the word2vec model """
  CUSTOM_FILTERS = [lambda x: strip_numeric, remove_stopwords]
  res = []
  for w in words:
    try:
      vec = word_vectors[w]
      res.append(w)
    except:
      try:
        w_transformed = w.replace(".", "").replace("=", "").replace("-", "").replace("*", "").replace("'", "").replace("`", "").replace("|", "").replace('\\', "").replace("/", "").replace("$", "").replace("^", "").replace("&", "").replace("@", "").replace("%", "")
        vec = word_vectors[w_transformed]
        res.append(w_transformed)
      except:
         try:
          w_stripped = preprocess_string(w_transformed, CUSTOM_FILTERS)
          vec = word_vectors[w_stripped]
          res.append(w_stripped)
         except:
           continue
  return res

In [0]:
def evaluate_LDA_topicDistr(lda_model, topicInfo, threshold_t):
  BLEU_scores = []
  jacc_similarity = []
  cos_similarity = []

  for user_i in SO_IDs:
    # or if you want intersect, use SO_annotation_intersect
    annotation = SO_annotation_union[user_i]

    model_hypothesis = LDA_topicDistr(lda_model, topicInfo, user_i, threshold_t, topic_terms = len(annotation))

    # 1-gram individual BLEU with smoothing function
    smooth = SmoothingFunction()
    BLEU_score = sentence_bleu(references = [annotation], hypothesis = model_hypothesis, 
                               weights = (1, 0, 0, 0), smoothing_function = smooth.method1)
    BLEU_scores.append(BLEU_score)

    # calculate Jaccard similarity between annotation and model hypothesis 
    jaccard_simm = jaccard_sim(annotation, model_hypothesis)
    jacc_similarity.append(jaccard_simm)

    # Compute cosine similarity between annotation and model hypothesis 
    a = getExistingWordsFromModel(annotation)
    b = getExistingWordsFromModel(model_hypothesis)

    if len(a) > 0 and len(b) > 0:
      cos_sim = word_vectors.n_similarity(a, b)
      cos_similarity.append(cos_sim)


    # could use chrf_precision_recall_fscore_support(annotation_1, model_hypothesis, n=1, beta=3.0) # adjust beta
    # to get the recall, precision, fscore

  return BLEU_scores, jacc_similarity, cos_similarity

In [0]:
def evaluate_LDA_topicEmb(topic_num, topicInfo, threshold, maxPool):
  user_topic_mapping = run_LDA_emb(topic_num, threshold, maxPool)

  BLEU_scores = []
  jacc_similarity = []
  cos_similarity = []

  for user_i in SO_IDs:
    # or if you want intersect, use SO_annotation_intersect
    annotation = SO_annotation_union[user_i]

    model_hypothesis = get_user_expertise(topicInfo, user_topic_mapping[user_i], topic_terms = len(annotation))

    # 1-gram individual BLEU with smoothing function
    smooth = SmoothingFunction()
    BLEU_score = sentence_bleu(references = [annotation], hypothesis = model_hypothesis, 
                               weights = (1, 0, 0, 0), smoothing_function = smooth.method1)
    BLEU_scores.append(BLEU_score)

    # calculate Jaccard similarity between annotation and model hypothesis 
    jaccard_simm = jaccard_sim(annotation, model_hypothesis)
    jacc_similarity.append(jaccard_simm)

    # Compute cosine similarity between annotation and model hypothesis 
    a = getExistingWordsFromModel(annotation)
    b = getExistingWordsFromModel(model_hypothesis)
    
    if len(a) > 0 and len(b) > 0:
      cos_sim = word_vectors.n_similarity(a, b)
      cos_similarity.append(cos_sim)

    # could use chrf_precision_recall_fscore_support(annotation_1, model_hypothesis, n=1, beta=3.0) # adjust beta
    # to get the recall, precision, fscore

  return BLEU_scores, jacc_similarity, cos_similarity

In [0]:
def evaluate_Word2Vec_Emb(lda, topicInfo, threshold, maxPool):
  user_topic_mapping = run_Word2Vec_emb(lda, threshold, maxPool)

  BLEU_scores = []
  jacc_similarity = []
  cos_similarity = []

  for user_i in SO_IDs:
    # or if you want intersect, use SO_annotation_intersect
    annotation = SO_annotation_union[user_i]
    model_hypothesis = get_user_expertise(topicInfo, user_topic_mapping[user_i], topic_terms = len(annotation))

    # 1-gram individual BLEU with smoothing function
    smooth = SmoothingFunction()
    BLEU_score = sentence_bleu(references = [annotation], hypothesis = model_hypothesis, 
                               weights = (1, 0, 0, 0), smoothing_function = smooth.method1)
    BLEU_scores.append(BLEU_score)

        # calculate Jaccard similarity between annotation and model hypothesis 
    jaccard_simm = jaccard_sim(annotation, model_hypothesis)
    jacc_similarity.append(jaccard_simm)

    # Compute cosine similarity between annotation and model hypothesis 
    a = getExistingWordsFromModel(annotation)
    b = getExistingWordsFromModel(model_hypothesis)
    
    if len(a) > 0 and len(b) > 0:
      cos_sim = word_vectors.n_similarity(a, b)
      cos_similarity.append(cos_sim)

    # could use chrf_precision_recall_fscore_support(annotation_1, model_hypothesis, n=1, beta=3.0) # adjust beta
    # to get the recall, precision, fscore

  return BLEU_scores, jacc_similarity, cos_similarity

# Main

## Run Experiments for LDA_topic distribution based expertise on SO data

In [0]:
threshold = [0.10, 0.12, 0.14, 0.16, 0.18, 0.20,
             0.22, 0.24, 0.26, 0.28, 0.30,
             0.32, 0.34, 0.36, 0.38, 0.40,
             0.42, 0.44, 0.46, 0.48, 0.50]
k = [17, 14, 21, 28, 19, 11]
results = []

for topic_num in k:
  for t in threshold:
    BLEU_scores, jacc_sim, cos_sim = evaluate_LDA_topicDistr(lda_models[topic_num], preparedObj[topic_num], t)
    bleu_np = np.asarray(BLEU_scores)
    jacc_np = np.asarray(jacc_sim)
    cos_np = np.asarray(cos_sim)
    results.append( [topic_num, t, np.mean(bleu_np), np.var(bleu_np), np.mean(jacc_np), np.var(jacc_np), np.mean(cos_np), np.var(cos_np)])

In [0]:
for r in results:
  print(r)

## Run Experiments for LDA_topicEmbedding using Avg-pooling on SO data, using SO_full model

In [0]:
results = []
k = [17, 14, 21, 28, 19, 11]
threshold_values = [0.40, 0.42, 0.44, 0.46, 0.48, 0.50, 0.52, 0.54, 0.56, 
                    0.58, 0.60, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 
                    0.76, 0.78, 0.80, 0.82, 0.84, 0.86, 0.88, 0.90]

for topic_num in k:
  print("---- Progress: topic", str(topic_num))
  for threshold in threshold_values:
    BLEU_scores, jacc_sim, cos_sim = evaluate_LDA_topicEmb(topic_num, preparedObj[topic_num], threshold, maxPool=False)
    bleu_np = np.asarray(BLEU_scores)
    jacc_np = np.asarray(jacc_sim)
    cos_np = np.asarray(cos_sim)
    results.append( [topic_num, threshold, np.mean(bleu_np), np.var(bleu_np), np.mean(jacc_np), np.var(jacc_np), np.mean(cos_np), np.var(cos_np)])

---- Progress: topic 17
---- Progress: topic 14
---- Progress: topic 21
---- Progress: topic 28
---- Progress: topic 19
---- Progress: topic 11


In [0]:
for r in results:
  print(r)

[17, 0.4, 0.09088861153616179, 0.0015346656608894466, 0.051251540438158064, 0.0005332329417627794, 0.4027974, 0.011205442]
[17, 0.42, 0.09301466685076074, 0.0014867400986326932, 0.05255089171382931, 0.0005208975942716019, 0.40186673, 0.011729114]
[17, 0.44, 0.09454045803247363, 0.0014728256422036616, 0.0535988053798621, 0.0005225967960449174, 0.39836612, 0.011825573]
[17, 0.46, 0.09342023035410219, 0.001504514025397699, 0.05309307467205324, 0.0005386200048949759, 0.39307514, 0.012150557]
[17, 0.48, 0.09343243971772303, 0.0014903423193287299, 0.05315971303795441, 0.0005312675096649901, 0.39086333, 0.011399973]
[17, 0.5, 0.09479941069477467, 0.0014786211307749495, 0.054221744836289026, 0.0005285133316857612, 0.39120826, 0.011446116]
[17, 0.52, 0.09282937868430148, 0.0016121390229093605, 0.053347346045712694, 0.0005904937720700099, 0.3856197, 0.012245741]
[17, 0.54, 0.09462066102735532, 0.00176858657231998, 0.05451701407345581, 0.0006498533380492621, 0.38543364, 0.0126084685]
[17, 0.56, 0

## Run Experiments for LDA_topicEmbedding using Max-pooling on SO data

In [0]:
results = []
k = [17, 14, 21, 28, 19, 11]

#threshold_values = [0.40, 0.42, 0.44, 0.46, 0.48, 0.50, 0.52, 0.54, 0.56, 
#                    0.58, 0.60, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 
#                    0.76, 0.78, 0.80, 0.82, 0.84, 0.86, 0.88, 0.90]

threshold_values = [0.40, 0.45, 0.50, 0.55, 0.60, 0.65, 0.70, 0.75, 0.80, 0.85, 0.90, 0.95] 

for topic_num in k:
  print("---- Progress: topic", topic_num)
  for threshold in threshold_values:
    BLEU_scores, jacc_sim, cos_sim = evaluate_LDA_topicEmb(topic_num, preparedObj[topic_num], threshold, maxPool=True)
    bleu_np = np.asarray(BLEU_scores)
    jacc_np = np.asarray(jacc_sim)
    cos_np = np.asarray(cos_sim)
    results.append( [topic_num, threshold, np.mean(bleu_np), np.var(bleu_np), np.mean(jacc_np), np.var(jacc_np), np.mean(cos_np), np.var(cos_np)])

---- Progress: topic 17


In [0]:
for r in results:
  print(r)

## Run Experiments for Word2vec user and topic Embedding using Avg-pooling on SO data

In [0]:
results = []
k = [17, 14, 21, 28, 19, 11]

#threshold_values = [0.40, 0.42, 0.44, 0.46, 0.48, 0.50, 0.52, 0.54, 0.56, 
#                    0.58, 0.60, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 
#                    0.76, 0.78, 0.80, 0.82, 0.84, 0.86, 0.88, 0.90]

threshold_values = [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.10,
                    0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.20]

for topic_num in k:
  print("---- Progress: topic", topic_num)
  for threshold in threshold_values:
    BLEU_scores, jacc_sim, cos_sim = evaluate_Word2Vec_Emb(lda=lda_models[topic_num], topicInfo=preparedObj[topic_num], threshold=threshold, maxPool=False)
    bleu_np = np.asarray(BLEU_scores)
    jacc_np = np.asarray(jacc_sim)
    cos_np = np.asarray(cos_sim)
    results.append( [topic_num, threshold, np.mean(bleu_np), np.var(bleu_np), np.mean(jacc_np), np.var(jacc_np), np.mean(cos_np), np.var(cos_np)])

---- Progress: topic 17
---- Progress: topic 14
---- Progress: topic 21
---- Progress: topic 28
---- Progress: topic 19
---- Progress: topic 11


In [0]:
for r in results:
  print(r)

[17, 0.01, 0.09481754114690341, 0.0015336889914866153, 0.05213649223255277, 0.0004947999528714702, 0.39977062, 0.012481103]
[17, 0.02, 0.095590855230706, 0.001440750071169982, 0.052584021696471674, 0.00046258290505339536, 0.38827136, 0.014516412]
[17, 0.03, 0.09399736185546224, 0.0015324581111691527, 0.05189462427002564, 0.0005052122076190652, 0.37027332, 0.016099859]
[17, 0.04, 0.09669811898915177, 0.0015370334983348815, 0.05379026682763755, 0.000512753521233058, 0.34738728, 0.019900478]
[17, 0.05, 0.09755181586546863, 0.0017702319225664487, 0.054793194332872275, 0.0006048827506757597, 0.3260689, 0.021290455]
[17, 0.06, 0.0923251862336234, 0.001955912914357653, 0.05173724354184764, 0.0006712660602565782, 0.30302534, 0.019150693]
[17, 0.07, 0.08672877471100705, 0.002296518338963376, 0.04833416308966289, 0.0007777768010513571, 0.27738392, 0.021016503]
[17, 0.08, 0.07889932782925416, 0.0026444974116101526, 0.043639082230519446, 0.0008568526477730986, 0.25573194, 0.019515635]
[17, 0.09, 0

## Run Experiments for Word2vec user and topic Embedding using Max-pooling on SO data

In [0]:
results = []
k = [17, 14, 21, 28, 19, 11]

#threshold_values = [0.40, 0.42, 0.44, 0.46, 0.48, 0.50, 0.52, 0.54, 0.56, 
#                    0.58, 0.60, 0.62, 0.64, 0.66, 0.68, 0.70, 0.72, 0.74, 
#                    0.76, 0.78, 0.80, 0.82, 0.84, 0.86, 0.88, 0.90]

threshold_values = [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.30,
                    0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.40,
                    0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48, 0.49, 0.50]
for topic_num in k:
  print("---- Progress: topic", topic_num)
  for threshold in threshold_values:
    BLEU_scores, jacc_sim, cos_sim = evaluate_Word2Vec_Emb(lda_models[topic_num], topicInfo=preparedObj[topic_num], threshold = threshold, maxPool=True)
    bleu_np = np.asarray(BLEU_scores)
    jacc_np = np.asarray(jacc_sim)
    cos_np = np.asarray(cos_sim)
    results.append( [topic_num, threshold, np.mean(bleu_np), np.var(bleu_np), np.mean(jacc_np), np.var(jacc_np), np.mean(cos_np), np.var(cos_np)])

---- Progress: topic 17
---- Progress: topic 14
---- Progress: topic 21
---- Progress: topic 28
---- Progress: topic 19
---- Progress: topic 11


In [0]:
for r in results:
  print(r)

[17, 0.21, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.22, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.23, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.24, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.25, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.26, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.27, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.28, 0.07345874841888501, 0.0012703138504166697, 0.03854347049797217, 0.0003686561272977755, 0.44905323, 0.007173287]
[17, 0.2