In [1]:
#-------------------------------------IMPORTS-------------------------------------#
import torch
import gzip
import numpy as np
from random import randint
import torch.utils.data as data_utils
import torch
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from torch.autograd import Variable
from sklearn.metrics.pairwise import cosine_similarity
from numpy import linalg as LA

In [2]:
#-------------------------------------HELPER FUNCTIONS-------------------------------------#
# Number of negative questions kept per sample: it is the sample size requested from
# N_random_values_in_list when building ubuntu samples, and the number of negative
# outputs iterated over in sample_and_label_to_cosine_sims.
NUM_NEGATIVE_SAMPLES=20

def N_random_values_in_list(full_list, N):
    """Return up to ``N`` elements of ``full_list`` drawn at random without replacement.

    Sampling is done over positions, so a value that occurs several times in
    ``full_list`` may appear several times in the result, but no position is
    used twice.

    Args:
        full_list: sequence to draw from.
        N: number of elements wanted.

    Returns:
        A new list with ``min(N, len(full_list))`` elements (empty when that
        count is zero or negative), in random order.
    """
    # Local import: the notebook's import cell only brings in `randint`.
    from random import sample

    # Clamp so that asking for more items than exist (or for a negative count)
    # yields a shorter/empty list instead of an error.
    count = max(0, min(N, len(full_list)))
    # random.sample draws distinct positions in a single call, replacing the
    # retry-until-unused loop whose membership test on a list was quadratic.
    chosen_positions = sample(range(len(full_list)), count)
    return [full_list[i] for i in chosen_positions]

def convert_to_list(filename, is_android = False):
    """Read a tab-separated file (optionally gzip-compressed) into nested lists.

    Files whose name ends in 'gz' are opened with gzip, everything else with
    the builtin ``open``.

    Args:
        filename: path of the file to read.
        is_android: when True, only the first tab-separated field of each line
            is kept and it is split on single spaces; when False, each line
            must have at least three tab-separated fields.

    Returns:
        is_android=True:  one list of space-separated tokens per line.
        is_android=False: one ``[field0, field1.split(' '), field2.split(' ')]``
        entry per line.
    """
    opener = gzip.open if filename.endswith('gz') else open
    with opener(filename, 'r') as handle:
        raw_lines = handle.readlines()

    rows = []
    for raw_line in raw_lines:
        fields = raw_line.replace('\n', '').split('\t')
        if is_android:
            rows.append(fields[0].split(' '))
        else:
            rows.append([fields[0], fields[1].split(' '), fields[2].split(' ')])
    return rows

#Sample:question_id, similar_question_id, negative_question_id
def convert_to_samples_ubuntu(filename):
    """Expand each line of an ubuntu training file into one sample per similar question.

    Each parsed line is ``[question_id, [similar ids], [negative ids]]``. For
    every similar id a fresh random subset of at most NUM_NEGATIVE_SAMPLES
    negative ids is drawn.

    Returns:
        List of ``[question_id, similar_id, [negative ids]]`` samples.
    """
    samples = []
    for question_id, similar_ids, negative_ids in convert_to_list(filename):
        for similar_id in similar_ids:
            # TODO (from the original author): include all negative examples later.
            sampled_negatives = N_random_values_in_list(negative_ids, NUM_NEGATIVE_SAMPLES)
            samples.append([question_id, similar_id, sampled_negatives])
    return samples

def convert_to_samples_android(pos_filename, neg_filename):
    """Build ``[target_id, positive_id, [negative ids]]`` samples for the android data.

    Both files are parsed with ``convert_to_list(..., is_android=True)`` and
    each line is expected to hold exactly two ids: ``target other``.

    Args:
        pos_filename: file of (target, positive) id pairs.
        neg_filename: file of (target, negative) id pairs.

    Returns:
        One sample per positive pair. Samples that share a target id share the
        same list object of negative ids.

    Raises:
        KeyError: a target id from the positive file has no negative pairs.
        ValueError: a line does not contain exactly two ids.
    """
    positive_pairs = convert_to_list(pos_filename, is_android = True)
    negative_pairs = convert_to_list(neg_filename, is_android = True)

    # Group negative ids by the target question they belong to.
    negatives_by_target = {}
    for target_id, negative_id in negative_pairs:
        negatives_by_target.setdefault(target_id, []).append(negative_id)

    return [[target_id, positive_id, negatives_by_target[target_id]]
            for target_id, positive_id in positive_pairs]
def make_lookup_table(filename):
    """Map each id in a tokenized corpus file to its title and question tokens.

    The file is parsed with ``convert_to_list`` in its default (non-android)
    mode; if an id occurs more than once, the last occurrence wins.

    Returns:
        dict of ``id -> {'title': [tokens], 'question': [tokens]}``.
    """
    return {entry[0]: {'title': entry[1], 'question': entry[2]}
            for entry in convert_to_list(filename)}
        
#takes  sample_ids of [[q1,p1,n1],[q2,p2,n2]....]
#outputs titles like [[q1_title, p1_title, n1_title],[q2_title,p2_title,n2_title]...]
def convert_sampleids_to_titles(sample_ids,lookup):
    """Replace the ids of each sample by the lower-cased title tokens of that question.

    Args:
        sample_ids: list of samples shaped ``[question_id, pos_id, [neg_ids]]``.
        lookup: dict of ``str(id) -> {'title': [tokens], ...}`` as built by
            ``make_lookup_table``.

    Returns:
        One entry per sample whose ids were all found:
        ``[question_title, pos_title, neg_title_1, neg_title_2, ...]`` where
        each title is a list of lower-cased tokens. Samples with an id missing
        from ``lookup`` are skipped after printing "Lookup failed".
    """
    titles = []
    for sample_id in sample_ids:
        # Flatten: [question_id, pos_id, [neg_ids]] --> [question_id, pos_id, neg_id1, neg_id2, ...]
        flat_ids = sample_id[:2] + sample_id[2][:]
        try:
            titles.append([[word.lower() for word in lookup[str(question_id)]['title']]
                           for question_id in flat_ids])
        except KeyError:
            # Only a missing id is an expected, skippable condition. Anything else
            # (malformed sample, malformed lookup entry, Ctrl-C) must surface
            # instead of being reported as a failed lookup.
            print("Lookup failed")
    return titles
def remove_non_ascii(text):
    """Return ``text`` with every character whose code point is 128 or above dropped."""
    ascii_chars = [char for char in text if ord(char) < 128]
    return ''.join(ascii_chars)

def extract_features(word):
    """Return the embedding stored for ``word`` in the global ``word_to_vec``.

    The word is first stripped of non-ASCII characters and UTF-8 encoded. If
    that normalisation fails, the word is printed and the lookup proceeds with
    whatever value ``word`` holds at that point.

    Returns:
        The value stored in ``word_to_vec`` for the normalised word, or a list
        of 200 zeros when the word is unknown.
        # NOTE(review): 200 presumably matches the embedding width of
        # vectors_pruned.200 and NUM_FEATURES_PER_WORD -- confirm.
        # NOTE(review): on Python 3 ``encode`` returns bytes, which would only
        # match if the keys of word_to_vec are bytes as well -- confirm before porting.
    """
    try:
        word=remove_non_ascii(word)
        word=word.encode('utf-8')
    except Exception:
        # Best effort: keep going on a bad token, but do not swallow
        # KeyboardInterrupt/SystemExit the way a bare ``except:`` would.
        print(word)
    return word_to_vec.get(word,[0.0 for i in range(200)])

def find_maximum_title_and_body_length(lookup_table):
    """Return the longest title length and the longest question length in the table.

    Args:
        lookup_table: dict whose values are dicts with 'title' and 'question'
            entries (anything supporting ``len``).

    Returns:
        ``(max_title_length, max_question_length)``; ``(-1, -1)`` for an empty table.
    """
    max_len_title = -1
    max_len_question = -1
    # .values() works on both Python 2 and 3 (iteritems is Python-2 only); the
    # keys are not needed because the id of the longest question was never returned.
    for entry in lookup_table.values():
        max_len_title = max(max_len_title, len(entry['title']))
        max_len_question = max(max_len_question, len(entry['question']))
    return max_len_title, max_len_question

def title_to_feature_matrix(title_word_list):
    """Turn a list of title words into a fixed-width feature matrix.

    At most PARAMETER_MAX_TITLE_LENGTH words are embedded with
    ``extract_features``; shorter titles are padded with all-zero rows of
    width NUM_FEATURES_PER_WORD so every title yields the same number of rows
    before the transpose.

    Returns:
        ``np.array`` of the per-word rows, transposed (one column per word
        position). # assumes extract_features returns NUM_FEATURES_PER_WORD
        values per word, otherwise the rows are ragged -- TODO confirm
    """
    rows = []
    for position, word in enumerate(title_word_list):
        if position == PARAMETER_MAX_TITLE_LENGTH:
            break
        rows.append(extract_features(word))

    # Pad with zero rows; a non-positive count (title at or over the limit) adds nothing.
    zero_row = [0] * NUM_FEATURES_PER_WORD
    missing_rows = PARAMETER_MAX_TITLE_LENGTH - len(title_word_list)
    rows += [zero_row] * missing_rows
    return np.array(rows).T

#array is structured like a batch of features 50x200x38
def find_start_of_padding_for_batch(batch):
    """Return, for every element of ``batch``, its unpadded length.

    The length is the index reported by ``find_start_of_padding_single_vec``
    plus one, so it is never smaller than 1 (an all-zero element counts as
    length 1).
    """
    return [find_start_of_padding_single_vec(batch[position]) + 1
            for position in range(0, len(batch))]

#batch = 200x38
def find_start_of_padding_single_vec(single_vec):
    """Return the index of the last non-zero entry in the first row of ``single_vec``.

    Only ``single_vec[0]`` is inspected, scanning from the end towards the
    start. Returns 0 when that row is empty or entirely zero.
    """
    first_row = single_vec[0]
    idx = len(first_row) - 1
    while idx >= 0:
        if first_row[idx] != 0.:
            return idx
        idx -= 1
    # The whole row is zeros (or empty).
    return 0
def create_mask(word_length):
    """Build an averaging mask for a sequence of ``word_length`` real positions.

    The first ``word_length`` positions carry the weight ``1 / word_length``
    and the remaining positions up to MAX_TITLE_LENGTH carry 0, so multiplying
    by the mask and summing over positions gives a mean over the real words.

    Returns:
        ``np.array`` with CNN_HIDDEN_DIM rows and one column per position.

    Raises:
        ZeroDivisionError: if ``word_length`` is 0.
    """
    weight_row = [1. / word_length] * CNN_HIDDEN_DIM
    zero_row = [0] * CNN_HIDDEN_DIM
    position_rows = [weight_row] * word_length + [zero_row] * (MAX_TITLE_LENGTH - word_length)
    return np.array(position_rows).T
#-------------------------------------CREATE DATA BATCHER-------------------------------------#
# where samples[0] = 1 (target) + 1 (positive) + n (negative) 
def create_data_loader(ubuntu_samples, android_samples, shuffle_data = True):
    """Build a DataLoader over the ubuntu and android samples combined.

    Each sample is ``[target_title, positive_title, negative_title, ...]``
    (titles are lists of words). Only the first NUM_NEGATIVE_SAMPLES negatives
    are used, because the android data carries more negatives than needed.

    Every dataset item stacks, in this order: a constant matrix holding the
    dataset id (0 = ubuntu, 1 = android), the target features, the positive
    features and the negative features. The accompanying targets are all 0.

    Args:
        ubuntu_samples: samples from the ubuntu dataset.
        android_samples: samples from the android dataset.
        shuffle_data: forwarded to ``DataLoader(shuffle=...)``.

    Returns:
        ``torch.utils.data.DataLoader`` with batch size BATCH_SIZE.
        # assumes every sample has at least NUM_NEGATIVE_SAMPLES negatives and
        # that the feature matrices are NUM_FEATURES_PER_WORD x MAX_TITLE_LENGTH,
        # otherwise the stacked array is ragged -- TODO confirm
    """
    features = []
    # dataset_id 0: ubuntu dataset, 1: android dataset
    for dataset_id, samples in enumerate([ubuntu_samples, android_samples]):
        # Constant matrix marking the source dataset. It is identical for every
        # sample of a dataset, so it is built once per dataset.
        dataset_marker = [[dataset_id] * MAX_TITLE_LENGTH] * NUM_FEATURES_PER_WORD
        for sample in samples:
            target_title = sample[0]
            positive_title = sample[1]
            # Was a hard-coded 2:22; tied to NUM_NEGATIVE_SAMPLES so it stays
            # consistent with the sampling and similarity code.
            negative_titles = sample[2:2 + NUM_NEGATIVE_SAMPLES]

            target_features = title_to_feature_matrix(target_title)
            positive_features = title_to_feature_matrix(positive_title)
            n_negative_features = [title_to_feature_matrix(negative_title) for negative_title in negative_titles]

            all_features = [dataset_marker, target_features, positive_features] + n_negative_features
            features.append(all_features)

    targets = torch.LongTensor(len(features)).zero_()
    dataset = data_utils.TensorDataset(torch.FloatTensor(np.array(features)), targets)
    data_loader = data_utils.DataLoader(dataset, batch_size = BATCH_SIZE, shuffle = shuffle_data)
    return data_loader

def create_data_loader_test(android_samples, shuffle_data = True):
    """Convert android samples into nested feature matrices for evaluation.

    Despite the name, no DataLoader is created: the result is a plain list.

    Args:
        android_samples: list of ``[target_title, positive_title,
            negative_title, ...]`` where every title is a list of words.
        shuffle_data: unused; kept so existing callers keep working.

    Returns:
        One entry per sample: ``[target_features, positive_features,
        negative_features_1, ...]`` using at most NUM_NEGATIVE_SAMPLES negatives.
    """
    features = []

    for sample in android_samples:
        target_title = sample[0]
        positive_title = sample[1]
        # Was a hard-coded 2:22 (the android data has more negatives than needed);
        # tied to NUM_NEGATIVE_SAMPLES so it cannot drift from the other helpers.
        negative_titles = sample[2:2 + NUM_NEGATIVE_SAMPLES]

        target_features = title_to_feature_matrix(target_title)
        positive_features = title_to_feature_matrix(positive_title)
        n_negative_features = [title_to_feature_matrix(negative_title) for negative_title in negative_titles]

        features.append([target_features, positive_features] + n_negative_features)

    return features

def sample_and_label_to_cosine_sims(sample, label, model = None):
    """Encode a batch of (target, positive, negatives) and score each candidate.

    Every question is run through the encoder, the outputs are mean-pooled
    over the real (unpadded) word positions with masks from ``create_mask``,
    and the pooled target is compared with each pooled candidate using the
    global ``cos`` callable.

    Args:
        sample: 4-D batch whose second axis enumerates target, positive and
            then the negatives; it is permuted so that axis comes first.
            # assumes the dataset-id plane added by create_data_loader has been
            # stripped by the caller -- TODO confirm
        label: unused.
        model: encoder to use; when None the global ``net`` is used.

    Returns:
        ``(cosine_similarities, net_outputs_batch)``: the similarities stacked
        with the positive candidate first followed by NUM_NEGATIVE_SAMPLES
        negatives, and the pooled encoder outputs stacked as target, positive,
        negatives.
    """
    # Choose the encoder WITHOUT assigning to the name `net`: the previous
    # `net = model` made `net` a local variable of this function, so calling
    # with model=None raised UnboundLocalError instead of using the global net.
    encoder = model if model is not None else net

    #RE-ORDER DIMENSIONS OF THE SAMPLE
    sample = sample.permute(1, 0, 2, 3)

    target_question_features = sample[0] # 50 x 200 x 38
    positive_question_features = sample[1] # 50 x 200 x 38
    N_negative_question_features = sample[2:] #20 x 50 x 200 x 38

    #Determine lengths to know how many vectors to take the average across.
    target_question_lengths = find_start_of_padding_for_batch(target_question_features.data)
    positive_question_lengths = find_start_of_padding_for_batch(positive_question_features.data)
    N_negative_questions_lengths = [find_start_of_padding_for_batch(negative.data) for negative in N_negative_question_features]

    #RUN THROUGH NET
    target_question_net_output = encoder(target_question_features)
    positive_question_net_output = encoder(positive_question_features)
    N_negative_question_net_outputs = [encoder(negative) for negative in N_negative_question_features]

    #CREATE MASKS
    target_questions_masks = [create_mask(_) for _ in target_question_lengths] #DIM = 50 x 100 x 38
    positive_questions_masks = [create_mask(_) for _ in positive_question_lengths] #DIM = 50 x 100 x 38
    # Outer list is per negative, inner list per batch element.
    N_negative_questions_masks = [[create_mask(length) for length in length_list] #DIM = 20 x 50 x 100 x 38
                                  for length_list in N_negative_questions_lengths]

    #APPLY MASKS
    #Should the multiplicands, the masks, be Float Tensors or Variables? May have to be float tensors to ensure
    #    pytorch's directed graph back-prop is maintained.

    target_question_net_output_masked = target_question_net_output * Variable(torch.FloatTensor(target_questions_masks))
    positive_question_net_output_masked = positive_question_net_output * Variable(torch.FloatTensor(positive_questions_masks))
    N_negative_questions_net_output_masked = [N_negative_question_net_outputs[idx] *
                                              Variable(torch.FloatTensor(N_negative_questions_masks[idx]))
                                              for idx in range(NUM_NEGATIVE_SAMPLES)]
    #SUM OVER WORDS (the masks already carry the 1/length weights, so the sum is a mean)
    target_question_net_output_masked_summed = torch.sum(target_question_net_output_masked, dim = 2) #DIM = 50 x 100
    positive_question_net_output_masked_summed = torch.sum(positive_question_net_output_masked, dim = 2) #DIM = 50 x 100
    N_negative_questions_net_output_masked_summed = [torch.sum(
                                                    N_negative_questions_net_output_masked[idx], dim = 2
                                                    )for idx in range(NUM_NEGATIVE_SAMPLES)] #DIM = 20 x 50 x 100

    net_outputs_batch = torch.stack([target_question_net_output_masked_summed, positive_question_net_output_masked_summed] + N_negative_questions_net_output_masked_summed)

    cosine_similarity_pos = cos(target_question_net_output_masked_summed, positive_question_net_output_masked_summed)
    # ^ DIM = 50
    cosine_similarities_neg = [cos(target_question_net_output_masked_summed, N_negative_questions_net_output_masked_summed[idx])
                              for idx in range(NUM_NEGATIVE_SAMPLES)]
    # ^ DIM = 20 x 50

    cosine_similarities = torch.stack([cosine_similarity_pos] + cosine_similarities_neg) # DIM = 21 x 50
    return cosine_similarities, net_outputs_batch
# helper class used for computing information retrieval metrics, including MAP / MRR / and Precision @ x
class Evaluation():
    """Information-retrieval metrics over ranked relevance lists.

    ``data`` is a list of rankings; each ranking is a sequence in rank order
    whose entries equal 1 for a relevant result. Rankings that contain no
    relevant result are ignored by every metric, and every metric returns 0.0
    when no ranking contributes.
    """

    def __init__(self,data):
        self.data = data

    @staticmethod
    def _mean(scores):
        """Arithmetic mean of ``scores``; 0.0 for an empty list."""
        return sum(scores)/len(scores) if len(scores) > 0 else 0.0

    def Precision(self,precision_at):
        """Mean fraction of relevant results among the first ``precision_at`` of each ranking."""
        scores = []
        for ranking in self.data:
            if not any(val==1 for val in ranking):
                continue
            top = ranking[:precision_at]
            if len(top) > 0:
                hits = sum(1 for val in top if val==1)
                scores.append(hits*1.0 / len(top))
            else:
                scores.append(0.0)
        return self._mean(scores)

    def MAP(self):
        """Mean average precision: precision at each relevant rank, averaged per ranking."""
        scores = []
        for ranking in self.data:
            precisions = []
            relevant_seen = 0.0
            for rank, val in enumerate(ranking, 1):
                if val == 1:
                    relevant_seen += 1.0
                    precisions.append(relevant_seen/rank)
            if len(precisions) > 0:
                scores.append(sum(precisions) / len(precisions))
        return self._mean(scores)

    def MRR(self):
        """Mean reciprocal rank of the first relevant result of each ranking."""
        scores = []
        for ranking in self.data:
            first_hit = next((rank for rank, val in enumerate(ranking, 1) if val == 1), None)
            if first_hit is not None:
                scores.append(1.0/first_hit)
        return self._mean(scores)

def mse_loss(input, target):
    """Mean squared error: sum of squared differences divided by the element count of ``input``."""
    residual = input - target
    element_count = input.data.nelement()
    return torch.sum(residual ** 2) / element_count

In [4]:
#text_tokenized.txt.gz has id \t title \t question body
# Corpus files: one question per line (id, title, body).
ubuntu_text_tokenized_filename = 'askubuntu/text_tokenized.txt.gz'
android_text_tokenized_filename = 'Android/corpus.tsv.gz'
# id -> {'title': [tokens], 'question': [tokens]} for each corpus.
ubuntu_data_lookup_table = make_lookup_table(ubuntu_text_tokenized_filename)
android_data_lookup_table = make_lookup_table(android_text_tokenized_filename)
# The maxima are measured on the ubuntu corpus only; the android corpus is not consulted.
MAX_TITLE_LENGTH, MAX_BODY_LENGTH = find_maximum_title_and_body_length(ubuntu_data_lookup_table)

In [5]:
#-------------------------------------GENERATE EMBEDDINGS-------------------------------------#

# Path of the pre-trained word vectors: one word per line followed by its
# whitespace-separated vector components.
word_embeddings = 'askubuntu/vector/vectors_pruned.200.txt.gz'

# The context manager closes the file even if reading fails part-way
# (previously the handle stayed open unless the whole cell succeeded).
with gzip.open(word_embeddings, 'r') as f:
    wv_text = [line.strip() for line in f]

# word -> np.array of floats
# NOTE(review): mode 'r' yields byte strings on Python 3, so the keys would be
# bytes there -- confirm before porting this Python 2 notebook.
word_to_vec = {}

for line in wv_text:
    parts = line.split()
    if not parts:
        # Blank line: nothing to index (parts[0] would raise IndexError).
        continue
    word = parts[0]
    vector = np.array([float(v) for v in parts[1:]])
    word_to_vec[word] = vector

In [6]:
# Android evaluation pairs: one file of (target, positive) ids and one of
# (target, negative) ids for each of the dev and test splits.
android_dev_pos_filename = "Android/dev.pos.txt"
android_dev_neg_filename = "Android/dev.neg.txt"

android_test_pos_filename = "Android/test.pos.txt"
android_test_neg_filename = "Android/test.neg.txt"

# [target_id, positive_id, [negative_ids]] per positive pair.
android_dev_samples = convert_to_samples_android(android_dev_pos_filename, android_dev_neg_filename)
android_test_samples = convert_to_samples_android(android_test_pos_filename, android_test_neg_filename)

# Same samples with every id replaced by its lower-cased title tokens:
# [target_title, positive_title, negative_title_1, ...]. Samples containing an
# id that is missing from the android lookup table are dropped.
android_dev_titles_only = convert_sampleids_to_titles(android_dev_samples, android_data_lookup_table)
android_test_titles_only = convert_sampleids_to_titles(android_test_samples, android_data_lookup_table)

In [99]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from sklearn.feature_extraction.text import TfidfVectorizer
# Flatten every title of every dev sample into one list of words. Each word is
# then passed to the vectorizer as its own one-word "document".
words = [word for titles in android_dev_titles_only for title in titles for word in title]
# Unigram, word-level TF-IDF fitted on the dev-set title words.
transformer = TfidfVectorizer(ngram_range = (1,1), analyzer='word')
# NOTE(review): X_train_counts_word is not referenced again in the cells visible
# here; the call matters for fitting `transformer`.
X_train_counts_word = transformer.fit_transform(words)


In [100]:
from scipy.spatial.distance import cosine

# For every test sample: TF-IDF similarity between the target title (first
# entry) and each candidate title (positive first, then the negatives).
cos_sims = []
for sample in android_test_titles_only:
    question_embeddings = []
    for question in sample:
        # `question` is a list of words, so transform() returns one sparse row
        # per word; summing the rows gives a bag-of-words TF-IDF vector.
        question_embedding = transformer.transform(question)
        # Flatten the 1 x vocab np.matrix produced by the sparse sum into a
        # plain 1-D array, which is what scipy's cosine() expects.
        question_embedding = np.asarray(question_embedding.sum(axis = 0)).ravel()
        question_embeddings.append(question_embedding)
    target_vec = question_embeddings[0]
    cos_sims_sample = []
    for idx in range(1, len(question_embeddings)):
        vec = question_embeddings[idx]
        # The result is deliberately NOT stored in a variable called `cos`:
        # that name is the global similarity callable used by
        # sample_and_label_to_cosine_sims and would be overwritten with a float.
        if target_vec.any() and vec.any():
            similarity = 1. - cosine(target_vec, vec)
        else:
            # A title with no in-vocabulary word has an all-zero vector, whose
            # cosine is undefined (0/0 -> NaN); score it as "no similarity".
            similarity = 0.
        cos_sims_sample.append(similarity)
    cos_sims.append(cos_sims_sample)
        

In [101]:
# Flatten the per-sample similarity lists into parallel score/label lists.
# The first candidate of every sample is the positive one (label 1); all
# others are negatives (label 0).
outputs = []
targets = []
for cos_sim_list in cos_sims:
    for idx, cos_sim in enumerate(cos_sim_list):
        outputs.append(cos_sim)
        targets.append(1 if idx == 0 else 0)

In [102]:
# from AUC import AUCMeter
# NOTE(review): AUCMeter is defined in a later cell of this notebook (In [91]),
# so this cell fails on a fresh kernel unless that cell has been run first.
auc_evaluator = AUCMeter()
# NOTE(review): `correct` and `total` are never read in the cells visible here.
correct = 0.
total = 0.
for idx in range(len(outputs)):
    out = outputs[idx]
    targ = targets[idx]
    # add() also sets auc_evaluator.sortind, which value() reads.
    auc_evaluator.add(out, targ)
# Overwrite what add() accumulated with numpy arrays: value() calls array
# methods (.shape, .sum) on these attributes, and add() as written appends
# both the score and the target to `scores`.
auc_evaluator.scores = np.array(outputs)
auc_evaluator.targets = np.array(targets)
# AUC restricted to false-positive rates up to 0.05.
print "AUC Value (0.05): {}".format(auc_evaluator.value(0.05))


AUC Value (0.05): 0.414222526082


In [91]:

'''
Code taken from PyTorchNet (https://github.com/pytorch/tnt)

'''

import math
import numbers
import numpy as np
import torch

class Meter(object):
    """Minimal meter interface: ``reset``, ``add`` and ``value`` are no-ops here."""
    def reset(self):
        pass

    def add(self):
        pass

    def value(self):
        pass


class AUCMeter(Meter):
    """
    The AUCMeter measures the area under the receiver-operating characteristic
    (ROC) curve for binary classification problems. The area under the curve (AUC)
    can be interpreted as the probability that, given a randomly selected positive
    example and a randomly selected negative example, the positive example is
    assigned a higher score by the classification model than the negative example.

    Scores (`output`) ought to be higher when the model is more convinced that
    the example should be positively labeled, and smaller when the model
    believes the example should be negatively labeled (for instance, the output
    of a sigmoid function); `target` contains only values 0 (for negative
    examples) and 1 (for positive examples).

    `scores` and `targets` are accumulated as Python lists by `add`; callers may
    also assign numpy arrays to them directly before calling `value`.
    """
    def __init__(self):
        super(AUCMeter, self).__init__()
        self.reset()

    def reset(self):
        """Forget all accumulated scores/targets and the cached sort order."""
        self.scores = []
        self.targets = []
        # Initialised here so value() can be called before any add().
        self.sortind = None

    def add(self, output, target):
        """Record one score/target pair, or one batch of them.

        Args:
            output: a number, numpy array or torch tensor of scores.
            target: matching label(s), 1 for positive and 0 for negative.
        """
        if torch.is_tensor(output):
            output = output.cpu().squeeze().numpy()
        if torch.is_tensor(target):
            target = target.cpu().squeeze().numpy()

        # Tolerate callers that replaced the lists with arrays earlier on.
        if not isinstance(self.scores, list):
            self.scores = list(self.scores)
        if not isinstance(self.targets, list):
            self.targets = list(self.targets)

        self.sortind = None  # any cached ordering is stale now
        self.scores.append(output)
        # Previously the target was appended to self.scores, interleaving
        # labels with scores and leaving self.targets empty.
        self.targets.append(target)

    @staticmethod
    def _as_flat_array(values):
        """Return ``values`` (an array, or a list of numbers/arrays) as one 1-D numpy array."""
        if isinstance(values, np.ndarray):
            return values.ravel()
        if len(values) == 0:
            return np.zeros(0)
        return np.concatenate([np.asarray(v).ravel() for v in values])

    def value(self, max_fpr=1.0):
        """Return the area under the ROC curve up to ``max_fpr``, divided by ``max_fpr``.

        Returns 0.5 when nothing has been added. If all targets belong to a
        single class the rates are divided by zero and the result is not a
        meaningful number.

        NOTE(review): the integration uses the curve points strictly before the
        first point whose false-positive rate reaches ``max_fpr``, so the last
        step up to ``max_fpr`` is not counted. This is kept unchanged so that
        reported numbers stay comparable; the effect shrinks as the number of
        negatives grows.
        """
        assert max_fpr > 0

        # Work on flat numpy arrays regardless of how the values were stored.
        scores = self._as_flat_array(self.scores).astype(np.float64)
        targets = self._as_flat_array(self.targets)

        # case when number of elements added are 0
        if scores.shape[0] == 0:
            return 0.5

        # sorting the scores in descending order (recomputed if the cache is stale)
        if self.sortind is None or len(self.sortind) != scores.size:
            _, sortind = torch.sort(torch.from_numpy(scores), dim=0, descending=True)
            self.sortind = sortind.numpy()
        sortind = self.sortind

        # creating the roc curve: walking down the ranking, each positive moves
        # the curve up (tpr) and each negative moves it right (fpr)
        is_positive = targets[sortind] == 1
        tpr = np.zeros(shape=(scores.size + 1), dtype=np.float64)
        fpr = np.zeros(shape=(scores.size + 1), dtype=np.float64)
        tpr[1:] = np.cumsum(is_positive)
        fpr[1:] = np.cumsum(~is_positive)

        tpr /= (targets.sum() * 1.0)            # number of positives
        fpr /= ((targets - 1.0).sum() * -1.0)   # number of negatives

        # first curve point at which the false-positive budget is reached
        for n in range(1, scores.size + 1):
            if fpr[n] >= max_fpr:
                break

        # calculating area under curve using trapezoidal rule
        h = fpr[1:n] - fpr[0:n - 1]
        sum_h = np.zeros(fpr.shape)
        sum_h[0:n - 1] = h
        sum_h[1:n] += h
        area = (sum_h * tpr).sum() / 2.0

        return area / max_fpr
