In [11]:
NUM_NEGATIVE_SAMPLES=20

from random import randint
def N_random_values_in_list(full_list, N):
    x=0
    lower_bound  = 0
    upper_bound = len(full_list)-1
    sample_list=[]
    random_nums=[]
    while x < min(N,len(full_list)):
        random_num = randint(lower_bound, upper_bound) # inclusive range
        if random_num in random_nums:
            continue
        else:
            random_nums.append(random_num)
            x += 1
    return [full_list[i] for i in random_nums]

In [12]:
import torch
import gzip
import numpy as np

def convert_to_list(filename):
    if filename.endswith('gz'):
        with gzip.open(filename,'r')as f:
            text_tokens = f.readlines()
    else:
        with open(filename, 'r') as f:
            text_tokens = f.readlines()
    text_tokens = [token.replace('\n','').split('\t') for token in text_tokens]
    text_tokens = [[token[0], token[1].split(' '), token[2].split(' ')] for token in text_tokens]
                   
    return text_tokens

#Sample:question_id, similar_question_id, negative_question_id
def convert_to_samples(filename):
    my_list=convert_to_list(filename)
    new_samples=[]
    for original_sample in my_list:
        for similar in original_sample[1]:
            random_negative_samples = N_random_values_in_list(original_sample[2],NUM_NEGATIVE_SAMPLES)
            new_samples.append([original_sample[0], similar, random_negative_samples])# change this to include all negative 
                                                                                     # examples later
    return new_samples
def make_lookup_table_for_training_data(filename):
    lookup={}
    text_token_list=convert_to_list(filename)
    for token in text_token_list:
        lookup[token[0]] = {'title':token[1],'question':token[2]}
    return lookup
        
#takes  sample_ids of [[q1,p1,n1],[q2,p2,n2]....]
#outputs titles like [[q1_title, p1_title, n1_title],[q2_title,p2_title,n2_title]...]
def convert_sampleids_to_titles(sample_ids,lookup):
    #each sample_id [question_id, pos_id, [neg_ids]]
    #print type(sample_ids)==list, "first"
   
    titles = []
    for sample_id in sample_ids:
        
         #flatten list: [question_id, pos_id, [neg_ids]] --> [question_id, pos_id, neg_id1, neg_id2, ...]
        sample_id= sample_id[:2]+sample_id[2][:]
        #sample_id : question_id, similar_question_id, negative_question_id
        try:
            titles.append([lookup[str(identity)]['title'] for identity in sample_id])
           # print type(sample_id)==list
        except:
            print sample_id, "is sample id", type(sample_id)==list
    return titles
    

In [13]:
#text_tokenized.txt.gz has id \t title \t question body
text_tokenized='askubuntu/text_tokenized.txt.gz'

#train_random.txt
#(1) the query question ID, (2) the list of similar question IDs, and (3) the list of randomly selected question IDs.
train_random_filename='askubuntu/train_random.txt'

#Each line contains (1) the query question ID, (2) the list of similar question IDs, (3) the list of 20 candidate question IDs and (4) the associated BM25 scores of these questions computed by the Lucene search engine. The second field (the set of similar questions) is a subset of the third field.
dev_filename='askubuntu/dev.txt'
test_filename='askubuntu/test.txt'

train_samples = convert_to_samples(train_random_filename)
dev_samples = convert_to_samples(dev_filename)
test_samples = convert_to_samples(test_filename)

lookup = make_lookup_table_for_training_data(text_tokenized)
train_list = convert_to_list(train_random_filename)
train_titles_only = convert_sampleids_to_titles(train_samples, lookup)

In [14]:
def remove_non_ascii(text):
    return ''.join([i if ord(i) < 128 else '' for i in text])

word_embeddings = 'askubuntu/vector/vectors_pruned.200.txt.gz'
f = gzip.open(word_embeddings, 'r')
wv_text = [ ]
lines = f.readlines()
for line in lines:
    wv_text.append(line.strip())

word_to_vec = {}

for line in wv_text:
    parts = line.split()
    word = parts[0]
    vector = np.array([float(v) for v in parts[1:]])
    word_to_vec[word] = vector
f.close()

def extract_features(word):
    try:
        word=remove_non_ascii(word)
        word=word.encode('utf-8')
    except:
        print(word)
    return word_to_vec.get(word,[0.0 for i in range(200)])

In [15]:
def find_maximum_title_and_body_length(lookup_table):
    max_len_title = -1
    max_len_question = -1
    max_len_question_id = 0
    for key, dict_val in lookup_table.iteritems():
        len_title = len(dict_val['title'])
        len_question = len(dict_val['question'])
        if len_title > max_len_title:
             max_len_title = len_title
        if len_question > max_len_question:
            max_len_question = len_question
            max_len_question_id = key
    return max_len_title, max_len_question

def title_to_feature_matrix(title_word_list):
    feature_matrix = []
    for word in title_word_list:
        word_features = extract_features(word)
        feature_matrix.append(word_features)
    #Pad the feature with zeros to ensure all inputs to the net have the same dimension
    feature_matrix += [[0.] * NUM_FEATURES_PER_WORD] * (MAX_TITLE_LENGTH - len(title_word_list))
    #print np.array(feature_matrix).T.shape
    return np.array(feature_matrix).T

In [16]:
MAX_TITLE_LENGTH, MAX_BODY_LENGTH = find_maximum_title_and_body_length(lookup)
NUM_FEATURES_PER_WORD = 200
INPUT_DIM = (MAX_TITLE_LENGTH, NUM_FEATURES_PER_WORD)

In [None]:
import torch.utils.data as data_utils
BATCH_SIZE=50

#for each tuple of titles make a feature vector that is num_titles x 200 x 38
# where num_titles = 1 (target) + 1 (positive) + n (negative) 
features = []
for i in range(len(train_titles_only[:10000])):   # we should include all but this is just for simplicity 
    sample=train_titles_only[i]
    if i%1000==0:
        print i
    target_title = sample[0]
    positive_title = sample[1]
    negative_titles = sample[2:]
        
    target_features = title_to_feature_matrix(target_title)
    positive_features = title_to_feature_matrix(positive_title)
    n_negative_features = [title_to_feature_matrix(negative_title) for negative_title in negative_titles]
    
    #all features should be size num_titles x feature size ....
    # where num_titles = 1 (target) + 1 (positive) + n (negative) 
    all_features=[[[0]*MAX_TITLE_LENGTH] * NUM_FEATURES_PER_WORD]*(2+NUM_NEGATIVE_SAMPLES)

    all_features[0]=target_features
    all_features[1]=positive_features
    all_features[2:len(n_negative_features)+2]=n_negative_features
    features.append(all_features)
print("created features", len(all_features))

print torch.FloatTensor(features).size(0)
targets = torch.LongTensor(torch.FloatTensor(features).size(0), 1).zero_()
train = data_utils.TensorDataset(torch.FloatTensor(features), targets)
print("created features")
train_loader = data_utils.DataLoader(train, batch_size=BATCH_SIZE, shuffle=True)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [None]:
len(features[1])

In [9]:
import torch
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from torch.autograd import Variable


class CNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(CNN, self).__init__()
        
        self.conv1d = nn.Conv1d(200, 200, KERNEL_SIZE)
        
        self.layer1 = nn.Sequential(
            nn.Conv1d(200, hidden_dim, KERNEL_SIZE),
            nn.ReLU(),
            nn.AvgPool1d(KERNEL_SIZE)
        )
    def forward(self, x):
        x = self.layer1(x)
        return x

In [76]:
from sklearn.metrics.pairwise import cosine_similarity
KERNEL_SIZE = 3
INPUT_SIZE = 200
HIDDEN_SIZE = 600
LEARNING_RATE = 1e-3
NUM_EPOCHS = 25
net = CNN(INPUT_SIZE, HIDDEN_SIZE)

criterion = nn.MultiMarginLoss(p=1, margin=0.2, weight=None, size_average=True) #HAHA just put these in to look smart 
optimizer = torch.optim.Adam(net.parameters(), lr = LEARNING_RATE)

print "going in to training"
# ----TRAINING
for epoch in range(NUM_EPOCHS):
    running_loss = 0.0
   # for sample in features:
    for i,(sample,label) in enumerate(train_loader):
        if i%25==0:
            print i
        target_features=Variable(sample[:,0])
        positive_features=Variable(sample[:,1])
        n_negative_features=[Variable(sample[:,i]) for i in range(2, len(sample[0]))]
        #print target_features.data.numpy().shape, "is target features shape"
        target_matrix = net(target_features)
        #print target_matrix.data.numpy().shape, "is target matrix shape"
        positive_matrix = net(positive_features)
        n_negative_matrices = [net(negative_features) for negative_features in n_negative_features]
        
        #target_vec = target_matrix.data.numpy()[0].reshape(50,1,-1)
        target_vec = target_matrix.data.numpy().flatten()
        #print target_vec.shape, "is target vec shape"
        positive_vec = positive_matrix.data.numpy().flatten()
        n_negative_vecs = [negative_matrix.data.numpy().flatten() for negative_matrix in n_negative_matrices]
        #print target_vec, "is target_vec"
        #print target_vec.shape, "is for target vec"
        #print positive_vec.shape, "is for pos vec"
        cos_sim_positive = [cosine_similarity(target_vec[i], positive_vec[i]) for i in range(BATCH_SIZE)]
        all_cos_sim_negative=[]
        for i in range(BATCH_SIZE):
            batch_of_cos_sims=[]
            for j in range(len(n_negative_vecs)):
                negative_vec=n_negative_vecs[j]
                #print negative_vec[0], "is negative_vec", i
                batch_of_cos_sims.append(cosine_similarity(target_vec[i], negative_vec[i]))
            all_cos_sim_negative.append(batch_of_cos_sims)

        all_cos_sim_negative=np.array(all_cos_sim_negative)[:,:,:,0]
        cos_sim_positive = np.array([cos_sim_positive[i] for i in range(BATCH_SIZE)])
        # cos sims should be BATCH_SIZE x num questions (i.e. 2 when only using 1 negative example or 21 when using 20)        
        cos_sims=np.concatenate((cos_sim_positive, all_cos_sim_negative),axis=1)
        # print cos_sims[0], "is cos_sims for first in batch"
        max_idx = Variable(torch.FloatTensor(np.array(cos_sims)), requires_grad=True) #use axis = 1 when we use more negative examples later
        max_idx=max_idx[:,:,0]
       # y = Variable(torch.LongTensor([0]),requires_grad=True)
        y = Variable(label)
        y=y[:,0]
        """
        because we know the 0th index in cos_sims is always the example we expect to most closely match 
        the target question
        """

        optimizer.zero_grad()
        
        loss = criterion(max_idx, y)
        #print loss, max_idx, "is loss and max idx"
        loss.backward()
    
        running_loss += loss.data[0]
#         print running_loss
        optimizer.step()

    print "Loss after epoch " + str(epoch) + " :" + str(running_loss)
# ----END TRAINING



going in to training
0
Loss after epoch 0 :4.37520591915
0
Loss after epoch 1 :4.33634459972
0
Loss after epoch 2 :4.43634518981
0


KeyboardInterrupt: 

In [74]:
len(train_titles_only)

22853

In [41]:
criterion(Variable(torch.FloatTensor([1,2,3,4,5,6,7,8,9,10,10.2])),Variable(torch.LongTensor([10])))

Variable containing:
 0
[torch.FloatTensor of size 1]

In [75]:
sample[0][0:3]


( 0 ,.,.) = 
  0.1273  0.1425  0.1396  ...   0.0000  0.0000  0.0000
 -0.0432 -0.1104 -0.0985  ...   0.0000  0.0000  0.0000
  0.0310  0.0730  0.0684  ...   0.0000  0.0000  0.0000
           ...             ⋱             ...          
  0.2174 -0.0451 -0.0313  ...   0.0000  0.0000  0.0000
 -0.0273  0.0040 -0.0797  ...   0.0000  0.0000  0.0000
 -0.0238  0.0159 -0.0015  ...   0.0000  0.0000  0.0000

( 1 ,.,.) = 
  0.1273  0.1425  0.0631  ...   0.0000  0.0000  0.0000
 -0.0432 -0.1104 -0.0884  ...   0.0000  0.0000  0.0000
  0.0310  0.0730 -0.0981  ...   0.0000  0.0000  0.0000
           ...             ⋱             ...          
  0.2174 -0.0451  0.0737  ...   0.0000  0.0000  0.0000
 -0.0273  0.0040 -0.0971  ...   0.0000  0.0000  0.0000
 -0.0238  0.0159 -0.0388  ...   0.0000  0.0000  0.0000

( 2 ,.,.) = 
  0.0827 -0.0385  0.0000  ...   0.0000  0.0000  0.0000
 -0.0481 -0.0178  0.0000  ...   0.0000  0.0000  0.0000
  0.0284 -0.1324  0.0000  ...   0.0000  0.0000  0.0000
           ...         

In [None]:
cosine_similarity(sample[0][0].numpy().flatten(), sample[0][1].numpy().flatten())[0][0]

In [59]:
sample[0][0].numpy().shape

(200, 38)