In [3]:
import torch
import gzip
import numpy as np

def convert_to_list(filename):
    if filename.endswith('gz'):
        with gzip.open(filename,'r')as f:
            text_tokens = f.readlines()
    else:
        with open(filename, 'r') as f:
            text_tokens = f.readlines()
    text_tokens = [token.replace('\n','').split('\t') for token in text_tokens]
    text_tokens = [[token[0], token[1].split(' '), token[2].split(' ')] for token in text_tokens]
                   
    return text_tokens

#Sample:question_id, similar_question_id, negative_question_id
def convert_to_samples(filename):
    my_list=convert_to_list(filename)
    new_samples=[]
    for original_sample in my_list:
        for similar in original_sample[1]:
            new_samples.append([original_sample[0], similar, original_sample[2][0]])# change this to include all negative 
                                                                                     # examples later
    return new_samples
def make_lookup_table_for_training_data(filename):
    lookup={}
    text_token_list=convert_to_list(filename)
    for token in text_token_list:
        lookup[token[0]] = {'title':token[1],'question':token[2]}
    return lookup
        
#takes  sample_ids of [[q1,p1,n1],[q2,p2,n2]....]
#outputs titles like [[q1_title, p1_title, n1_title],[q2_title,p2_title,n2_title]...]
def convert_sampleids_to_titles(sample_ids,lookup):
    #each sample_id [question_id, pos_id, neg_id]
    titles = []
    for sample_id in sample_ids:
        #sample_id : question_id, similar_question_id, negative_question_id
        try:
            titles.append([lookup[str(identity)]['title'] for identity in sample_id])
        except:
            print sample_id
    return titles
    

In [4]:
#text_tokenized.txt.gz has id \t title \t question body
text_tokenized='askubuntu/text_tokenized.txt.gz'

#train_random.txt
#(1) the query question ID, (2) the list of similar question IDs, and (3) the list of randomly selected question IDs.
train_random_filename='askubuntu/train_random.txt'

#Each line contains (1) the query question ID, (2) the list of similar question IDs, (3) the list of 20 candidate question IDs and (4) the associated BM25 scores of these questions computed by the Lucene search engine. The second field (the set of similar questions) is a subset of the third field.
dev_filename='askubuntu/dev.txt'
test_filename='askubuntu/test.txt'

train_samples = convert_to_samples(train_random_filename)
dev_samples = convert_to_samples(dev_filename)
test_samples = convert_to_samples(test_filename)

lookup = make_lookup_table_for_training_data(text_tokenized)
train_list = convert_to_list(train_random_filename)
train_titles_only = convert_sampleids_to_titles(train_samples, lookup)

In [28]:
def remove_non_ascii(text):
    return ''.join([i if ord(i) < 128 else '' for i in text])

word_embeddings = 'askubuntu/vector/vectors_pruned.200.txt.gz'
f = gzip.open(word_embeddings, 'r')
wv_text = [ ]
lines = f.readlines()
for line in lines:
    wv_text.append(line.strip())

word_to_vec = {}

for line in wv_text:
    parts = line.split()
    word = parts[0]
    vector = np.array([float(v) for v in parts[1:]])
    word_to_vec[word] = vector
f.close()

def extract_features(word):
    try:
        word=remove_non_ascii(word)
        word=word.encode('utf-8')
    except:
        print(word)
    return word_to_vec.get(word,[0.0 for i in range(200)])

In [6]:
extract_features('laptop')

array([  6.81100000e-03,  -1.91690000e-02,   3.72350000e-02,
         1.88340000e-02,   7.88700000e-03,  -6.64640000e-02,
         8.59540000e-02,  -8.24730000e-02,   1.83440000e-02,
        -8.74610000e-02,  -1.11447000e-01,  -3.71180000e-02,
        -8.30300000e-03,   1.40190000e-02,   8.45250000e-02,
         2.03400000e-03,  -2.82450000e-02,  -1.62900000e-02,
         7.20170000e-02,   6.04480000e-02,  -1.39264000e-01,
        -1.63540000e-02,   1.74352000e-01,   3.68500000e-03,
        -9.88580000e-02,  -4.12110000e-02,  -4.93310000e-02,
        -2.85100000e-02,  -9.08550000e-02,  -2.40180000e-02,
        -9.21600000e-03,  -3.20450000e-02,  -3.30730000e-02,
        -2.61440000e-02,   1.22893000e-01,  -3.29470000e-02,
        -9.37550000e-02,   1.87461000e-01,   1.35400000e-03,
         7.04130000e-02,   3.89100000e-03,  -2.50700000e-02,
        -1.38250000e-02,   5.29970000e-02,  -9.65200000e-02,
         5.73710000e-02,  -1.60480000e-02,   3.27330000e-02,
         5.89300000e-02,

In [22]:
def find_maximum_title_and_body_length(lookup_table):
    max_len_title = -1
    max_len_question = -1
    max_len_question_id = 0
    for key, dict_val in lookup_table.iteritems():
        len_title = len(dict_val['title'])
        len_question = len(dict_val['question'])
        if len_title > max_len_title:
             max_len_title = len_title
        if len_question > max_len_question:
            max_len_question = len_question
            max_len_question_id = key
    return max_len_title, max_len_question

def title_to_feature_matrix(title_word_list):
    feature_matrix = []
    for word in title_word_list:
        word_features = extract_features(word)
        feature_matrix.append(word_features)
    #Pad the feature with zeros to ensure all inputs to the net have the same dimension
    feature_matrix += [[0.] * NUM_FEATURES_PER_WORD] * (MAX_TITLE_LENGTH - len(title_word_list))
    #print np.array(feature_matrix).T.shape
    return np.array(feature_matrix).T

In [8]:
MAX_TITLE_LENGTH, MAX_BODY_LENGTH = find_maximum_title_and_body_length(lookup)
NUM_FEATURES_PER_WORD = 200
INPUT_DIM = (MAX_TITLE_LENGTH, NUM_FEATURES_PER_WORD)

In [9]:
print train_titles_only[0][0]
print title_to_feature_matrix(train_titles_only[0][0])

['system', 'running', 'in', 'low', 'graphic', 'mode', '(', 'ubuntu', 'without', 'monitor', ')']
(200, 38)
[[ 0.101999  0.00388  -0.026436 ...,  0.        0.        0.      ]
 [-0.104434 -0.07965   0.013091 ...,  0.        0.        0.      ]
 [-0.012801 -0.044619 -0.037213 ...,  0.        0.        0.      ]
 ..., 
 [ 0.034353 -0.021587 -0.059916 ...,  0.        0.        0.      ]
 [-0.013605  0.023161  0.027431 ...,  0.        0.        0.      ]
 [-0.037034 -0.135637  0.020814 ...,  0.        0.        0.      ]]


In [10]:
import torch
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from torch.autograd import Variable


class CNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(CNN, self).__init__()
        
        self.conv1d = nn.Conv1d(200, 200, KERNEL_SIZE)
        
        self.layer1 = nn.Sequential(
            nn.Conv1d(200, 200, KERNEL_SIZE),
            nn.ReLU(),
            nn.AvgPool1d(KERNEL_SIZE)
        )
    def forward(self, x):
        x = self.layer1(x)
        return x

In [34]:
from sklearn.metrics.pairwise import cosine_similarity
KERNEL_SIZE = 3
INPUT_SIZE = 200
HIDDEN_SIZE = 400
LEARNING_RATE = 1e-3
NUM_EPOCHS = 1
net = CNN(INPUT_SIZE, HIDDEN_SIZE)

criterion = nn.MultiMarginLoss(p=1, margin=1, weight=None, size_average=True) #HAHA just put these in to look smart 
optimizer = torch.optim.Adam(net.parameters(), lr = LEARNING_RATE)

# ----TRAINING
for epoch in range(NUM_EPOCHS):
    for sample in train_titles_only:
        target_title = sample[0]
        positive_title = sample[1]
        negative_title = sample[2]
        
        target_features = title_to_feature_matrix(target_title)
        positive_features = title_to_feature_matrix(positive_title)
        negative_features = title_to_feature_matrix(negative_title)
        
        target_features = Variable(
                        torch.FloatTensor(
                            [target_features]
                        )
                        )
        positive_features = Variable(
                        torch.FloatTensor(
                            [positive_features]
                        )
                        )
        negative_features = Variable(
                        torch.FloatTensor(
                            [negative_features]
                        )
                        )

        target_matrix = net(target_features)
        positive_matrix = net(positive_features)
        negative_matrix = net(negative_features)
                
        #target_vec = [item for sublist in target_matrix.data.numpy()[0] for item in sublist]
        target_vec=target_matrix.data.numpy()[0].reshape(1,-1)
        positive_vec=target_matrix.data.numpy()[0].reshape(1,-1)
        negative_vec=target_matrix.data.numpy()[0].reshape(1,-1)
        #positive_vec = [item for sublist in positive_matrix.data.numpy()[0] for item in sublist]
        #negative_vec = [item for sublist in negative_matrix.data.numpy()[0] for item in sublist]
        #print "target_vec: " + str(target_vec)
        
        cos_sim_positive = cosine_similarity(target_vec, positive_vec)
        cos_sim_negative = cosine_similarity(target_vec, negative_vec)
        
        cos_sims = [cos_sim_positive, cos_sim_negative]
        #print "cos_sims: " + str(cos_sims)
        #max_idx = Variable(torch.FloatTensor([np.argmax(cos_sims)]), requires_grad=True) #use axis = 1 when we use more negative examples later
        max_idx = Variable(torch.FloatTensor([np.argmaxcos_sims]), requires_grad=True)
        #print "max_idx: " + str(max_idx)
        y = Variable(torch.LongTensor([1]),requires_grad=True)
        #print y
        
        #print type(max_idx)
        #print type(y)
        """
        because we know the 0th index in cos_sims is always the example we expect to most closely match 
        the target question
        """
        
        running_loss = 0.0

        optimizer.zero_grad()

        loss = criterion(max_idx, y)

        loss.backward()

        optimizer.step()

    print "Loss after epoch " + str(epoch) + " :" + str(loss.data[0])
# ----END TRAINING



RuntimeError: invalid argument 3: target out of range at /Users/soumith/code/builder/wheel/pytorch-src/torch/lib/THNN/generic/MultiMarginCriterion.c:43

In [36]:
X_scores = torch.stack(torch.FloatTensor([torch.FloatTensor(1),2,3]), 0)

AttributeError: 'float' object has no attribute 'unsqueeze'