In [23]:
import torch
import gzip
import numpy as np

def convert_to_list(filename):
    if filename.endswith('gz'):
        with gzip.open(filename,'r')as f:
            text_tokens = f.readlines()
    else:
        with open(filename, 'r') as f:
            text_tokens = f.readlines()
    text_tokens = [token.replace('\n','').split('\t') for token in text_tokens]
    text_tokens = [[token[0], token[1].split(' '), token[2].split(' ')] for token in text_tokens]
                   
    return text_tokens

#Sample:question_id, similar_question_id, negative_question_id
def convert_to_samples(filename):
    my_list=convert_to_list(filename)
    new_samples=[]
    for original_sample in my_list:
        for similar in original_sample[1]:
            new_samples.append([original_sample[0], similar, original_sample[2][0]])# change this to include all negative 
                                                                                     # examples later
    return new_samples
def make_lookup_table_for_training_data(filename):
    lookup={}
    text_token_list=convert_to_list(filename)
    for token in text_token_list:
        lookup[token[0]] = {'title':token[1],'question':token[2]}
    return lookup
        
#takes  sample_ids of [[q1,p1,n1],[q2,p2,n2]....]
#outputs titles like [[q1_title, p1_title, n1_title],[q2_title,p2_title,n2_title]...]
def convert_sampleids_to_titles(sample_ids,lookup):
    #each sample_id [question_id, pos_id, neg_id]
    titles = []
    for sample_id in sample_ids:
        #sample_id : question_id, similar_question_id, negative_question_id
        try:
            titles.append([lookup[str(identity)]['title'] for identity in sample_id])
        except:
            print sample_id
    return titles
    

In [26]:
#text_tokenized.txt.gz has id \t title \t question body
text_tokenized='askubuntu/text_tokenized.txt.gz'

#train_random.txt
#(1) the query question ID, (2) the list of similar question IDs, and (3) the list of randomly selected question IDs.
train_random_filename='askubuntu/train_random.txt'

#Each line contains (1) the query question ID, (2) the list of similar question IDs, (3) the list of 20 candidate question IDs and (4) the associated BM25 scores of these questions computed by the Lucene search engine. The second field (the set of similar questions) is a subset of the third field.
dev_filename='askubuntu/dev.txt'
test_filename='askubuntu/test.txt'

train_samples = convert_to_samples(train_random_filename)
dev_samples = convert_to_samples(dev_filename)
test_samples = convert_to_samples(test_filename)

lookup = make_lookup_table_for_training_data(text_tokenized)
train_list = convert_to_list(train_random_filename)
train_titles_only = convert_sampleids_to_titles(train_samples, lookup)

In [28]:
word_embeddings = 'askubuntu/vector/vectors_pruned.200.txt.gz'
f = gzip.open(word_embeddings, 'r')
wv_text = [ ]
lines = f.readlines()
for line in lines:
    wv_text.append(line.strip())

word_to_vec = {}

for line in wv_text:
    parts = line.split()
    word = parts[0]
    vector = np.array([float(v) for v in parts[1:]])
    word_to_vec[word] = vector
f.close()

def extract_features(word):
    word=word.encode('utf-8')
    return word_to_vec.get(word,[0.0 for i in range(200)])

In [29]:
extract_features('laptop')

array([  6.81100000e-03,  -1.91690000e-02,   3.72350000e-02,
         1.88340000e-02,   7.88700000e-03,  -6.64640000e-02,
         8.59540000e-02,  -8.24730000e-02,   1.83440000e-02,
        -8.74610000e-02,  -1.11447000e-01,  -3.71180000e-02,
        -8.30300000e-03,   1.40190000e-02,   8.45250000e-02,
         2.03400000e-03,  -2.82450000e-02,  -1.62900000e-02,
         7.20170000e-02,   6.04480000e-02,  -1.39264000e-01,
        -1.63540000e-02,   1.74352000e-01,   3.68500000e-03,
        -9.88580000e-02,  -4.12110000e-02,  -4.93310000e-02,
        -2.85100000e-02,  -9.08550000e-02,  -2.40180000e-02,
        -9.21600000e-03,  -3.20450000e-02,  -3.30730000e-02,
        -2.61440000e-02,   1.22893000e-01,  -3.29470000e-02,
        -9.37550000e-02,   1.87461000e-01,   1.35400000e-03,
         7.04130000e-02,   3.89100000e-03,  -2.50700000e-02,
        -1.38250000e-02,   5.29970000e-02,  -9.65200000e-02,
         5.73710000e-02,  -1.60480000e-02,   3.27330000e-02,
         5.89300000e-02,

In [41]:
def find_maximum_title_and_body_length(lookup_table):
    max_len_title = -1
    max_len_question = -1
    max_len_question_id = 0
    for key, dict_val in lookup_table.iteritems():
        len_title = len(dict_val['title'])
        len_question = len(dict_val['question'])
        if len_title > max_len_title:
             max_len_title = len_title
        if len_question > max_len_question:
            max_len_question = len_question
            max_len_question_id = key
    return max_len_title, max_len_question

def title_to_feature_matrix(title_word_list):
    feature_matrix = []
    for word in title_word_list:
        word_features = extract_features(word)
        feature_matrix.append(word_features)
    #Pad the feature with zeros to ensure all inputs to the net have the same dimension
    feature_matrix += [[0.] * NUM_FEATURES_PER_WORD] * (MAX_TITLE_LENGTH - len(title_word_list))
    return feature_matrix

In [40]:
MAX_TITLE_LENGTH, MAX_BODY_LENGTH = find_maximum_title_and_body_length(lookup)
NUM_FEATURES_PER_WORD = 200
INPUT_DIM = (MAX_TITLE_LENGTH, NUM_FEATURES_PER_WORD)

In [46]:
print train_titles_only[0][0]
print title_to_feature_matrix(train_titles_only[0][0])

['system', 'running', 'in', 'low', 'graphic', 'mode', '(', 'ubuntu', 'without', 'monitor', ')']
[array([ 0.101999, -0.104434, -0.012801,  0.068122, -0.055403,  0.04133 ,
        0.146494,  0.025313,  0.001449,  0.067583,  0.015489, -0.011643,
        0.079966,  0.020786,  0.012335,  0.084839, -0.01937 ,  0.013663,
        0.093786, -0.127042,  0.035466,  0.00035 ,  0.16625 ,  0.0992  ,
       -0.010311,  0.00118 , -0.002777,  0.000727, -0.052451,  0.035446,
        0.030237, -0.003303,  0.043869,  0.011238,  0.080022,  0.088194,
        0.016981, -0.011537,  0.046581, -0.010625,  0.110818,  0.083209,
        0.002172, -0.052772, -0.046101, -0.091841, -0.173323, -0.068792,
       -0.046995,  0.116353,  0.025877,  0.002772, -0.119018, -0.018811,
       -0.142059, -0.095646,  0.134619, -0.093484, -0.066167, -0.018144,
        0.036287, -0.053314, -0.063813,  0.066953,  0.064343, -0.015581,
       -0.060295,  0.054359, -0.003971, -0.01272 ,  0.037697, -0.002866,
       -0.031938,  0.020962

In [64]:
import torch
import torch.autograd as autograd
import torch.nn.functional as F
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from torch.autograd import Variable


class CNN(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(CNN, self).__init__()

        self.layer1 = nn.Sequential(
            nn.Conv1d(200, 200, KERNEL_SIZE),
            nn.ReLU(),
            nn.AvgPool1d(KERNEL_SIZE)
        )
    def forward(self, x):
        x = self.layer1(x)
        return x

In [65]:
from sklearn.metrics.pairwise import cosine_similarity
KERNEL_SIZE = 3
INPUT_SIZE = 200
HIDDEN_SIZE = 400
LEARNING_RATE = 1e-3
NUM_EPOCHS = 1
net = CNN(INPUT_SIZE, HIDDEN_SIZE)

criterion = nn.MultiMarginLoss(p=1, margin=1, weight=None, size_average=True) #HAHA just put these in to look smart 
optimizer = torch.optim.Adam(net.parameters(), lr = LEARNING_RATE)

# ----TRAINING
for epoch in range(NUM_EPOCHS):
    for sample in train_titles_only:
        target_title = sample[0]
        positive_title = sample[1]
        negative_title = sample[2]
        
        print target_features
        target_features = torch.FloatTensor(title_to_feature_matrix(target_title))
        print target_features.shape
        positive_features = torch.FloatTensor(title_to_feature_matrix(positive_title))
        negative_features = torch.FloatTensor(title_to_feature_matrix(negative_title))
        
        target_vec = net(target_features)
        positive_vec = net(positive_features)
        negative_vec = net(negative_features)
        
        cos_sim_positive = cosine_similarity(target_vec, positive_vec)
        cos_sim_negative = cosine_similarity(target_vec, negative_vec)
        
        cos_sims = [cos_sim_positive, cos_sim_negative]
        
        max_idx = np.argmax(cos_sims) #use axis = 1 when we use more negative examples later
        
        y = 0 
        """
        because we know the 0th index in cos_sims is always the example we expect to most closely match 
        the target question
        """
        
        running_loss = 0.0

        optimizer.zero_grad()

        loss = criterion(max_idx, y)

        loss.backward()

        print "Done batch " + str(batch_num)

        optimizer.step()

    print "Loss after epoch " + str(epoch) + " :" + str(loss.data[0])
# ----END TRAINING




 0.1020 -0.1044 -0.0128  ...   0.0344 -0.0136 -0.0370
 0.0039 -0.0796 -0.0446  ...  -0.0216  0.0232 -0.1356
-0.0264  0.0131 -0.0372  ...  -0.0599  0.0274  0.0208
          ...             ⋱             ...          
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
 0.0000  0.0000  0.0000  ...   0.0000  0.0000  0.0000
[torch.FloatTensor of size 38x200]

torch.Size([38, 200])


ValueError: Expected 3D tensor as input, got 2D tensor instead.

[['system', 'running', 'in', 'low', 'graphic', 'mode', '(', 'ubuntu', 'without', 'monitor', ')'], ['getting', 'system', 'to', 'boot', 'in', 'headless', 'mode', 'set-up', 'without', 'display', 'problems'], ['using', 'apt-get', 'partially', 'through', 'a', 'proxy', 'server']]
