In [76]:
import datetime
import gensim
import numpy as np
import os.path
import pandas as pd
import torch, stanza
from torch import nn
import torch.nn.functional as F
import threading


# Data Processing

This section can have stuff related to data prep.


Should the MSPC Dataset be a part of this section? - Adam

In [77]:
def read_file(file_name):
    """Load a tab-separated MSR paraphrase file into a DataFrame.

    The file is split by hand rather than with ``pd.read_csv`` because
    ``read_csv`` complained about the formatting of the tsv file.
    The first line is treated as the header and discarded.

    Args:
        file_name: path of the tab-separated file (read as UTF-8).

    Returns:
        A DataFrame with the string columns
        ``Quality, ID1, ID2, String1, String2`` (one row per data line).
    """
    rows = []
    with open(file_name, encoding="utf8") as f:
        for line in f:
            # Drop the line terminator so it does not stay glued to String2
            line = line.rstrip('\r\n')
            if not line:
                # A blank line would otherwise become a row of missing values
                continue
            rows.append(line.split('\t'))

    # rows[0] is the header line of the file
    return pd.DataFrame(rows[1:], columns=['Quality', 'ID1', 'ID2', 'String1', 'String2'])


# Model Definition

![Model Overview](./images/overview.png)


## Input Layer

The input layer is made up of a Stanford Parser that provides a syntactic tree, so that the model can extract the significant words (mainly subject, predicate and object) from the input corpus. Word2Vec is then used to map the words into vectors.

In [78]:
# Module-level stanza pipeline used by trunk_construction().
# set 'download_method = None' to not download the resources over and over
# use_gpu=False keeps this shared pipeline on the CPU.
nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None, use_gpu=False)

def trunk_construction(str, parent_label = None):
    """Reduce a sentence to its "trunk": the nouns and verbs kept by construct_sentence.

    Args:
        str: raw sentence text, parsed with the module-level ``nlp`` pipeline.
            Only the first sentence found by the parser is used.
        parent_label: label handed to ``construct_sentence`` as the parent of
            the root node (``None`` for a whole sentence).

    Returns:
        The retained words joined by single spaces, or ``''`` when the parser
        produced no sentence at all.
    """
    doc = nlp(str)
    if not doc.sentences:
        # Nothing was parsed (e.g. empty input): avoid an IndexError below
        return ''

    tree = doc.sentences[0].constituency
    words = construct_sentence(tree, parent_label)
    return ' '.join(words)

def construct_sentence(tree, parent_label = None, leave_pos=False):
    """Walk a constituency tree and collect its noun and verb words in order.

    A node contributes its leaf labels when its label contains 'NN' and its
    parent is an 'NP', or when its label contains 'VB' and its parent is a
    'VP'. Children are then visited recursively with this node as parent.

    Args:
        tree: node exposing ``label``, ``children`` and ``leaf_labels()``.
        parent_label: label of the node's parent (``None`` at the root).
        leave_pos: accepted but not used.

    Returns:
        List of the collected words, in tree order.
    """
    label = tree.label
    noun_under_np = 'NN' in label and parent_label == 'NP'
    verb_under_vp = 'VB' in label and parent_label == 'VP'

    words = []
    if noun_under_np:
        words.extend(tree.leaf_labels())
    if verb_under_vp:
        words.extend(tree.leaf_labels())

    for subtree in tree.children:
        words.extend(construct_sentence(subtree, label))
    return words

def find_subject_branches(tree):
    """Return every node labelled 'S' in the tree, in pre-order.

    Args:
        tree: node exposing ``label`` and ``children``.

    Returns:
        List of nodes (the tree itself first, if it is an 'S', followed by the
        matches found in each child in turn).
    """
    subjects = [tree] if tree.label == 'S' else []

    for child in tree.children:
        subjects.extend(find_subject_branches(child))

    return subjects

# Earlier subject/predicate/object draft, kept for reference only.
# #
# # # According to the paper the subject is the first NN child of NP
# def find_subject(noun_phrase):
#     subject = None
#     for child in noun_phrase.children:
#         if  'NN' in child.label:
#             subject = ' '.join(child.leaf_labels())
#             break
#
#     print(subject)
#     return subject
#
# def find_predicate(verb_phrase):
#     predicate = None
#     for child in verb_phrase.children:
#         if 'VB' in child.label:
#             predicate = child.leaf_labels()
#             break
#
#     return predicate
#
# def find_object(verb_phrase, parent_label = None):
#     objects = []
#     if 'NN' in verb_phrase.label and (parent_label == 'NP' or parent_label == 'PP' or parent_label == 'ADJP'):
#         objects.append(' '.join(verb_phrase.leaf_labels()))
#     else:
#         for child in verb_phrase.children:
#             objects = objects + find_object(child, verb_phrase.label)
#
#     return objects
#
#
# def find_spo(s):
#     subject, predicate, obj = None, None, None
#     for child in s.children:
#         if child.label == 'NP':
#             subject = find_subject(child)
#         if child.label == 'VP':
#             predicate = find_predicate(child)
#             obj = ' '.join(find_object(child))
#
#     return subject, predicate, obj
#
# def trunk_construction(str):
#     doc = nlp(str)
#     tree = doc.sentences[0].constituency
#     print(tree)
#     subject_branches = find_subject_branches(tree)
#     subjects, predicates, objects = [], [], []
#     for subject in subject_branches:
#         subject, predicate, object = find_spo(subject)
#         if subject is not None: subjects.append(subject)
#         if predicate is not None: predicates.append(predicate)
#         if objects is not None: objects.append(object)
#
#     return subjects, predicates, objects

2023-04-08 22:19:15 INFO: Loading these models for language: en (English):
| Processor    | Package  |
---------------------------
| tokenize     | combined |
| pos          | combined |
| constituency | wsj      |

2023-04-08 22:19:15 INFO: Using device: cpu
2023-04-08 22:19:15 INFO: Loading: tokenize
2023-04-08 22:19:15 INFO: Loading: pos
2023-04-08 22:19:15 INFO: Loading: constituency
2023-04-08 22:19:16 INFO: Done loading processors!


In [79]:
def test_parser(str, valid_sentence):
    """Assert that the trunk extracted from ``str`` equals ``valid_sentence``."""
    assert trunk_construction(str) == valid_sentence

## Parser Test Cases
Test the parser using some of the training data sentences as input and asserting the output sentence matches the algorithm defined in the paper.

In [82]:
# (input sentence, expected trunk) pairs, checked in order
PARSER_TEST_CASES = [
    ('Syrian forces launch new attacks',
     "forces launch attacks"),
    ("""the flat tire was replaced by the driver""",
     "tire was replaced driver"),
    ("""Amrozi accused his brother, whom he called "the witness", of deliberately distorting his evidence.""",
     "Amrozi accused brother called witness distorting evidence"),
    ("""Shares of Genentech, a much larger company with several products on the market, rose more than 2 percent""",
     "Shares Genentech company products market rose percent"),
    ("""Gyorgy Heizler, head of the local disaster unit, said the coach was carrying 38 passengers.""",
     "Gyorgy Heizler head disaster unit said coach was carrying passengers"),
    ("""Referring to him as only "the witness", Amrozi accused his brother of deliberately distorting his evidence.""",
     "Referring witness Amrozi accused brother distorting evidence"),
    ("""His wife said he was "100 percent behind George Bush" and looked forward to using his years of training in the war.""",
     "wife said was percent George Bush looked using years training war"),
    ("""Sheena Young of Child, the national infertility support network, hoped the guidelines would lead to a more "fair and equitable" service for infertility sufferers""",
     "Sheena Young Child network hoped guidelines lead service infertility sufferers"),
    ("""Sheena Young, for Child, the national infertility support network, said the proposed guidelines should lead to a more "fair and equitable" service for infertility sufferers.""",
     "Sheena Young Child network said guidelines lead service infertility sufferers"),
    ("""Among CNN viewers, 29 percent said they were Republicans and 36 percent called themselves conservatives.""",
     "CNN viewers percent said were Republicans percent called conservatives"),
    ("""Out of Fox viewers, 41 percent describe themselves as Republicans, 24 percent as Democrats and 30 percent as Independents""",
     "Fox viewers percent describe Republicans percent Democrats percent Independents"),
]

for sentence, expected_trunk in PARSER_TEST_CASES:
    test_parser(sentence, expected_trunk)

# Note: stanza parser has a problem with the below sentence.  It is unable to parse it correctly
# test_parser("""Sheena Young, for Child, the national infertility support network, said the proposed guidelines should lead to a more "fair and equitable" service for infertility sufferers.""", "")
# test_parser("""Among Fox viewers, 41 percent describe themselves as Republicans, 24 percent as Democrats and 30 percent as Independents""", "Fox viewers percent describe Republicans percent Democrats percent Independents")

## Concurrency Parsing
Added support for concurrent parsing.  This can help in the performance of the preprocessing

In [83]:
class SentenceProcessingThread(threading.Thread):
    """Worker thread that converts a slice of sentences into their trunks.

    Each worker builds its own stanza pipeline and writes its results into a
    shared, pre-sized output list at positions ``begin + i``.

    Args:
        sentences: iterable of sentence strings this worker is responsible for.
        output_list: shared list that receives the results; must already be
            long enough to be indexed up to ``begin + len(sentences) - 1``.
        begin: index in ``output_list`` of this worker's first result.
        end: end index of the slice; only used in the progress message.
    """
    def __init__(self, sentences, output_list, begin, end):
        super(SentenceProcessingThread, self).__init__()
        self.sentences = sentences
        # Private pipeline so that workers do not share one parser instance
        self.nlp = stanza.Pipeline(lang='en', processors='tokenize,pos,constituency', download_method=None, use_gpu=True)
        self.output_list = output_list
        self.begin = begin
        self.end = end

    def trunk_construction(self, str, parent_label = None):
        """Parse ``str`` with this worker's pipeline and return its trunk.

        Only the first parsed sentence is used. Returns ``''`` when the parser
        produced no sentence.
        """
        doc = self.nlp(str)
        if not doc.sentences:
            # Nothing was parsed (e.g. empty input): avoid an IndexError below
            return ''

        tree = doc.sentences[0].constituency

        words = construct_sentence(tree, parent_label)
        return ' '.join(words)

    def run(self):
        """Process every assigned sentence and store the results in place."""
        print(f"going to process {self.begin} to {self.end}")
        for i, sentence in enumerate(self.sentences):
            # Must go through self: the module-level trunk_construction would
            # use the shared global pipeline and leave self.nlp unused.
            new_sentence = self.trunk_construction(sentence)
            self.output_list[self.begin + i] = new_sentence

def process_sentences_concurrently(sentences, output, p=2):
    """Split ``sentences`` into ``p`` contiguous chunks and process them in threads.

    Each chunk is handled by a ``SentenceProcessingThread`` which writes its
    results into ``output`` at the same positions the sentences occupy in
    ``sentences``. The call returns once every worker has finished.

    Args:
        sentences: sliceable sequence of sentence strings.
        output: pre-sized list (at least ``len(sentences)`` long), filled in place.
        p: requested number of worker threads; capped at ``len(sentences)``.

    Raises:
        ValueError: if ``p`` is smaller than 1.
    """
    if p < 1:
        raise ValueError(f"p must be a positive integer, got {p}")

    total = len(sentences)
    if total == 0:
        # Nothing to do; do not build any pipelines
        return

    # Every worker loads its own pipeline, so never start more workers than
    # there are sentences to process.
    p = min(p, total)
    interval = total // p

    threads = []
    for i in range(p):
        s = i * interval
        # The last worker also takes the remainder of the division
        e = total if i == p - 1 else s + interval
        sentence_thread = SentenceProcessingThread(sentences[s:e], output, s, e)
        sentence_thread.start()
        threads.append(sentence_thread)

    for thread in threads:
        thread.join()

def preprocess_corpus(input_file='data/msr_paraphrase_train.txt', output_file='data/msr_paraphrase_train_stanza.txt', N=None):
    """Run the parser over a corpus file and cache the reduced sentences on disk.

    If ``output_file`` already exists nothing is done. Otherwise the first
    ``N`` rows of ``input_file`` are read, both sentence columns are reduced
    with ``process_sentences_concurrently`` and the result is written to
    ``output_file`` as a tab-separated file.

    Args:
        input_file: path of the raw tab-separated corpus.
        output_file: path of the preprocessed file to create.
        N: number of rows to process; ``None`` (or a value larger than the
            file) means all rows.

    Returns:
        None.
    """
    if os.path.exists(output_file):
        print(f"{output_file} already exists")
        return

    starttime = datetime.datetime.now()
    df = read_file(input_file)

    # Clamp N so the output buffers and the reported count match the data
    if N is None or N > len(df):
        N = len(df)

    output1 = [None] * N
    output2 = [None] * N

    cuda_available = torch.cuda.is_available()
    # we can process with more threads if we only have CPU;
    # if cuda is available we don't need that many threads
    # and if the number of threads is set too large using cuda
    # we can get out of memory exceptions
    p = 2 if cuda_available else 8

    if cuda_available:
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String1[:N], output1, p)

    # try and be careful with gpu memory
    if cuda_available:
        torch.cuda.empty_cache()

    process_sentences_concurrently(df.String2[:N], output2, p)

    endtime = datetime.datetime.now()

    print(f"time to process {N*2} sentences is {endtime - starttime}")

    # Work on an independent copy: assigning columns on a slice of df
    # triggers pandas' SettingWithCopy warning.
    stanza_df = df[:N].copy()
    stanza_df['String1'] = output1
    stanza_df['String2'] = output2

    # write the file out.  This can help in the future
    print(f"about to write out {output_file}")
    stanza_df.to_csv(output_file, sep="\t")


## Sentence Preprocessing
Pass the input sentences from the training dataset through the Stanford/stanza parser to extract the relevant parts of speech, then tokenize the processed sentences using the `gensim.utils.simple_preprocess` utility.

In [84]:
# start_time = datetime.datetime.now()
#
# processed_string1 = df[:500].String1.apply(trunk_construction)
# processed_string2 = df[:500].String2.apply(trunk_construction)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with stanza library took {end_time - start_time}")
#
# start_time = datetime.datetime.now()
#
# processed_string1 = processed_string1.apply(gensim.utils.simple_preprocess)
# processed_string2 = processed_string2.apply(gensim.utils.simple_preprocess)
#
# end_time = datetime.datetime.now()
#
# print (f"Processing 200 sentences with gensim.utils.simple_preprocess took {end_time - start_time}")
# print(f"Number of sentences processed in the String1 column: len(processed_string1")
# print(f"Number of sentences processed in the String2 column: len(processed_string2")

# Word2Vec Embeddings
Take the preprocessed and tokenized sentences and use Word2Vec to get the word embeddings.  Each sentence is then represented by the matrix of its word embeddings (one row per word), zero-padded to the length of the longest sentence.

In [85]:
# from gensim.models import Word2Vec
#
# corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)
#
# #model = Word2Vec(sentences=corpus, vector_size=(len(processed_string1) + len(processed_string2)), min_count=2)
# # set vector size to reduce computational complexity
# model = Word2Vec(sentences=corpus, min_count=1, window=2, vector_size=50)
# #model.build_vocab(sentences=corpus)
# #model.train(corpus, total_examples=model.corpus_count, epochs=5)
#
# print(model)
# print(model.wv.key_to_index)
#
# model.wv.get_vector('president')
#
# # for index, word in enumerate(model.wv.index_to_key):
# #     if index == 120:
# #         break
# #     print(f"word #{index}/{len(model.wv.index_to_key)} is {word}")

In [86]:
from gensim.models import Word2Vec

# Function is broken out for testing purposes
def generate_word2vec_model(corpus, vector_size=50, window=2, min_count=1):
    """Train a Word2Vec model on a tokenized corpus.

    Args:
        corpus: iterable of sentences, each a list of string tokens.
        vector_size: dimensionality of the word vectors (default 50).
        window: context window size passed to Word2Vec (default 2).
        min_count: minimum token frequency; the default of 1 keeps every
            token in the vocabulary.

    Returns:
        The trained ``gensim.models.Word2Vec`` model.
    """
    return Word2Vec(sentences=corpus, min_count=min_count, window=window, vector_size=vector_size)


In [87]:
# Function is broken out for testing purposes
def sentence_embeddings(w2v_model, sentence, size):
    """Build the zero-padded embedding matrix of one tokenized sentence.

    Args:
        w2v_model: model whose ``wv.get_vector(word)`` returns a word vector.
        sentence: sequence of tokens.
        size: shape of the returned array; row ``i`` receives the vector of
            the ``i``-th token, remaining rows stay zero.

    Returns:
        ``numpy`` array of shape ``size``.
    """
    lookup = w2v_model.wv.get_vector
    padded = np.zeros(size)
    for position, token in enumerate(sentence):
        padded[position] = lookup(token)

    return padded
    # list = []
    # for word in sentence:
    #     list.append(w2v_model.wv.get_vector(word))
    #
    # word_matrix = np.row_stack(list)
    # #return np.mean(word_matrix, axis=0)
    # return word_matrix

In [88]:
def test_word2vec():
    """Smoke test: embed one sentence from a ten-sentence corpus and check the shape."""
    df = read_file('data/msr_paraphrase_train.txt')

    tokenize = gensim.utils.simple_preprocess
    corpus = pd.concat(
        [df.String1[:5].apply(tokenize), df.String2[:5].apply(tokenize)],
        ignore_index=True,
    )

    max_sentence_len = corpus.apply(len).max()
    expected_shape = (max_sentence_len, 50)

    model = generate_word2vec_model(corpus)

    embedding = sentence_embeddings(model, corpus[0], expected_shape)
    assert embedding.shape == expected_shape

    # Not sure if this will always generate the same embedding
    # test_embedding = np.array([-0.00113049, -0.00124808,  0.00252251,  0.00058141,  0.00187964,  0.00379025,
    #           -0.00012356,  0.00347055, -0.00241507,  0.00545258, -0.00574078, -0.00489824,
    #           -0.00224492,  0.00744946,  0.00350835, -0.00139295, -0.00081134,  0.00655962,
    #           0.00244374, -0.00447209, -0.00124291, -0.00092616,  0.0021044,  -0.00092541,
    #           0.00284307,  0.00367638,  0.00364716,  0.00519976, -0.00088121,  0.00109841,
    #           -0.00219322, -0.00372483,  0.00078702, -0.00612309, -0.00312131,  0.00088071,
    #           0.00503909, -0.0009484,  -0.00068209, -0.0004782,   0.00367015,  0.00314679,
    #           -0.00302592,  0.00346377,  0.00151145, -0.00076442, -0.0012528,  -0.00087095,
    #           -0.00075365,  0.00468711])
    #
    #
    # # not sure if this will always be equal based on comment on test_embedding variable
    # assert np.allclose(embedding, test_embedding)

test_word2vec()

In [89]:
def corpus_embeddings(model, corpus, max_sentence_len):
    """Embed every sentence of a corpus into one zero-padded 3-D array.

    Args:
        model: trained Word2Vec model; its ``wv.vector_size`` gives the
            embedding width.
        corpus: iterable (with ``len``) of tokenized sentences.
        max_sentence_len: number of rows reserved per sentence.

    Returns:
        ``numpy`` array of shape
        ``(len(corpus), max_sentence_len, model.wv.vector_size)``.
    """
    # Take the width from the model instead of assuming 50
    vector_size = model.wv.vector_size
    embedding_matrix = np.zeros((len(corpus), max_sentence_len, vector_size))
    for i, sentence in enumerate(corpus):
        embedding_matrix[i] = sentence_embeddings(model, sentence, size=(max_sentence_len, vector_size))

    return embedding_matrix

In [90]:
from torch.utils.data import DataLoader, Dataset

# Dataset for the MSPC dataset
class MSPCDataset(Dataset):
    """Torch dataset of sentence-pair embeddings built from an MSPC tsv file.

    Construction preprocesses the file with ``preprocess_corpus`` (a no-op if
    the ``*_stanza`` file already exists), tokenizes both sentence columns,
    trains a new Word2Vec model on the sentences of this file and stores the
    zero-padded embeddings of both columns.

    Arguments:
        tsv_file (string): path to the tsv file with sentences to compare and associate quality score
        num_records (int): number of records to load.  Defaults to None which is all

    Each item is ``(embedding_of_String1, embedding_of_String2, quality)``.
    """
    def __init__(self, tsv_file, num_records=None):

        file_parts = os.path.splitext(tsv_file)
        output_file = f"{file_parts[0]}_stanza{file_parts[1]}"
        print("About to preprocess data")
        preprocess_corpus(input_file=tsv_file, output_file=output_file)
        print("Done preprocessing data")

        df = pd.read_csv(output_file, sep="\t")
        if num_records is not None:
            df = df[:num_records]

        self.quality = df.Quality

        # An empty trunk is read back from the tsv as NaN; simple_preprocess
        # only accepts text, so turn missing values into empty strings first.
        processed_string1 = df.String1.fillna('').astype(str).apply(gensim.utils.simple_preprocess)
        processed_string2 = df.String2.fillna('').astype(str).apply(gensim.utils.simple_preprocess)

        corpus = pd.concat([processed_string1, processed_string2], ignore_index=True)

        # Longest tokenized sentence decides the padded length of every item
        max_sentence_len = corpus.apply(len).max()

        w2v_model = generate_word2vec_model(corpus)

        sentence_embeddings1 = corpus_embeddings(w2v_model, processed_string1, max_sentence_len=max_sentence_len)
        sentence_embeddings2 = corpus_embeddings(w2v_model, processed_string2, max_sentence_len=max_sentence_len)

        self.w2v_model = w2v_model
        self.sentences_embeddings1 = sentence_embeddings1
        self.sentences_embeddings2 = sentence_embeddings2

        print(f"Number of sentences processed in the String1 column: {len(processed_string1)}")
        print(f"Number of sentences processed in the String2 column: {len(processed_string2)}")

    def __len__(self):
        """Number of sentence pairs in the dataset."""
        return len(self.sentences_embeddings1)

    def __getitem__(self, i):
        """Return the i-th pair of embeddings together with its quality label."""
        # Positional lookup, so it stays in step with the embedding arrays
        return self.sentences_embeddings1[i], self.sentences_embeddings2[i], self.quality.iloc[i]

In [91]:
def test_dataset():
    """Smoke test: loading ten records yields a dataset of length ten."""
    expected_records = 10
    dataset = MSPCDataset('data/msr_paraphrase_train.txt', expected_records)
    assert len(dataset) == expected_records

test_dataset()

About to preprocess data
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing data
Number of sentences processed in the String1 column: 10
Number of sentences processed in the String2 column: 10


## Dataloaders
Create training and test dataloaders

In [92]:
# Build one dataset per corpus split; each MSPCDataset trains its own Word2Vec model.
# NOTE(review): `test_dataset` below rebinds the name of the test_dataset()
# function defined in an earlier cell - consider renaming one of them.
train_dataset = MSPCDataset('data/msr_paraphrase_train.txt')
test_dataset = MSPCDataset('data/msr_paraphrase_test.txt')

# Batches of 64 sentence pairs, reshuffled on every pass over the data
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)

About to preprocess data
data/msr_paraphrase_train_stanza.txt already exists
Done preprocessing data
Number of sentences processed in the String1 column: 4076
Number of sentences processed in the String2 column: 4076
About to preprocess data
data/msr_paraphrase_test_stanza.txt already exists
Done preprocessing data
Number of sentences processed in the String1 column: 1725
Number of sentences processed in the String2 column: 1725


In [104]:
# Drops the all-zero (padding) rows of the first sentence's embedding matrix,
# leaving one row per actual word in the sentence
train_dataset[0][0][~np.all(train_dataset[0][0] == 0, axis=1)]

array([[-1.77404061e-02,  3.47889327e-02, -1.09778875e-02,
         1.27714574e-02,  1.10109346e-02, -4.84661795e-02,
         2.93790512e-02,  5.26951738e-02, -2.61720475e-02,
         5.47409151e-03,  2.55659539e-02, -2.58834083e-02,
         6.07441831e-03,  7.61951879e-03, -1.08305123e-02,
         1.58339683e-02,  3.09274253e-02,  7.77853746e-03,
        -6.11838661e-02, -8.98842793e-03, -6.94689946e-03,
         2.58505139e-02,  6.81013986e-02, -1.13576297e-02,
         2.46121306e-02,  1.10584153e-02,  5.27332630e-03,
        -8.68066400e-03, -5.39070740e-02, -1.27882352e-02,
         1.68112516e-02, -5.95621206e-03, -3.49961519e-02,
        -4.92039928e-03, -2.73613464e-02,  2.04547103e-02,
         1.47048524e-02,  2.34515313e-02,  3.96893956e-02,
        -4.59628627e-02,  1.71677582e-02,  5.12278266e-03,
        -2.52861921e-02, -4.77366149e-03,  6.09214492e-02,
         2.46468913e-02, -4.51211771e-03, -4.14408594e-02,
         8.22822750e-03,  3.58437784e-02],
       [-3.41

In [28]:
def conv_output_volume(W, F, S, P):
    """Compute the output size of a convolution along one dimension.

    Uses the standard formula ``floor((W - F + 2P) / S) + 1``
    (see https://cs231n.github.io/convolutional-networks/).

    Args:
        W: input volume size.
        F: kernel/filter size.
        S: stride.
        P: amount of zero padding used on each border.

    Returns:
        The output volume size as an integer.
    """
    # Integer floor division keeps the result exact for arbitrarily large ints
    return (W - F + 2 * P) // S + 1

In [29]:
print(conv_output_volume(50, 3, 1, 1))

50


## Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determined by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [92]:
# #https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8dc
# """
# TODO: Remove hardcoded values and max it dynamic.
# """
# class DynamicKMaxPooling(nn.Module):
#     def __init__(self, k_init, conv_layers, layer):
#         super().__init__()
#         # "L is the total  number  of  convolutional  layers
#         # in  the  network;
#         # ktop is the fixed pooling parameter for the
#         # topmost  convolutional  layer"
#         self.k_init = k_init
#         self.conv_layers = conv_layers
#         self.layer = layer
#
#     def forward(self, X):
#         s = 50
#         dyn_k = ((self.conv_layers - self.layer) / self.conv_layers) * 3
#         k_max = int(round(max(self.k_init, np.ceil(dyn_k))))
#         print(k_max)
#         out = F.max_pool1d(X, kernel_size=k_max)
#         return out

## Sentence Similarity Convolution Network (SSCN)

### Pooling Layers

The original paper used a dynamic k-max pooling method in their model. The _k_ value is determined by equation (1).

\begin{equation*} k=\max \left({k_{top},\left \lceil{ \frac {L-l}{L} \left |{ s }\right | }\right \rceil }\right)\end{equation*}

__Dynamic K-Max Pooling Implementation:__

In [167]:
#https://gist.github.com/anna-hope/7a2b2e66c3645aa8e4f94dbf06aed8dc
"""
TODO: Remove hardcoded values and max it dynamic.
"""
class DynamicKMaxPooling(nn.Module):
    """Pooling layer whose window size follows the dynamic-k formula.

    For every layer index ``l`` from ``conv_layers`` down to 1 the input is
    max-pooled along its last dimension with a window of
    ``k = max(k_init, ceil((L - l) / L * s))``, where ``L`` is ``conv_layers``
    and ``s`` the current sequence length. Note that this applies
    ``F.max_pool1d`` with window ``k``; it does not select the top-k values.

    Args:
        k_init: fixed pooling parameter for the topmost convolutional layer
            (``k_top`` in the formula).
        conv_layers: total number of convolutional layers in the network (``L``).
    """
    def __init__(self, k_init, conv_layers):
        super().__init__()
        self.k_init = k_init
        self.conv_layers = conv_layers

    def pool(self, X, l):
        """Max-pool ``X`` (batch, channels, length) for layer index ``l``."""
        # s is sequence length
        # l is current layer in network
        s = X.shape[2]
        dyn_k = ((self.conv_layers - l) / self.conv_layers) * s
        k_max = int(round(max(self.k_init, np.ceil(dyn_k))))
        # The window may not exceed what is left of the sequence, otherwise
        # max_pool1d raises because the output would be empty.
        k_max = max(1, min(k_max, s))
        return F.max_pool1d(X, kernel_size=k_max)

    def forward(self, X):
        """Apply the pooling once per layer, from the top layer down to 1."""
        for layer_i in range(self.conv_layers, 0, -1):
            X = self.pool(X, layer_i)

        return X

Testing Dynamic K-Max Pooling Layer

In [168]:
NUM_OF_SAMPLES, SAMPLE_SIZE, OUTPUT_SIZE = 20, 2, 15

test_embedding = torch.rand(NUM_OF_SAMPLES, SAMPLE_SIZE, OUTPUT_SIZE)
dyn_k_layer = DynamicKMaxPooling(k_init=3, conv_layers=SAMPLE_SIZE)

# forward() pools once per layer index, i.e. for layers [2, 1]
out = dyn_k_layer(test_embedding)

# The sequence axis is pooled down to 1; batch and channel axes are unchanged
for axis, expected in ((2, 1), (1, SAMPLE_SIZE), (0, NUM_OF_SAMPLES)):
    assert out.shape[axis] == expected

### Sentence Similarity

 \begin{align*} Man(\vec V_{x}, \vec V_{y})=&\left |{ x_{1}-y_{1} }\right |\! +\! \left |{ x_{2}-y_{2} }\right | \!+ \!\ldots \!+ \!\left |{ x_{n}-y_{n} }\right |
 \\ score=&e^{-Man(\vec V_{x}, \vec V_{y})},\quad score\in [{0,1}] \end{align*}

In [169]:
def manhattan_similarity_score(X):
    """Score the similarity of sentence pairs as exp(-Manhattan distance).

    Args:
        X: 3-D tensor of shape (batch, sample_size, M). Entries ``X[:, 0]``
            and ``X[:, 1]`` are the two vectors compared for each batch item;
            for the purpose of this experiment sample_size = 2.

    Returns:
        Tensor of shape (batch, 1) with scores in (0, 1]; 1 means the two
        vectors are identical.
    """
    sample_count, _, M = X.shape
    Vx = X[:, 0].reshape((sample_count, M))
    Vy = X[:, 1].reshape((sample_count, M))
    # keepdim gives the (batch, 1) shape directly and also works for an empty batch
    mdist = torch.sum(torch.abs(Vx - Vy), dim=1, keepdim=True)
    return torch.exp(-mdist)

_Testing Similarity Scoring Function_

In [170]:
# Dimensions of the random input: 20 batch items, 2 vectors each, 6 features per vector
NUM_OF_SAMPLES = 20
SAMPLE_SIZE = 2
OUTPUT_SIZE = 6

# Random values only: this cell checks the output shape, not the score values
test_embedding = torch.rand((NUM_OF_SAMPLES, SAMPLE_SIZE, OUTPUT_SIZE))
scores = manhattan_similarity_score(test_embedding)

# One score per batch item
assert scores.shape == (NUM_OF_SAMPLES, 1)


In [171]:
class SSCN(nn.Module):
    """Sentence Similarity Convolution Network.

    Two Conv1d + ReLU stages followed by ``DynamicKMaxPooling``; the pooled
    output is turned into a similarity score with
    ``manhattan_similarity_score``.

    Args:
        sample_size: number of input (and output) channels of both
            convolutions; for this experiment 2, one channel per sentence.
        stride: stride of both convolutions.
        kernel_size: kernel size of both convolutions; also used as the
            ``k_init`` of the pooling layer.
        padding: zero padding of both convolutions.
    """
    def __init__(self, sample_size, stride=1, kernel_size=3, padding=1):
        super().__init__()
        self.stride = stride
        self.kernel_size = kernel_size
        self.padding = padding
        # NOTE(review): this value is used both as the channel count of the
        # convolutions and as the layer count of the pooling; the two only
        # coincide for sample_size == 2 - confirm this is intended.
        self.conv_layers = sample_size

        #NN layers
        # stride is passed through explicitly; it used to be stored but ignored
        self.conv1 = nn.Conv1d(in_channels=self.conv_layers, out_channels=self.conv_layers,
                               kernel_size=self.kernel_size, stride=self.stride, padding=self.padding)
        self.relu1 = nn.ReLU()
        self.conv2 = nn.Conv1d(in_channels=self.conv_layers, out_channels=self.conv_layers,
                               kernel_size=self.kernel_size, stride=self.stride, padding=self.padding)
        self.relu2 = nn.ReLU()
        self.pool1 = DynamicKMaxPooling(self.kernel_size, self.conv_layers)

        self.sscn = nn.Sequential(self.conv1, self.relu1, self.conv2, self.relu2, self.pool1)

    def manhattan_similarity_score(self, X):
        """Delegate to the module-level ``manhattan_similarity_score``.

        X is the pooled output of the network, shape (batch, sample_size, M).
        """
        score = manhattan_similarity_score(X)
        return score

    def forward(self, X):
        """Run conv -> relu -> conv -> relu -> pool and return the similarity scores."""
        X = self.manhattan_similarity_score(self.sscn(X))
        return X

__Testing:__

In [172]:
NUM_OF_SAMPLES, SAMPLE_SIZE, UNIQUE_FEATURES = 20, 2, 18

test_embedding = torch.rand(NUM_OF_SAMPLES, SAMPLE_SIZE, UNIQUE_FEATURES)

model = SSCN(sample_size=SAMPLE_SIZE)
# shape (batch,sample,sentence,word)?
print(model)

out = model(test_embedding)

# One similarity score per batch item
for axis, expected in ((0, NUM_OF_SAMPLES), (1, 1)):
    assert out.shape[axis] == expected
print(out.shape)


SSCN(
  (conv1): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu1): ReLU()
  (conv2): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu2): ReLU()
  (pool1): DynamicKMaxPooling()
  (sscn): Sequential(
    (0): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
    (1): ReLU()
    (2): Conv1d(2, 2, kernel_size=(3,), stride=(1,), padding=(1,))
    (3): ReLU()
    (4): DynamicKMaxPooling()
  )
)
torch.Size([20, 1])
