In [1]:
import gensim.downloader as api
import numpy as np

from gensim.models import KeyedVectors
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

from math import *
from scipy.spatial.distance import cdist
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from keras.utils import to_categorical

model = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)

Using TensorFlow backend.


In [2]:
with open("../data/train.data", "r") as f:
    training_data = f.readlines()

In [3]:
with open("../data/test.data", "r") as f:
    test_data = f.readlines()

In [4]:
stop_words = set(stopwords.words('english')) 
  
def get_sent_vec(line):
    word_tokens = word_tokenize(line) 

    filtered_sentence = [w for w in word_tokens if not w in stop_words] 

    vector = [0] * 300
    count = 0
    
    for word in filtered_sentence:
        try:
            vector += model[word]
            count += 1
        except:
            pass
        
    vector = vector / count
    
    return vector

In [5]:
def get_diff(l1, l2):
    diff = []
    distances = ["euclidean", "minkowski", "cityblock", "cosine", "jaccard", "correlation", "chebyshev", "canberra", "braycurtis", "kulsinski", "sokalsneath"]
    for x in distances:
        try:
            diff.append(cdist(l1.reshape(1,300), l2.reshape(1,300), x))
        except Exception as E:
            print(E)

    return diff

In [6]:
def make_x_y(data, test=False):
    X = []
    Y = []

    for i in data:
        try:
            lines = i.split("\t")

            l1 = lines[2]
            l2 = lines[3]
            l1_vec = get_sent_vec(l1)
            l2_vec = get_sent_vec(l2)

            wm = model.wmdistance(l1.split(), l2.split())
            diff_vector = get_diff(l1_vec, l2_vec)
            diff_vector.append(wm)
            if test:
                label = int(lines[4])
            else:
                label = int(lines[4][1])
            X.append(diff_vector)
            Y.append(label)

        except Exception as E:
            print(E)

    return np.array(X), np.array(Y)


In [7]:
X_train, Y_train = make_x_y(training_data)

unsupported operand type(s) for /: 'list' and 'int'


In [8]:
test_data[0]

'51\t8 Mile\tAll the home alones watching 8 mile\t8 mile is on thats my movie\t3\tAll/O/DT/B-NP/O the/O/DT/I-NP/O home/O/NN/I-NP/O alones/O/VBZ/B-VP/O watching/O/VBG/I-VP/B-EVENT 8/O/CD/B-NP/O mile/O/NN/I-NP/O\t8/O/NN/B-NP/O mile/O/NN/I-NP/O is/O/VBZ/B-VP/O on/O/IN/B-PP/O thats/O/NNS/B-NP/O my/O/PRP$/B-NP/O movie/O/NN/I-NP/B-EVENT\n'

In [9]:
X_test, Y_test = make_x_y(test_data, True)

In [10]:
print(X_train.shape)
print(Y_train.shape)

(13062, 12)
(13062,)


In [11]:
print(X_test.shape)
print(Y_test.shape)

(972, 12)
(972,)


In [12]:
lr = LinearRegression(normalize=True)

lr.fit(X_train, Y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=True)

In [13]:
p = lr.predict(X_test)

In [14]:
mean_absolute_error(Y_test, p)

1.0719194971727128

In [15]:
dtr = DecisionTreeRegressor()

In [16]:
dtr.fit(X_train, Y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [17]:
p = dtr.predict(X_test)

In [18]:
mean_absolute_error(p, Y_test)

1.5651397011046133

In [19]:
clf = RandomForestClassifier(n_estimators=200)

In [20]:
clf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [21]:
pred = clf.predict(X_test)

In [22]:
accuracy_score(Y_test, pred)

0.20987654320987653

In [23]:
from keras.models import Sequential
from keras.layers import Dense
import numpy
# fix random seed for reproducibility
numpy.random.seed(7)

In [24]:
Y_train_cat = to_categorical(Y_train)

In [25]:
# create model
NNmodel = Sequential()
NNmodel.add(Dense(12, input_dim=12, activation='relu'))
NNmodel.add(Dense(8, activation='relu'))
NNmodel.add(Dense(6, activation='sigmoid'))

In [26]:
# Compile model
NNmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
print(X_train.shape)
print(Y_train.shape)

(13062, 12)
(13062,)


In [28]:
print(X_test.shape)
print(Y_test.shape)

(972, 12)
(972,)


In [29]:
# Fit the model
NNmodel.fit(X_train, Y_train_cat, epochs=20, batch_size=10)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f01f44a4cf8>

In [30]:
def make_x_y_2(data, test=False):
    x1 = []
    x2 = []
    Y = []

    for i in data:
        try:
            lines = i.split("\t")
            
            l1 = lines[2]
            l2 = lines[3]
            l1s = l1.split()
            l2s = l2.split()
            lol.update(l1s)
            lol.update(l2s)
            if test:
                l = int(lines[4])
            else:
                l = int(lines[4][1])
            label = 1
            if l < 2.5:
                label = 0
            x1.append(l1s)
            x2.append(l2s)
            Y.append(label)

        except Exception as E:
            print(E)

    return x1, x2, np.array(Y)

In [31]:
lol = set([])

In [32]:
texts_1, texts_2, labels = make_x_y_2(training_data)

In [33]:
test_texts_1, test_texts_2, test_labels = make_x_y_2(training_data)

In [35]:
nb_words = len(lol)

In [36]:
MAX_SEQUENCE_LENGTH = 30
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True

In [37]:
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for i, word in enumerate(lol):
    if word in model.vocab:
        embedding_matrix[i] = model.word_vec(word)

In [38]:
embedding_matrix.shape

(11708, 300)

In [39]:

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint


In [46]:

tokenizer = Tokenizer(num_words=nb_words)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)


Found 8841 unique tokens
Shape of data tensor: (13063, 30)
Shape of label tensor: (13063,)


In [47]:
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)


In [48]:
LSTMmodel = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
LSTMmodel.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])

In [49]:

data_1_train = np.vstack((data_1, data_2))
data_2_train = np.vstack((data_2, data_1))
labels_train = np.concatenate((labels, labels))


In [50]:
hist = LSTMmodel.fit([data_1_train, data_2_train], labels_train, \
        epochs=10, batch_size=10, shuffle=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [68]:
data_1_test = np.vstack((test_data_1, test_data_2))
data_2_test = np.vstack((test_data_2, test_data_1))


LSTMpreds = LSTMmodel.predict([test_data_1, test_data_2], batch_size=10, verbose=1)
LSTMpreds += LSTMmodel.predict([test_data_2, test_data_1], batch_size=10, verbose=1)
LSTMpreds /= 2





In [72]:
LSTMpreds

array([[0.9558617 ],
       [0.94754326],
       [0.97819096],
       ...,
       [0.04443993],
       [0.0987117 ],
       [0.0401309 ]], dtype=float32)

In [87]:
LSTMpreds = numpy.rint(LSTMpreds.ravel())

array([1., 1., 1., ..., 0., 0., 0.], dtype=float32)

In [84]:
test_labels.shape

(13063,)

In [88]:
accuracy_score(test_labels, LSTMpreds)

0.9203858225522468

In [89]:
print(f1_score(test_labels, LSTMpreds, average="macro"))
print(precision_score(test_labels, LSTMpreds, average="macro"))
print(recall_score(test_labels, LSTMpreds, average="macro"))    

0.9048460656681441
0.9114509431115994
0.898981588845932
