In [1]:
# gensim modules
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
# numpy
import numpy
# random
from random import shuffle
# classifier
from sklearn import linear_model 


In [2]:
import os

# All later cells use paths relative to the corpus directory ('data/...',
# './imdb_...d2v'). The default below is the original, machine-specific
# location; set the IMDB_DATA_DIR environment variable to run elsewhere.
os.chdir(os.environ.get('IMDB_DATA_DIR', '/home/owner/デスクトップ/PythonFile/imdb'))
# Last expression of the cell: show the directory now in effect.
os.getcwd()

In [3]:
# Number of Doc2Vec training epochs; also embedded in the saved model's file name.
numEpochs = 150

In [4]:
class LabeledLineSentence(object):
    """Tagged documents read from line-per-document text files.

    ``sources`` maps a file path to a tag prefix. Each line of a file yields
    one ``LabeledSentence`` whose words are the whitespace-split line and whose
    single tag is ``prefix + '_' + <line number>`` (line numbers start at 0).
    """

    def __init__(self, sources):
        """Store ``sources`` and verify that no prefix is used twice.

        Args:
            sources: dict mapping file path -> tag prefix.

        Raises:
            ValueError: if two sources share the same prefix, since the tags
                built from it would collide.
        """
        self.sources = sources

        # make sure that prefixes are unique
        seen_prefixes = set()
        for prefix in sources.values():
            if prefix in seen_prefixes:
                raise ValueError('Non-unique prefix encountered')
            seen_prefixes.add(prefix)

    def __iter__(self):
        """Yield one ``LabeledSentence`` per line, file by file."""
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        """Read every source into a list, cache it on ``self.sentences`` and return it."""
        # Reuse __iter__ so streaming and materialised access cannot drift apart.
        self.sentences = list(self)
        return self.sentences

    def sentences_perm(self):
        """Shuffle the cached sentences in place and return that same list.

        The sources are loaded first if ``to_array()`` has not been called yet,
        instead of failing with an AttributeError.
        """
        if not hasattr(self, 'sentences'):
            self.to_array()
        shuffle(self.sentences)
        return self.sentences

In [5]:
# File path -> tag prefix for the four corpus files.
# (An earlier layout used 'tr_data/tr_*.txt' and 'te_data/te_*.txt' with the same prefixes.)
sources = {
    'data/tr_pos.txt': 'TRAIN_POS_',
    'data/tr_neg.txt': 'TRAIN_NEG_',
    'data/te_pos.txt': 'TEST_POS_',
    'data/te_neg.txt': 'TEST_NEG_',
}
sentences = LabeledLineSentence(sources)

In [None]:
# Configure the Doc2Vec model, then build its vocabulary from every tagged document.
model = Doc2Vec(
    min_count=1,
    window=10,
    size=100,
    sample=1e-4,
    negative=5,
    workers=8,
)
model.build_vocab(sentences.to_array())



In [None]:
# Train on a shuffled copy of the corpus order for numEpochs epochs.
model.train(
    sentences.sentences_perm(),
    total_examples=model.corpus_count,
    epochs=numEpochs,
)

In [None]:
# Persist the trained model; the file name records the epoch count.
model.save(
    './imdb_' + str(numEpochs) + 'Epochs.d2v'
)

In [None]:
# Reload the saved model (lets the notebook continue without retraining).
model = Doc2Vec.load(
    './imdb_' + str(numEpochs) + 'Epochs.d2v'
)

In [None]:
# Sanity check: display the model's corpus_count
# (the value passed to train() as total_examples).
model.corpus_count

In [None]:
# Qualitative check of the word vectors: nearest neighbours of 'eat'.
model.wv.most_similar('eat')

In [None]:
# Training matrix: rows 0..12499 hold the positive documents (label 1),
# rows 12500..24999 the negative ones (label 0). Each row is a 100-d vector.
n_train_per_class = 12500
train_arrays = numpy.zeros((2 * n_train_per_class, 100))
train_labels = numpy.zeros(2 * n_train_per_class)
train_labels[:n_train_per_class] = 1
for doc_no in range(n_train_per_class):
    # Tags carry a double underscore: prefix 'TRAIN_POS_' + '_<line number>'.
    train_arrays[doc_no] = model['TRAIN_POS__' + str(doc_no)]
    train_arrays[n_train_per_class + doc_no] = model['TRAIN_NEG__' + str(doc_no)]

In [None]:
# Test matrix, laid out like the training one: positives first (label 1),
# then negatives (label 0), one 100-d document vector per row.
n_test_per_class = 12500
test_arrays = numpy.zeros((2 * n_test_per_class, 100))
test_labels = numpy.zeros(2 * n_test_per_class)
test_labels[:n_test_per_class] = 1
for doc_no in range(n_test_per_class):
    # Tags carry a double underscore: prefix 'TEST_POS_' + '_<line number>'.
    test_arrays[doc_no] = model['TEST_POS__' + str(doc_no)]
    test_arrays[n_test_per_class + doc_no] = model['TEST_NEG__' + str(doc_no)]

In [None]:
# Quick look at the label vector (1 for the first half, 0 for the second).
print (train_labels)

In [None]:
# Logistic regression baseline on the document vectors; the cell shows its test score.
classifier_logistic = linear_model.LogisticRegression().fit(train_arrays, train_labels)
classifier_logistic.score(test_arrays, test_labels)

In [None]:
# Lasso fitted on the 0/1 labels.
# NOTE(review): Lasso is a regressor, so .score() is presumably R^2 rather than
# classification accuracy — confirm before comparing with the classifiers.
classifier_lasso = linear_model.Lasso().fit(train_arrays, train_labels)
classifier_lasso.score(test_arrays, test_labels)

In [None]:
# Ordinary least squares fitted on the 0/1 labels.
# NOTE(review): LinearRegression is a regressor, so .score() is presumably R^2
# rather than classification accuracy — confirm.
classifier_OLS = linear_model.LinearRegression().fit(train_arrays, train_labels)
classifier_OLS.score(test_arrays, test_labels)

In [None]:
# Ridge regression fitted on the 0/1 labels.
# NOTE(review): Ridge is a regressor, so .score() is presumably R^2 rather than
# classification accuracy — confirm.
classifier_ridge = linear_model.Ridge().fit(train_arrays, train_labels)
classifier_ridge.score(test_arrays, test_labels)

In [None]:
# Elastic net fitted on the 0/1 labels.
# NOTE(review): ElasticNet is a regressor, so .score() is presumably R^2 rather
# than classification accuracy — confirm.
classifier_elastic = linear_model.ElasticNet().fit(train_arrays, train_labels)
classifier_elastic.score(test_arrays, test_labels)

In [None]:
# Least-angle regression fitted on the 0/1 labels.
# NOTE(review): Lars is a regressor, so .score() is presumably R^2 rather than
# classification accuracy — confirm.
classifier_LARS = linear_model.Lars().fit(train_arrays, train_labels)
classifier_LARS.score(test_arrays, test_labels)

In [None]:
# Orthogonal matching pursuit allowed up to 100 non-zero coefficients
# (the same number as the vector dimensions used above).
# NOTE(review): this is a regressor, so .score() is presumably R^2 rather than
# classification accuracy — confirm.
classifier_OMP = linear_model.OrthogonalMatchingPursuit(
    n_nonzero_coefs=100
).fit(train_arrays, train_labels)
classifier_OMP.score(test_arrays, test_labels)

In [None]:
# Lasso solved with LARS, alpha = 0.1, fitted on the 0/1 labels.
# NOTE(review): LassoLars is a regressor, so .score() is presumably R^2 rather
# than classification accuracy — confirm.
classifier_lassoLARS = linear_model.LassoLars(alpha=.1).fit(train_arrays, train_labels)
classifier_lassoLARS.score(test_arrays, test_labels)

In [None]:
# Bayesian ridge regression fitted on the 0/1 labels.
# NOTE(review): BayesianRidge is a regressor, so .score() is presumably R^2
# rather than classification accuracy — confirm.
classifier_BayesianRidge = linear_model.BayesianRidge().fit(train_arrays, train_labels)
classifier_BayesianRidge.score(test_arrays, test_labels)

In [None]:
# Try every listed SGDClassifier loss and report the one with the highest test score.
# NOTE(review): the accepted loss names ('log', 'squared_loss', ...) depend on the
# installed scikit-learn version — confirm against the version in use.
# NOTE(review): the winner is picked using the test-set score itself, so the
# reported value is optimistic as a generalisation estimate.
losses = [ 'hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron', 'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive']
best = 0
# Bound up front so the final print cannot raise NameError when no score exceeds 0.
best_loss = None
for loss in losses:
    classifier_SGD = linear_model.SGDClassifier(loss=loss)
    classifier_SGD.fit(train_arrays, train_labels)
    score = classifier_SGD.score(test_arrays, test_labels)
    if score > best:
        best = score
        best_loss = loss

print(best_loss, best)

In [None]:
# Perceptron with default settings; the cell shows its test score.
classifier_Perceptron = linear_model.Perceptron().fit(train_arrays, train_labels)
classifier_Perceptron.score(test_arrays, test_labels)

In [None]:
# Passive-aggressive classifier, once per supported loss; print each test score.
classifier_PAC = linear_model.PassiveAggressiveClassifier(
    loss='hinge'
).fit(train_arrays, train_labels)
print(classifier_PAC.score(test_arrays, test_labels))

classifier_PAC2 = linear_model.PassiveAggressiveClassifier(
    loss='squared_hinge'
).fit(train_arrays, train_labels)
print(classifier_PAC2.score(test_arrays, test_labels))

In [None]:
from keras.layers import Input, merge
from keras.models import Model, Sequential
from keras.layers import Dense, Activation, Dropout
from keras.layers import LSTM
import keras.backend as K
from keras.layers import Lambda, regularizers, Average
from keras.layers import Input, Conv2D, Conv1D, MaxPooling2D, GlobalMaxPooling2D, GlobalMaxPooling1D, MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import GRU
from keras.layers.wrappers import Bidirectional, TimeDistributed
from keras.layers.core import Dropout, Dense, Lambda, Masking
from keras.layers import merge, Layer, Activation, Dot, Concatenate, Flatten, Lambda
from keras.initializers import Identity,glorot_normal
from keras import regularizers
from keras import metrics
from keras.utils import plot_model
import keras as keras

In [None]:
# Number of training documents and length of each document vector,
# read off the 2-D training matrix in one step.
numOfDocs, sizeOfVectors = train_arrays.shape

## ANN

In [None]:
# Small fully connected network: one hidden Dense layer twice as wide as the
# input (no activation specified), dropout, then a sigmoid output unit.
classifier_simple_NN = Sequential([
    Dense(sizeOfVectors*2, input_dim=sizeOfVectors, kernel_regularizer=regularizers.l2(0.01)),
    Dropout(0.5),
    Dense(1, activation='sigmoid', kernel_regularizer=regularizers.l2(0.01)),
])

classifier_simple_NN.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(lr=0.0001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    metrics=['accuracy'],
)
# NOTE(review): the test set doubles as validation data during training.
classifier_simple_NN.fit(
    train_arrays,
    train_labels,
    batch_size=20,
    epochs=80,
    validation_data=(test_arrays, test_labels),
    shuffle=True,
)
score = classifier_simple_NN.evaluate(test_arrays, test_labels)

print(score)


## CNN1D 3 layers

In [None]:
# 1-D CNN over the document vector: a Lambda layer adds an axis so Conv1D can
# slide along the vector, followed by three Conv1D+Dropout stages and a dense head.
classifier_CNN = Sequential([
    Lambda(lambda x: K.expand_dims(x), input_shape=(sizeOfVectors,)),
    Conv1D(10, kernel_size = 30, padding='valid', input_dim=sizeOfVectors, kernel_regularizer=regularizers.l2(1e-5)),
    Dropout(0.5),
    Conv1D(10, kernel_size = 30, padding='valid', input_dim=sizeOfVectors, kernel_regularizer=regularizers.l2(1e-5)),
    Dropout(0.5),
    Conv1D(10, kernel_size = 30, padding='valid', input_dim=sizeOfVectors, kernel_regularizer=regularizers.l2(1e-5)),
    Dropout(0.5),
    Flatten(),
    Dense(75, activation='tanh', kernel_regularizer=regularizers.l2(1e-5)),
    Dropout(0.5),
    Dense(100, activation='tanh', kernel_regularizer=regularizers.l2(1e-5)),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

classifier_CNN.compile(
    loss='binary_crossentropy',
    optimizer=keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
    metrics=['accuracy'],
)
# NOTE(review): the test set doubles as validation data during training.
classifier_CNN.fit(
    train_arrays,
    train_labels,
    batch_size=50,
    epochs=10,
    validation_data=(test_arrays, test_labels),
    shuffle=True,
)
score = classifier_CNN.evaluate(test_arrays, test_labels)

print(score)


In [None]:
# Re-evaluate the CNN on the test set and print the result.
score = classifier_CNN.evaluate(
    test_arrays,
    test_labels,
)
print(score)

In [None]:
# Continue training the already-fitted CNN with a larger batch size,
# resuming at epoch index 10 and stopping at epoch 15.
classifier_CNN.fit(
    train_arrays,
    train_labels,
    batch_size=200,
    initial_epoch=10,
    epochs=15,
    validation_data=(test_arrays, test_labels),
    shuffle=True,
)
score = classifier_CNN.evaluate(test_arrays, test_labels)

In [None]:
# Evaluate the CNN again after the additional epochs and print the result.
score = classifier_CNN.evaluate(
    test_arrays,
    test_labels,
)
print(score)

## LSTM 1 layer

In [None]:
# Single-layer LSTM classifier over the document vector.
classifier_LSTM = Sequential()
# Add an axis so the flat vector becomes a sequence the LSTM can consume.
classifier_LSTM.add(Lambda(lambda x: K.expand_dims(x), input_shape=(sizeOfVectors,)))
classifier_LSTM.add(LSTM(10,input_dim=sizeOfVectors,kernel_regularizer=regularizers.l2(1e-5),
                       recurrent_regularizer=regularizers.l2(1e-5),bias_regularizer=regularizers.l2(1e-5),
                       dropout=0.3, recurrent_dropout=0.3))
classifier_LSTM.add(Dropout(0.5))
classifier_LSTM.add(Dense(100, activation='tanh',kernel_regularizer=regularizers.l2(1e-5)))
classifier_LSTM.add(Dropout(0.5))
classifier_LSTM.add(Dense(1, activation='sigmoid'))

classifier_LSTM.compile(loss='binary_crossentropy',
                       optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
# NOTE(review): the test set doubles as validation data during training.
classifier_LSTM.fit(train_arrays, train_labels, batch_size=100, epochs=100, validation_data=(test_arrays,test_labels), shuffle=True)
# Evaluate the model trained above (the original referenced an undefined
# `classifier_RNN`, which raises NameError on a fresh kernel).
score = classifier_LSTM.evaluate(test_arrays, test_labels)

print(score)


In [None]:
# Summarise the single-layer LSTM built in the preceding cell. The original
# referenced `classifier_3LSTM`, which is only defined in a later cell and so
# fails when the notebook is run top to bottom.
classifier_LSTM.summary()

## LSTM 3 layer

In [None]:
# Three LSTM layers chained through Lambda layers: each LSTM returns only its
# final output, which is given a new axis again before the next LSTM.
classifier_3LSTM = Sequential()
classifier_3LSTM.add(Lambda(lambda x: K.expand_dims(x), input_shape=(sizeOfVectors,)))
classifier_3LSTM.add(LSTM(10,input_dim=sizeOfVectors,kernel_regularizer=regularizers.l2(1e-5),
                       recurrent_regularizer=regularizers.l2(1e-5),bias_regularizer=regularizers.l2(1e-5),
                       dropout=0.3, recurrent_dropout=0.3))
classifier_3LSTM.add(Lambda(lambda x: K.expand_dims(x)))
classifier_3LSTM.add(LSTM(5,input_dim=sizeOfVectors,kernel_regularizer=regularizers.l2(1e-5),
                       recurrent_regularizer=regularizers.l2(1e-5),bias_regularizer=regularizers.l2(1e-5),
                       dropout=0.3, recurrent_dropout=0.3))
classifier_3LSTM.add(Lambda(lambda x: K.expand_dims(x)))
classifier_3LSTM.add(LSTM(10,input_dim=sizeOfVectors,kernel_regularizer=regularizers.l2(1e-5),
                       recurrent_regularizer=regularizers.l2(1e-5),bias_regularizer=regularizers.l2(1e-5),
                       dropout=0.3, recurrent_dropout=0.3))
classifier_3LSTM.add(Dropout(0.5))
classifier_3LSTM.add(Dense(100, activation='tanh',kernel_regularizer=regularizers.l2(1e-5)))
classifier_3LSTM.add(Dropout(0.5))
classifier_3LSTM.add(Dense(1, activation='sigmoid'))

classifier_3LSTM.compile(loss='binary_crossentropy',
                       optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
# NOTE(review): the test set doubles as validation data during training.
classifier_3LSTM.fit(train_arrays, train_labels, batch_size=100, epochs=120, validation_data=(test_arrays,test_labels), shuffle=True)
# Evaluate the model trained above (the original referenced an undefined
# `classifier_RNN`, which raises NameError on a fresh kernel).
score = classifier_3LSTM.evaluate(test_arrays, test_labels)

print(score)


## BILSTM 1 layer

In [None]:
# Single bidirectional LSTM layer; forward and backward outputs are concatenated.
classifier_BILSTM = Sequential()
# Add an axis so the flat vector becomes a sequence the LSTM can consume.
classifier_BILSTM.add(Lambda(lambda x: K.expand_dims(x), input_shape=(sizeOfVectors,)))
classifier_BILSTM.add(Bidirectional(LSTM(10,input_dim=sizeOfVectors,kernel_regularizer=regularizers.l2(1e-5),
                       recurrent_regularizer=regularizers.l2(1e-5),bias_regularizer=regularizers.l2(1e-5),
                       dropout=0.3, recurrent_dropout=0.3), merge_mode="concat"))
classifier_BILSTM.add(Dropout(0.5))
classifier_BILSTM.add(Dense(100, activation='tanh',kernel_regularizer=regularizers.l2(1e-5)))
classifier_BILSTM.add(Dropout(0.5))
classifier_BILSTM.add(Dense(1, activation='sigmoid'))

classifier_BILSTM.compile(loss='binary_crossentropy',
                       optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
# NOTE(review): the test set doubles as validation data during training.
classifier_BILSTM.fit(train_arrays, train_labels, batch_size=100, epochs=100, validation_data=(test_arrays,test_labels), shuffle=True)
# Evaluate the model trained above (the original referenced an undefined
# `classifier_RNN`, which raises NameError on a fresh kernel).
score = classifier_BILSTM.evaluate(test_arrays, test_labels)

print(score)


In [None]:
# NOTE(review): this cell repeats the BiLSTM configuration used earlier in the
# notebook and rebinds `classifier_BILSTM` to a freshly initialised model —
# confirm whether the second run is intentional.
classifier_BILSTM = Sequential()
# Add an axis so the flat vector becomes a sequence the LSTM can consume.
classifier_BILSTM.add(Lambda(lambda x: K.expand_dims(x), input_shape=(sizeOfVectors,)))
classifier_BILSTM.add(Bidirectional(LSTM(10,input_dim=sizeOfVectors,kernel_regularizer=regularizers.l2(1e-5),
                       recurrent_regularizer=regularizers.l2(1e-5),bias_regularizer=regularizers.l2(1e-5),
                       dropout=0.3, recurrent_dropout=0.3), merge_mode="concat"))
classifier_BILSTM.add(Dropout(0.5))
classifier_BILSTM.add(Dense(100, activation='tanh',kernel_regularizer=regularizers.l2(1e-5)))
classifier_BILSTM.add(Dropout(0.5))
classifier_BILSTM.add(Dense(1, activation='sigmoid'))

classifier_BILSTM.compile(loss='binary_crossentropy',
                       optimizer = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0),
                      metrics=['accuracy'])
# NOTE(review): the test set doubles as validation data during training.
classifier_BILSTM.fit(train_arrays, train_labels, batch_size=100, epochs=100, validation_data=(test_arrays,test_labels), shuffle=True)
# Evaluate the model trained above (the original referenced an undefined
# `classifier_RNN`, which raises NameError on a fresh kernel).
score = classifier_BILSTM.evaluate(test_arrays, test_labels)

print(score)
