In [303]:
import pandas as pd
df = pd.read_csv('imdb_master.txt', sep=',', engine='python')
df.head()

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,0,test,Once again Mr. Costner has dragged out a movie...,neg,0_2.txt
1,1,test,This is an example of why the majority of acti...,neg,10000_4.txt
2,2,test,"First of all I hate those moronic rappers, who...",neg,10001_1.txt
3,3,test,Not even the Beatles could write songs everyon...,neg,10002_3.txt
4,4,test,Brass pictures (movies is not a fitting word f...,neg,10003_3.txt


In [304]:
import nltk
import re
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords
stoplist = stopwords.words('english')
nltk.download('punkt')
lemmatizer = WordNetLemmatizer() 


toks = []
reviews = []
for r in df['review']:
    tokens = nltk.word_tokenize(r)
    l = []
    for t in tokens:
        tnew = t.lower()
        tnew = re.sub(r'[^\w\s]','',tnew)
        if tnew not in stoplist:
            l.append(lemmatizer.lemmatize(tnew))
    toks.append(l)
    reviews.append(' '.join(l))
df['review'] = reviews
df['tokens'] = toks


df = df.drop(columns=['Unnamed: 0', 'file'])


negs = []
poss = []
unsup = []
ls = []
for l in df['label']:
    if l == 'neg':
        negs.append(1)
        poss.append(0)
        unsup.append(0)
    elif l == 'pos':
        negs.append(0)
        poss.append(1)
        unsup.append(0)
    elif l == 'unsup':
        negs.append(0)
        poss.append(0)
        unsup.append(1)
df['neg'] = negs 
df['pos'] = poss 
df['unsup'] = unsup 


df.head()

[nltk_data] Downloading package punkt to /Users/masha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,type,review,label,tokens,neg,pos,unsup
0,test,mr costner dragged movie far longer necessary ...,neg,"[mr, costner, dragged, movie, far, longer, nec...",1,0,0
1,test,example majority action film generic boring ...,neg,"[example, majority, action, film, , generic, b...",1,0,0
2,test,first hate moronic rapper couldnt act gun pre...,neg,"[first, hate, moronic, rapper, , couldnt, act,...",1,0,0
3,test,even beatles could write song everyone liked ...,neg,"[even, beatles, could, write, song, everyone, ...",1,0,0
4,test,brass picture movie fitting word really some...,neg,"[brass, picture, , movie, fitting, word, , rea...",1,0,0


In [305]:
df_train = df[df.type == 'train']
df_test =  df[df.type == 'test']
df_train = df_train[['review', 'pos', 'tokens', 'neg', 'unsup']]
df_test = df_test[['review', 'pos', 'tokens', 'neg', 'unsup']]

In [306]:
df_train.head()

Unnamed: 0,review,pos,tokens,neg,unsup
25000,story man unnatural feeling pig start opening...,0,"[story, man, unnatural, feeling, pig, , start,...",1,0
25001,airport 77 start brand new luxury 747 plane lo...,0,"[airport, 77, start, brand, new, luxury, 747, ...",1,0
25002,film lacked something could nt put finger firs...,0,"[film, lacked, something, could, nt, put, fing...",1,0
25003,sorry everyone know supposed art film w...,0,"[sorry, everyone, , , , know, supposed, , art,...",1,0
25004,little parent took along theater see interior ...,0,"[little, parent, took, along, theater, see, in...",1,0


In [307]:
df_test.head()

Unnamed: 0,review,pos,tokens,neg,unsup
0,mr costner dragged movie far longer necessary ...,0,"[mr, costner, dragged, movie, far, longer, nec...",1,0
1,example majority action film generic boring ...,0,"[example, majority, action, film, , generic, b...",1,0
2,first hate moronic rapper couldnt act gun pre...,0,"[first, hate, moronic, rapper, , couldnt, act,...",1,0
3,even beatles could write song everyone liked ...,0,"[even, beatles, could, write, song, everyone, ...",1,0
4,brass picture movie fitting word really some...,0,"[brass, picture, , movie, fitting, word, , rea...",1,0


In [308]:
all_training_words = []
training_sentence_lengths = []
for s in df_train['tokens']:
    training_sentence_lengths.append(len(s))
    for t in s:
        all_training_words.append(t)
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s words" % max(training_sentence_lengths))

12598050 words total, with a vocabulary size of 166195
Max sentence length is 1923 words


In [309]:
all_test_words = []
test_sentence_lengths = []
for s in df_test['tokens']:
    test_sentence_lengths.append(len(s))
    for t in s:
        all_test_words.append(t)
TEST_VOCAB = sorted(list(set(all_test_words)))
print('%s words total, with a vocabulary size of %s' % (len(all_test_words), len(TEST_VOCAB)))
print('Max sentence length is %s words' % max(test_sentence_lengths))

4096662 words total, with a vocabulary size of 86080
Max sentence length is 1713 words


In [310]:
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

--2019-12-31 00:54:06--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
R'esolution de s3.amazonaws.com (s3.amazonaws.com)... 52.216.171.117
Connexion `a s3.amazonaws.com (s3.amazonaws.com)|52.216.171.117|:443... connect'e.
requ^ete HTTP transmise, en attente de la r'eponse... 416 Requested Range Not Satisfiable

    Le fichier a d'ej`a 'et'e compl`etement r'ecup'er'e ; rien `a faire.



In [311]:
from gensim import models
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [312]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

In [313]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(df_train['review'].tolist())
training_sequences = tokenizer.texts_to_sequences(df_train['review'].tolist())
train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))
train_cnn_data = pad_sequences(training_sequences, 
                               maxlen=MAX_SEQUENCE_LENGTH)
test_sequences = tokenizer.texts_to_sequences(df_test['review'].tolist())
test_cnn_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

Found 165885 unique tokens.


In [314]:
import numpy as np
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(165886, 300)


In [315]:
def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)
train_embeddings = get_word2vec_embeddings(word2vec, df_train, generate_missing=True)

In [316]:
"""
train_embeddings = []
for t in df_train['tokens'].tolist():
    vec = []
    for w in t:
        if w in word2vec:
            vec.append(word2vec[w])
        else: 
            vec.append(np.random.rand(300))
        
    train_embeddings.append(np.divide(np.sum(vec, axis=0), len(vec)))
"""

"\ntrain_embeddings = []\nfor t in df_train['tokens'].tolist():\n    vec = []\n    for w in t:\n        if w in word2vec:\n            vec.append(word2vec[w])\n        else: \n            vec.append(np.random.rand(300))\n        \n    train_embeddings.append(np.divide(np.sum(vec, axis=0), len(vec)))\n"

In [317]:
def ConvNet(embeddings, max_sequence_length, num_words, embedding_dim, labels_index):
    
    embedding_layer = Embedding(num_words,
                            embedding_dim,
                            weights=[embeddings],
                            input_length=max_sequence_length,
                            trainable=False)
    
    sequence_input = Input(shape=(max_sequence_length,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)

    convs = []
    filter_sizes = [2,3,4,5,6]

    for filter_size in filter_sizes:
        l_conv = Conv1D(filters=200, kernel_size=filter_size, activation='relu')(embedded_sequences)
        l_pool = GlobalMaxPooling1D()(l_conv)
        convs.append(l_pool)


    l_merge = concatenate(convs, axis=1)

    x = Dropout(0.1)(l_merge)  
    x = Dense(128, activation='relu')(x)
    x = Dropout(0.2)(x)
    preds = Dense(labels_index, activation='sigmoid')(x)

    model = Model(sequence_input, preds)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['acc'])
    model.summary()
    return model

In [318]:
label_names = ['pos', 'neg', 'unsup']

In [319]:
y_train = df_train[label_names].values

In [320]:
x_train = train_cnn_data
y_tr = y_train

In [321]:
from keras.layers import Embedding, Input, Conv1D, GlobalMaxPooling1D, concatenate, Dropout, Dense
from keras.models import Model
model = ConvNet(train_embedding_weights, MAX_SEQUENCE_LENGTH, len(train_word_index)+1, EMBEDDING_DIM, 
                len(list(label_names)))

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 50, 300)      49765800    input_13[0][0]                   
__________________________________________________________________________________________________
conv1d_52 (Conv1D)              (None, 49, 200)      120200      embedding_14[0][0]               
__________________________________________________________________________________________________
conv1d_53 (Conv1D)              (None, 48, 200)      180200      embedding_14[0][0]               
__________________________________________________________________________________________________
conv1d_54 

In [322]:
num_epochs = 50
batch_size = 10

In [323]:
hist = model.fit(x_train, y_tr, epochs=num_epochs, validation_split=0.1, shuffle=True, batch_size=batch_size)

Train on 67500 samples, validate on 7500 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [324]:
predictions = model.predict(test_cnn_data, batch_size=100, verbose=1)



In [329]:
labels = [2, 1, 0]

In [330]:
prediction_labels=[]
for p in predictions:
    prediction_labels.append(labels[np.argmax(p)])

In [331]:
sum(df_test.pos==prediction_labels)/len(prediction_labels)

0.43936

In [332]:
sum(df_test.pos==prediction_labels)

10984

In [333]:
len(prediction_labels)

25000

In [335]:
df_test.pos.value_counts()

1    12500
0    12500
Name: pos, dtype: int64