In [1]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,Activation
from tensorflow.keras.layers import Conv1D,GlobalMaxPool1D
import glob
import os
from random import shuffle
def pre_process_data(filepath,encoding='utf-8'):
    """Load labelled text samples from ``filepath/pos`` and ``filepath/neg``.

    Args:
        filepath: Directory that contains a ``pos`` and a ``neg``
            sub-directory, each holding one ``*.txt`` file per sample.
        encoding: Text encoding used to read every file. It defaults to
            UTF-8 so the result does not depend on the platform's locale
            encoding.

    Returns:
        A list of ``(label, text)`` tuples, with label 1 for files found
        under ``pos`` and label 0 for files found under ``neg``. The list is
        shuffled with ``random.shuffle`` before it is returned.
    """
    # (label, sub-directory) pairs; positives are read first, as before.
    labelled_dirs=((1,'pos'),(0,'neg'))
    dataset=[]
    for label,subdir in labelled_dirs:
        pattern=os.path.join(filepath,subdir,'*.txt')
        for filename in glob.glob(pattern):
            with open(filename,'r',encoding=encoding) as f:
                dataset.append((label,f.read()))
    # Mix positive and negative samples so later slicing is not label-ordered.
    shuffle(dataset)
    return dataset


In [2]:
# Read the training reviews as a shuffled list of (label, text) tuples.
dataset=pre_process_data('src/data/aclImdb/train')

In [3]:
# Show the raw text of the first sample (index 1 of the (label, text) tuple).
dataset[0][1]

'Next to "Star Wars" and "The Wizard of Oz," this remains one of the greatest fantasy films ever made. It\'s a true shame it\'s not as well-known as the former films (maybe because it sticks to a story based on legends rather than contemporary or sci-fi settings, and that it\'s British, meaning a smaller market for films) but its wonderful to know that it\'s deserved that reputation.<br /><br />Like all great family films, one can be a child, an adult, or even a teenager to enjoy this film (I\'m currently 18), but one must appreciate classic films first. I absolutely adore this film. It has an extraordinary music score by Miklos Rozsa (perhaps my favorite classic film score) that rivals any John Williams "Star Wars" score, a fast but not flashy pace, beautiful sets, dialog, and use of color (both the sets and cinematography won Oscars), and state-of-the-art Oscar-winning special effects (for the time, and some are still stunning). And, of course, June Duprez\'s sultry looks as the Prin

In [5]:
# `dataset.count` is only the bound list method; len() gives the sample count.
print(len(dataset))

25000


In [4]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models.keyedvectors import KeyedVectors
# from nlpia.loaders import get_data
# Load the pretrained word2vec vectors from the binary file; `token_and_vectorize`
# looks tokens up in this module-level object.
word_vector=KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin',binary=True)

In [5]:
def token_and_vectorize(dataset):
    """Tokenize each sample and map its tokens to word vectors.

    Args:
        dataset: Iterable of ``(label, text)`` pairs.

    Returns:
        A ``(vectorized_data, expected)`` tuple. ``vectorized_data[i]`` is the
        list of vectors looked up in the module-level ``word_vector`` for the
        tokens of sample ``i`` (tokens missing from the vocabulary are
        skipped, so the list may be shorter than the token count or empty).
        ``expected[i]`` is the label of sample ``i``.
    """
    tokenizer=TreebankWordTokenizer()
    vectorized_data=[]
    expected=[]
    for sample in dataset:
        tokens=tokenizer.tokenize(sample[1])
        sample_vec=[]
        for token in tokens:
            try:
                sample_vec.append(word_vector[token])
            except KeyError:
                # Token is not in the pretrained vocabulary: skip it.
                # Only KeyError is caught so real failures (and Ctrl-C)
                # are no longer silently swallowed.
                continue
        expected.append(sample[0])
        vectorized_data.append(sample_vec)
    return vectorized_data,expected

In [6]:
# Turn every review into a list of word vectors; `expected` holds the matching labels.
vectored_dataset,expected=token_and_vectorize(dataset)

In [7]:
# Number of vectorized samples (one entry per review).
len(vectored_dataset)

25000

In [8]:
def test_len(data,maxlen):
    i=total_len=truncated=exact=padded=0

    for sample in data:
        i=i+1;
        total_len+=len(sample)
        if len(sample)>maxlen:
            truncated+=1
        elif len(sample)<maxlen:
            padded+=1
        else:
            exact+=1
        if i%1000==0:
            print('处理了:{}'.format(i))
    print("Padded:{}".format(padded))
    print("Equal:{}".format(exact))
    print("Truncated:{}".format(truncated))
    print('平均长度:{}'.format(total_len/len(data)))


In [9]:
# Check how a cut-off of 400 vectors per sample splits the data into padded/exact/truncated.
test_len(vectored_dataset,400)

处理了:1000
处理了:2000
处理了:3000
处理了:4000
处理了:5000
处理了:6000
处理了:7000
处理了:8000
处理了:9000
处理了:10000
处理了:11000
处理了:12000
处理了:13000
处理了:14000
处理了:15000
处理了:16000
处理了:17000
处理了:18000
处理了:19000
处理了:20000
处理了:21000
处理了:22000
处理了:23000
处理了:24000
处理了:25000
Padded:22458
Equal:20
Truncated:2522
平均长度:205.2144


In [9]:
# Number of word vectors kept for the first sample.
len(vectored_dataset[0])

114

In [10]:
# Number of labels; `token_and_vectorize` appends one per sample.
len(expected)

25000

In [11]:
# Hold out the last 20% of the (already shuffled) samples for evaluation.
split_point=int(len(vectored_dataset)*0.8)
# Features and labels are cut at the same index so they stay aligned.
x_train,x_test=vectored_dataset[:split_point],vectored_dataset[split_point:]
y_train,y_test=expected[:split_point],expected[split_point:]


In [12]:
# Index at which the data was cut into train and test parts.
split_point

20000

In [13]:
# Number of training labels; equals `split_point`.
len(y_train)

20000

In [14]:
# Hyperparameters shared by the padding step and the model cells.
maxlen=400  # vectors per sample after pad_trunc; also the Conv1D input length
batch_size=32  # passed to model.fit
embedding_dims=300  # width of one word vector; used in the reshape and the Conv1D input_shape
# NOTE(review): `filtes` looks like a typo for `filters`; kept because the Conv1D cell references this name.
filtes=250  # number of Conv1D filters
kernel_size=3  # Conv1D window width, in tokens
hidden_dims=250  # units in the hidden Dense layer
epochs=10  # passed to model.fit


In [15]:
def pad_trunc(data,maxlen):
    """Return copies of the samples cut or zero-padded to exactly ``maxlen``.

    Args:
        data: Sequence of samples, each a sequence of equally sized vectors.
        maxlen: Number of vectors every returned sample must contain.

    Returns:
        A new list with one new list per sample. Samples longer than
        ``maxlen`` keep their first ``maxlen`` vectors; shorter ones are
        extended with zero vectors. The input samples are never modified
        (the previous implementation appended the padding to the caller's
        own lists). An empty ``data`` yields ``[]``.

    Notes:
        The zero vector's width is taken from the first non-empty sample, so
        a leading sample without any vectors no longer raises ``IndexError``.
        All padding slots reference the same zero-vector list object; treat
        it as read-only.
    """
    # Width of one vector, inferred from the first sample that has any.
    vector_dim=next((len(sample[0]) for sample in data if len(sample)>0),0)
    zero_vector=[0.0]*vector_dim
    new_data=[]
    for sample in data:
        # Slicing + list() always yields a fresh list, so padding below
        # cannot leak back into the caller's sample.
        temp=list(sample[:maxlen])
        temp.extend([zero_vector]*(maxlen-len(temp)))
        new_data.append(temp)
    return new_data

In [16]:
# NOTE(review): this bare expression is not the cell's last line, so it displays nothing.
x_train[0]
# Number of vectors in the first training sample.
print(len(x_train[0]))

114


In [17]:
# Bring every training sample to exactly `maxlen` vectors.
x_train=pad_trunc(x_train,maxlen)
# The sample count must be unchanged by padding/truncation.
len(x_train)

20000

In [18]:
# Apply the same padding/truncation to the held-out samples.
x_test=pad_trunc(x_test,maxlen)


In [19]:
# pad_trunc returns plain Python lists; the conversion to an array happens in the reshape below.
type(x_train)

list

In [20]:
# Convert the nested lists into one array of shape (samples, maxlen, embedding_dims).
x_train=np.reshape(x_train,(len(x_train),maxlen,embedding_dims))


In [21]:
# Labels become 1-D arrays; the test features get the same (samples, maxlen, embedding_dims) layout as x_train.
y_train=np.array(y_train)
x_test=np.reshape(x_test,(len(x_test),maxlen,embedding_dims))
y_test=np.array(y_test)

In [22]:
# Start a sequential model with a 1-D convolution over the token axis:
# `filtes` filters of width `kernel_size`, no padding ('valid'), stride 1, ReLU,
# taking inputs of shape (maxlen, embedding_dims).
model=Sequential()
model.add(Conv1D(filtes,kernel_size,padding='valid',activation='relu',strides=1,input_shape=(maxlen,embedding_dims)))

In [23]:
# Global max pooling over the sequence axis, leaving one value per filter.
model.add(GlobalMaxPool1D())

In [24]:
# Hidden fully connected layer; dropout (rate 0.2) is applied before the ReLU activation.
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# Output layer: a single sigmoid unit for the binary label.
model.add(Dense(1))
model.add(Activation('sigmoid'))
# Compile with binary cross-entropy loss, the Adam optimizer and accuracy as the metric.
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

In [25]:
# Train for `epochs` passes; the held-out split (x_test, y_test) is used as validation data.
model.fit(x_train,y_train,batch_size=batch_size,epochs=epochs,validation_data=(x_test,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fbadc317dc0>

In [26]:
# Report the installed TensorFlow version.
import tensorflow as tf
tf.__version__

'2.4.1'

In [27]:
# `tf.test.is_gpu_available()` is deprecated; TensorFlow's own warning recommends
# listing the physical GPU devices instead. True when at least one GPU is visible.
bool(tf.config.list_physical_devices('GPU'))


In [5]:
# List the GPU devices visible to TensorFlow.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [1]:
# Report the installed PyTorch version.
import torch as t
t.__version__

'1.7.1'