In [1]:
import pandas as pd

In [2]:
import glob

def process_docs(location):
    files = glob.glob(location + '/*.txt')
    contents = []
    for name in files:
        try:
            with open(name) as f:
                text = f.read()
                contents.append(text)
        except IOError as exc:
            if exc.errno != errno.EISDIR:
                raise
    return contents

def create_df(pos, neg):
    neg_sentiment = [0 for _ in range(len(neg))]
    pos_sentiment = [1 for _ in range(len(pos))]
    
    review = pos + neg
    sentiment = pos_sentiment + neg_sentiment
    
    return pd.DataFrame({'reviews': review, 'sentiment': sentiment})

In [3]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

stop_words = set(stopwords.words("english"))
punctuation = '!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

def clean_text(text):
    text = text.lower()
    
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # remove the characters [\], ['] and ["]
    text = re.sub(r"\\", "", text)    
    text = re.sub(r"\'", "", text)    
    text = re.sub(r"\"", "", text)   
    
    # delete punctuation characters
    filters='!"\'#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'
    translate_dict = dict((c, "") for c in filters)
    translate_map = str.maketrans(translate_dict)
    text = text.translate(translate_map)
    
    tokens = word_tokenize(text)
    filtered_stop = [i for i in tokens if not i in stop_words]
    
    stemmer = SnowballStemmer("english")
    stemmed = [stemmer.stem(word) for word in filtered_stop]
            
    text = " ".join(stemmed)
    
    return text

In [4]:
train_pos = process_docs(location='data/aclImdb/train/pos')
train_neg = process_docs(location='data/aclImdb/train/neg')

In [5]:
training_df = create_df(train_pos, train_neg)

In [6]:
training_df['cleaned_reviews'] = training_df['reviews'].apply(lambda x: clean_text(x))

In [7]:
# metrics on number of words per entry
training_df['cleaned_reviews'].str.len().describe()

count    25000.000000
mean       750.983240
std        576.608637
min         24.000000
25%        391.000000
50%        553.500000
75%        915.000000
max       8380.000000
Name: cleaned_reviews, dtype: float64

In [8]:
# convert words to numbers for RNN
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

tokenizer = Tokenizer(num_words=3000)
tokenizer.fit_on_texts(training_df['cleaned_reviews'])

tokenized = tokenizer.texts_to_sequences(training_df['cleaned_reviews'])
padded = pad_sequences(tokenized, maxlen=500, truncating='post')

Using TensorFlow backend.


In [19]:
from keras.utils import to_categorical

X_train = padded
y_train = to_categorical(training_df['sentiment'])

In [20]:
y_train.shape

(25000, 2)

In [22]:
import keras 
from keras.layers import Embedding, LSTM, Dense
from keras.models import Sequential

model = Sequential()
model.add(Embedding(3000, 128, input_length=X_train.shape[1]))
model.add(LSTM(300, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))

model.compile(loss=keras.losses.categorical_crossentropy,optimizer='adam',metrics=['accuracy'])

In [25]:
model.fit(X_train,y_train,epochs=10,batch_size=256)

Epoch 1/10
 2304/25000 [=>............................] - ETA: 9:49 - loss: 0.5607 - acc: 0.7248

KeyboardInterrupt: 