Original data available at:
https://www.kaggle.com/kaushik3497/imdb-sentiment-analysis
    

In [1]:
%matplotlib inline
import sys
sys.path.append('../data/')

In [130]:
from typing import Tuple, Sequence, Dict, Callable, Any, List, Pattern, Union, Iterable, overload

import re
from pathlib import Path
import nltk
from concurrent.futures import ProcessPoolExecutor, as_completed
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer, PunktSentenceTokenizer
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger
from nltk.corpus import brown
import toolz as tz
from pattern.en import parse
from functools import partial
import csv
import toolz.curried as tzc
import pandas as pd

import pattern
from pattern.web import plaintext

import bs4

from sklearn.model_selection import train_test_split
from sklearn import metrics
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [15]:
# Location of the labeled training data (a .tsv file), relative to the working directory.
train_file = Path('../data/labeledTrainData.tsv')


In [16]:
# Load the tab-separated reviews. QUOTE_NONE tells pandas not to interpret quote
# characters; we'll handle quotes manually.
raw_dataset = pd.read_csv(train_file, sep='\t', quoting=csv.QUOTE_NONE)

# Hold out 10% of the rows. The split is seeded so that a fresh
# "Restart & Run All" reproduces the same partition.
raw_train_dataset, raw_test_dataset = train_test_split(
    raw_dataset, test_size=0.1, random_state=42)

In [46]:

def get_curated_english_contractions(path='../data/curated_contractions.csv'):
    '''
    Load the curated contraction rules from a CSV file.

    path: CSV file to read; defaults to the curated contractions file.
    returns: a list with one entry per non-empty CSV record, each entry being
             the list of string fields of that record. The rows are later
             unpacked as (pattern, replacement) pairs by RegexpReplacer.
    '''
    # newline='' is what the csv module asks for, so that newlines embedded in
    # quoted fields are parsed correctly.
    with open(path, newline='') as f:
        # A blank line yields an empty row, which would break the two-field
        # unpacking downstream, so skip those.
        return [row for row in csv.reader(f) if row]


In [47]:
# Each *_PATTERN is a list of (regex, replacement) pairs, the shape RegexpReplacer expects.
# Matches a backslash followed by a double or single quote (\" or \') and replaces it with ''.
QUOTES_PATTERN = [(r'\\\"|\\\'', '')]
# Character class covering ] } { ) ( ; each match is replaced with ''.
# NOTE(review): '[' is not part of the class -- confirm that is intended.
PARENTHESIS_PATTERN = [(r'[]}{)(]', '')]
# Rows read from the curated contractions CSV.
ENGLISH_CONTRACTIONS = get_curated_english_contractions()

# We manually curate the stopwords to not remove words that may indicate some form of polarity.
STOPWORDS = [
    'the', '.', 'a', 'and', ',', 'of', 'to', 'is', 'this', 
    'it',  'that', 'i', 'but', 'for', 'with', 
    'was', 'as', 'have', 'on', "'s", 'has', 'are',
    'be', 'one', 'you', 'at', 'all', 'an', 'from', 
    'by', 'like', 'so', 'who', 'they', 'his', 'do', 
    'there', 'about', 'if',  'or', 'he', 'can', 'what',
    'when', 'would',  'had',
    'time', 'even', 'only', 'will',  'see', 'my', 
    'which', 'me', 'than', 'did', 'does',
    'were', 'their', 'could', 'get', 'been', 'other',
    'into', 'her', 'also', 'how', 'because'
]

In [56]:
def strip_quotes_and_ids(dataset):
    '''
    Return a new frame without the 'id' column and with leading/trailing
    double quotes stripped from every value of the 'review' column.

    dataset: DataFrame with at least the 'id' and 'review' columns.
    returns: a new DataFrame; `dataset` itself is not modified.
    '''
    # Use the `columns=` keyword: the positional-axis form drop('id', 'columns')
    # is rejected by pandas 2.x, where everything after `labels` is keyword-only.
    no_id_dataset = dataset.drop(columns='id')
    no_id_dataset['review'] = no_id_dataset['review'].str.strip('\""')
    return no_id_dataset





def remove_quotes(text,
                  pattern: Union[str, re.Pattern,
                                 Sequence[Tuple[Union[str, re.Pattern], str]]] = QUOTES_PATTERN):
    r'''
    removes escaped quotes. (\' or \")

    text: the string to clean.
    pattern: either a single regex (string or compiled), every match of which
             is removed, or a sequence of (regex, replacement) pairs applied
             in order. The default, QUOTES_PATTERN, is of the second kind.
    returns: the cleaned string.
    '''
    if isinstance(pattern, str) or hasattr(pattern, 'sub'):
        # Single regex: re.compile accepts both strings and compiled patterns.
        return re.compile(pattern).sub('', text)
    # Sequence of (regex, replacement) pairs. The old body called
    # pattern.sub(...) on this list and raised AttributeError for the default.
    for regex, repl in pattern:
        text = re.compile(regex).sub(repl, text)
    return text

def clean_html(text):
    '''
    Strip html tags from `text` by delegating to pattern.web's `plaintext`.
    '''
    without_markup = plaintext(text)
    return without_markup

@tz.curry
def filter_stopwords(stopwords, tokens):
    '''
    Keep only the tokens that are not contained in `stopwords`.

    Curried with toolz: supplying just `stopwords` gives back a function that
    still expects `tokens`. The result is whatever tz.filter returns.
    '''
    def is_kept(token):
        return token not in stopwords

    return tz.filter(is_kept, tokens)

class RegexpReplacer:
    '''
    Applies an ordered series of regex substitutions to a text.

    Instances are callable, so they can be used directly as a processing step.
    '''

    def __init__(self, patterns: Sequence[Tuple[Union[str, re.Pattern], str]]):
        '''
        patterns: (regex, replacement) pairs; every regex is compiled up front.
        '''
        compiled = []
        for regex, replacement in patterns:
            compiled.append((re.compile(regex), replacement))
        self.patterns = compiled

    def replace(self, text: str) -> str:
        '''
        Run every substitution over `text`, in the order given, and return the result.
        '''
        result = text
        for regex, replacement in self.patterns:
            result = regex.sub(replacement, result)
        return result

    def __call__(self, text):
        '''Shorthand for `replace(text)`.'''
        return self.replace(text)
    

def process_text(text, func):
    '''
    Apply `func` to a single text and return whatever it produces.

    text: the document to process.
    func: a callable taking the text as its only argument.
    '''
    processed = func(text)
    return processed
    
def create_corpus_processor(*steps):
    '''
    Produces a function that can be applied to a corpus of documents, applying each step in series to each document.

    steps: callables applied to each document in the order given; each one
           receives the output of the previous step.
    returns: the `process_corpus` function described below.
    '''

    def process_corpus(corpus: Iterable[str],
                       map: Callable[[Callable, Iterable], Iterable] = tz.map,
                       collect=None) -> Iterable:
        '''
        Process a corpus, represented as an iterable of text, into a clean and tokenized corpus.
        Downstream tasks can be mapped to the returned iterable.

        corpus: the documents to process.
        map: function called as map(func, corpus) to apply the steps to every
             document. Provide a custom one to, for example, process the items
             in parallel; the per-document callable must then be usable by that
             map (e.g. picklable for a process pool).
        collect: optional callable applied to the output of the last step
                 (e.g. `list` to materialize a lazy token stream).
        returns: whatever `map` returns.
        '''
        # compose applies its last argument first, hence the reversal; `collect`
        # (or the identity when it is not given) therefore runs last.
        func = tz.compose(collect or tz.identity, *reversed(steps))
        apply_steps = partial(process_text, func=func)
        # Honour the caller-supplied map; it was previously ignored and tz.map
        # was always used, so a parallel map never took effect.
        return map(apply_steps, corpus)

    return process_corpus

In [57]:
# One callable replacer per family of (regex, replacement) rules.
quote_remover = RegexpReplacer(QUOTES_PATTERN)
parenthesis_remover = RegexpReplacer(PARENTHESIS_PATTERN)
contraction_replacer = RegexpReplacer(ENGLISH_CONTRACTIONS)

In [58]:
sent_tokenizer = PunktSentenceTokenizer()
word_tokenizer = TreebankWordTokenizer()
# compose runs right to left: split the text into sentences first, then
# word-tokenize each sentence and concatenate the results (mapcat).
tokenize_text = tzc.compose(tzc.mapcat(word_tokenizer.tokenize), sent_tokenizer.tokenize)

In [59]:
# Per-document pipeline; the steps are applied in the order they are listed here.
process_corpus = create_corpus_processor(quote_remover, 
                                        parenthesis_remover, 
                                        contraction_replacer, 
                                        clean_html,
                                        str.lower, 
                                        tokenize_text)
                                    
                        
                    

In [105]:
# Clean the training portion (no 'id' column, outer quotes stripped from the reviews).
dataset = strip_quotes_and_ids(raw_train_dataset)
# Labels come from the 'sentiment' column.
y = dataset['sentiment']
dataset.head(10)

Unnamed: 0,sentiment,review
1383,0,Some people seem to think this was the worst m...
12674,0,I registered just to make this comment (which ...
24507,1,The best Cheech & Chong movie so far!! Of all ...
13940,1,Return To The 3th Chamber is the comedic seque...
6088,1,"I wish \""that '70s show\"" would come back on t..."
182,1,This was the first PPV in a new era for the WW...
17024,1,The Sopranos (now preparing to end) is the ver...
6744,0,... You can't exactly shove her out of the way...
5802,0,"Yep, the topic is a straight quote from the mo..."
18440,1,I will start by saying that this has undeserve...


In [65]:
with ProcessPoolExecutor() as executor:
    # executor.map with a fixed chunksize is handed to process_corpus as its `map` argument.
    map_func = partial(executor.map, chunksize=1000)
    # `list` is passed as `collect`; the outer list() materializes the whole
    # result while the executor is still open.
    processed_corpus = list(process_corpus(dataset['review'], map_func, list))

In [113]:
# Keep 20% of the processed documents (and their labels) aside as a dev-test set.
# The split is seeded so that re-running the notebook yields the same partition.
corpus_train, corpus_devtest, y_train, y_devtest = train_test_split(
    processed_corpus, y, test_size=0.2, random_state=42)

In [115]:
# Length of the longest training document; used as the padded sequence length below.
max_length = max(tz.map(len, corpus_train))
max_length

2735

In [116]:
# Value passed to the Tokenizer as `num_words`.
num_features = 10000

In [117]:
# Keras tokenizer capped at `num_features` words, with '<unk>' as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=num_features, oov_token='<unk>')

In [119]:
# The tokenizer is fitted on the training documents only (not on the dev-test ones).
tokenizer.fit_on_texts(corpus_train)

In [120]:
# Sanity check: show the integer sequence produced for the first training document.
print(tokenizer.texts_to_sequences(corpus_train[:1]))

[[1111, 1539, 1, 9443, 24, 1288, 6, 2951, 116, 203, 450, 204, 16, 1141, 81, 29, 24, 1, 1466, 1, 4461, 3353, 18, 2, 6439, 7, 170, 7, 2, 245, 203, 23, 131, 102, 4, 9443, 9, 354, 2, 1693, 1905, 7, 2, 166, 9444, 4, 13, 23, 9, 17, 5319, 5, 1247, 17, 2, 1, 7, 1, 4, 12, 110, 202, 2460, 5, 2811, 8, 142, 14, 12, 290, 10, 11, 1, 3, 1, 4590, 8, 75, 46, 475, 2, 1468, 87, 31, 278, 4, 2, 1480, 9, 81, 1, 118, 9443, 1, 6, 4695, 5, 3464, 10, 4, 2, 248, 750, 19, 2, 182, 7, 1933, 1, 4, 70, 269, 39, 24, 1483, 11, 3624, 4495, 3, 123, 179, 7937, 9603, 3, 903, 5, 1, 9, 38, 275, 45, 3159, 1018, 8, 31, 1918, 3, 87, 686, 368, 98, 9443, 1288, 8, 176, 41, 116, 4038, 159, 1650, 8, 1, 30, 629, 789, 19, 6, 778, 400, 4, 2, 277, 113, 3, 6, 7347, 1545, 967, 3, 9, 255, 37, 6, 1111, 275, 19, 6, 3966, 1111, 1149, 4, 1150, 40, 120, 6, 1430, 436, 3, 29, 9, 92, 2, 6160, 1545, 967, 11, 1103, 4, 451, 2, 194, 14, 29, 2880, 1310, 352, 9870, 5, 301, 104, 90, 9031, 3, 101, 3247, 104, 90, 30, 348, 69, 5918, 9, 2, 418, 1780, 7, 6, 6

In [123]:
def transform(corpus, max_length):
    '''
    Turn a corpus into a padded matrix of word indices.

    corpus: documents to convert with the notebook-level `tokenizer`.
    max_length: value passed to pad_sequences as `maxlen`.
    returns: the output of pad_sequences.
    '''
    return pad_sequences(tokenizer.texts_to_sequences(corpus), maxlen=max_length)

In [124]:
# Training inputs: indexed documents padded to `max_length`.
x_train = transform(corpus_train, max_length)

## Model

In [162]:
def build(vocab_size, dim, input_length):
    '''
    Assemble and compile the convolutional sentiment classifier.

    vocab_size: number of rows of the embedding table.
    dim: size of each embedding vector.
    input_length: length of the input sequences.
    returns: the compiled Sequential model; its summary is printed as a side effect.
    '''
    stack = [
        layers.Embedding(vocab_size, dim, input_length=input_length),
        # Two convolution stages: local max-pooling after the first,
        # global max-pooling after the second.
        layers.Conv1D(16, 5, activation='relu'),
        layers.MaxPool1D(5),
        layers.Conv1D(16, 5, activation='relu'),
        layers.GlobalMaxPool1D(),
        layers.Dropout(0.33),
        layers.Dense(8, activation='relu'),
        # Single sigmoid unit, paired with the binary cross-entropy loss below.
        layers.Dense(1, activation='sigmoid'),
    ]
    model = tf.keras.models.Sequential()
    for layer in stack:
        model.add(layer)

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
    model.summary()
    return model

In [165]:
# Vocabulary size from the tokenizer, 64-dimensional embeddings, inputs of length `max_length`.
model = build(tokenizer.num_words, 64, max_length)

Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_13 (Embedding)     (None, 2735, 64)          640000    
_________________________________________________________________
conv1d_8 (Conv1D)            (None, 2731, 16)          5136      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 546, 16)           0         
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 542, 16)           1296      
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 16)                0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_20 (Dense)             (None, 8)               

In [166]:
# Train for 20 epochs with batches of 64; validation_split=0.2 holds out part of
# the training data for validation.
history = model.fit(x_train, y_train, epochs=20, batch_size=64, validation_split=0.2)

Train on 14400 samples, validate on 3600 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [140]:
# Dev-test inputs, indexed and padded the same way as the training inputs.
x_devtest = transform(corpus_devtest, max_length)

In [167]:
# Sequential.predict_classes was removed in TensorFlow 2.6. Thresholding the
# sigmoid output at 0.5 is the documented replacement for a single-unit
# sigmoid model and works on older versions as well.
y_hat = (model.predict(x_devtest) > 0.5).astype('int32')

In [168]:
# Precision / recall / F1 on the dev-test set; label 0 is reported as 'neg', label 1 as 'pos'.
print(metrics.classification_report(y_devtest, y_hat, target_names=['neg', 'pos']))

              precision    recall  f1-score   support

         neg       0.87      0.85      0.86      2254
         pos       0.85      0.87      0.86      2246

   micro avg       0.86      0.86      0.86      4500
   macro avg       0.86      0.86      0.86      4500
weighted avg       0.86      0.86      0.86      4500

