# LSTM for sequence classification in the IMDB dataset


## References
- https://machinelearningmastery.com/sequence-classification-lstm-recurrent-neural-networks-python-keras/
- https://www.kaggle.com/c/word2vec-nlp-tutorial/data
- https://medium.com/@sabber/classifying-yelp-review-comments-using-cnn-lstm-and-pre-trained-glove-word-embeddings-part-3-53fcea9a17fa
- https://www.oreilly.com/ideas/convolutional-neural-networks-for-language-tasks
- http://ruder.io/text-classification-tensorflow-estimators/


## Problem Description 

Our dataset is the IMDB movie review sentiment classification problem. Each movie review is a variable-length sequence of words, and the sentiment of each review must be classified.

The dataset contains 25,000 highly-polar movie reviews (good or bad) for training and the same amount again for testing. The problem is to determine whether a given movie review has a positive or negative sentiment.

The data was collected by Stanford researchers and was used in a 2011 paper, where an accuracy of 88.89% was achieved.



## Setup

In [None]:
!conda install -c conda-forge textacy gensim=3.4.0 spacy=1.9.0  seaborn=0.9.0 -y -q

In [None]:
!pip install -q kaggle ipywidgets textblob  swifter keras tensorflow

**Download a pretrained English model**

In [None]:
!python -m spacy download en

In [1]:
import json
import os
import shutil
import pandas as pd
import pprint
from textblob import TextBlob
import numpy as np
from swifter import swiftapply
from numba import jit

from tqdm import tqdm
import numpy
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, Conv1D
from keras.layers.embeddings import Embedding

from bs4 import BeautifulSoup

import seaborn as sns
import spacy    
from sklearn import metrics
from sklearn.model_selection import train_test_split

  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
sns.__version__

'0.9.0'

In [3]:
spacy.__version__

'1.9.0'

In [4]:
nlp = spacy.load("en")
#nlp = spacy.load("en"

In [5]:
#pd.set_option('display.height', 1000)
#pd.set_option('display.max_rows', 500)
#pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)
#pd.set_option('display.expand_frame_repr', False)
pd.options.display.max_colwidth = 500

**Download Dataset**

In [6]:
# SECURITY: API credentials must never be hard-coded in a notebook or committed
# to version control. Read them from the environment instead (these are the
# variable names the Kaggle CLI itself recognises). Any key that was previously
# embedded in this cell should be treated as leaked and revoked.
kaggle_token = {
    "username": os.environ.get("KAGGLE_USERNAME", ""),
    "key": os.environ.get("KAGGLE_KEY", ""),
}
if not all(kaggle_token.values()):
    # Fail loudly but keep the notebook importable; the download step cannot
    # authenticate without both values.
    print("WARNING: set KAGGLE_USERNAME and KAGGLE_KEY before running the download cells.")

In [7]:
COMPETITION="word2vec-nlp-tutorial"

In [10]:
!mkdir -p ~/.kaggle
!echo {json.dumps(kaggle_token)} > ~/.kaggle/kaggle.json
#!chmod 600 ~/.kaggle/kaggle.json

In [8]:
PATH = os.path.expanduser("~/.kaggle")

In [9]:
# Create the Kaggle config directory if it is missing. Do NOT wipe it first:
# recursively deleting ~/.kaggle would also destroy any credentials or
# previously downloaded data the user already keeps there.
os.makedirs(PATH, exist_ok=True)

In [11]:
with open(f'{PATH}/kaggle.json', 'w') as outfile:
    json.dump(kaggle_token, outfile)
    
#with open(f'/content/.kaggle/kaggle.json', 'w') as outfile:
#    json.dump(kaggle_token, outfile)    
    
    


In [12]:
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c {COMPETITION} --path {PATH}/{COMPETITION} 

Downloading sampleSubmission.csv to /Users/nidhin.pattaniyil/.kaggle/word2vec-nlp-tutorial
  0%|                                                | 0.00/276k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 276k/276k [00:00<00:00, 27.6MB/s]
Downloading unlabeledTrainData.tsv.zip to /Users/nidhin.pattaniyil/.kaggle/word2vec-nlp-tutorial
 99%|█████████████████████████████████████▍| 26.0M/26.4M [00:02<00:00, 11.7MB/s]
100%|██████████████████████████████████████| 26.4M/26.4M [00:02<00:00, 11.8MB/s]
Downloading testData.tsv.zip to /Users/nidhin.pattaniyil/.kaggle/word2vec-nlp-tutorial
 93%|███████████████████████████████████▌  | 12.0M/12.8M [00:01<00:00, 12.2MB/s]
100%|██████████████████████████████████████| 12.8M/12.8M [00:01<00:00, 12.1MB/s]
Downloading labeledTrainData.tsv.zip to /Users/nidhin.pattaniyil/.kaggle/word2vec-nlp-tutorial
100%|██████████████████████████████████████| 13.1M/13.1M [00:01<00:00, 11.9MB/s]



In [13]:
#@jit
def parse_html(doc):
    """Return the text content of ``doc`` with its HTML markup stripped (parsed with lxml)."""
    return BeautifulSoup(doc, "lxml").get_text()

def prepare_single(doc, parts_of_speech=None):
    """
    Run ``prepare`` on one document and return its token list.

    The document is wrapped in a one-element list, sent through ``prepare``
    (forwarding ``parts_of_speech``), and the only result is returned.
    """
    results = list(prepare([doc], parts_of_speech=parts_of_speech))
    return results[0]


def prepare(docs, n_threads=10, parts_of_speech=None, process_entity=False, process_tag=True, process_parse=False):
    """
    Stream documents through the spaCy pipeline and yield their lemmas.

    For each document, yields a list with the lemma (``token.lemma_``) of
    every token that is not flagged as a stop word. ``docs`` is wrapped in a
    tqdm progress bar before being handed to ``nlp.pipe``.

    Parameters:
    - docs: iterable of texts passed to ``nlp.pipe``
    - n_threads: forwarded to ``nlp.pipe``
    - parts_of_speech: accepted (and passed by ``prepare_single``) but not
      used anywhere in this function
    - process_entity / process_tag / process_parse: forwarded to ``nlp.pipe``
      as its ``entity`` / ``tag`` / ``parse`` arguments
    """
    parsed_docs = nlp.pipe(
        tqdm(docs),
        entity=process_entity,
        tag=process_tag,
        parse=process_parse,
        n_threads=n_threads,
        batch_size=1000,
    )
    for parsed in parsed_docs:
        yield [tok.lemma_ for tok in parsed if not tok.is_stop]




In [14]:
prepare_single(
        parse_html (
        "<b>hi</b> Tom. How are you? The book is a good read."
        )
)   

100%|██████████| 1/1 [00:00<00:00, 603.15it/s]


['hi', 'tom', '.', '?', 'book', 'good', 'read', '.']

In [15]:
df = pd.read_csv(f"{PATH}/{COMPETITION}/labeledTrainData.tsv.zip", compression='zip',sep='\t')

In [16]:
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle m..."
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for different thin..."
2,7759_3,0,"The film starts with a manager (Nicholas Bell) giving welcome investors (Robert Carradine) to Primal Park . A secret project mutating a primal animal using fossilized DNA, like ¨Jurassik Park¨, and some scientists resurrect one of nature's most fearsome predators, the Sabretooth tiger or Smilodon . Scientific ambition turns deadly, however, and when the high voltage fence is opened the creature escape and begins savagely stalking its prey - the human visitors , tourists and scientific.Meanwh..."
3,3630_4,0,"It must be assumed that those who praised this film (\the greatest filmed opera ever,\"" didn't I read somewhere?) either don't care for opera, don't care for Wagner, or don't care about anything except their desire to appear Cultured. Either as a representation of Wagner's swan-song, or as a movie, this strikes me as an unmitigated disaster, with a leaden reading of the score matched to a tricksy, lugubrious realisation of the text.<br /><br />It's questionable that people with ideas as to w..."
4,9495_8,1,"Superbly trashy and wondrously unpretentious 80's exploitation, hooray! The pre-credits opening sequences somewhat give the false impression that we're dealing with a serious and harrowing drama, but you need not fear because barely ten minutes later we're up until our necks in nonsensical chainsaw battles, rough fist-fights, lurid dialogs and gratuitous nudity! Bo and Ingrid are two orphaned siblings with an unusually close and even slightly perverted relationship. Can you imagine playfully..."


In [17]:
row =510
pprint.pprint(df.iloc[row].to_dict())

{'id': '4750_7',
 'review': 'In Stand By Me, Vern and Teddy discuss who was tougher, Superman '
           'or Mighty Mouse. My friends and I often discuss who would win a '
           'fight too. Sometimes we get absurd and compare guys like MacGyver '
           'and The Terminator or Rambo and Matrix. But now it seems that we '
           'discuss guys like Jackie Chan, Bruce Lee and Jet Li. It is a '
           'pointless comparison seeing that Lee is dead, but it is a fun one. '
           'And if you go by what we have seen from Jet Li in Lethal 4 and '
           'Black Mask, you have to at least say that he would match up well '
           'against Chan. In this film he comes across as a martial arts '
           'God.<br /><br />Black Mask is about a man that was created along '
           'with many other men, to be supreme fighting machines. Their only '
           'purpose is to win wars that other people lose. They are invincible '
           'in some ways. Now that is the

## Textblob

In [18]:
res = TextBlob(df.iloc[row]["review"])

In [19]:
res.sentiment

Sentiment(polarity=0.13822544642857143, subjectivity=0.5142857142857142)

In [20]:
len(df.iloc[0]["review"])

2302

The polarity score is a float within the range [-1.0, 1.0].      
The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

In [21]:
TextBlob("good movie").sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [22]:
TextBlob("good movie".lower()).sentiment

Sentiment(polarity=0.7, subjectivity=0.6000000000000001)

In [23]:
# Spacy

In [24]:
#tqdm.pandas(desc="my bar!")

In [25]:
df['review'] = swiftapply( df['review'], parse_html)

In [26]:
# Lemmatise every review and store the result as ONE space-separated string.
# Casting the token lists with ``astype(str)`` would instead store their Python
# repr (e.g. "['good', 'movie']"), so downstream text tools such as TextBlob
# would be fed brackets, quotes and commas rather than plain text.
df['processed'] = [" ".join(tokens) for tokens in prepare(df['review'])]

100%|██████████| 25000/25000 [01:33<00:00, 268.46it/s]


In [27]:
#reviews = df['review'].apply(prepare)

In [28]:
def sentiment_textblob(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    return TextBlob(text).sentiment.polarity

In [29]:
df['review'].head()

0    With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle m...
1    \The Classic War of the Worlds\" by Timothy Hines is a very entertaining film that obviously goes to great effort and lengths to faithfully recreate H. G. Wells' classic book. Mr. Hines succeeds in doing so. I, and those who watched his film with me, appreciated the fact that it was not the standard, predictable Hollywood fare that comes out every year, e.g. the Spielberg version with Tom Cruise that had only the slightest resemblance to the book. Obviously, everyone looks for differen

In [30]:
df.columns

Index(['id', 'sentiment', 'review', 'processed'], dtype='object')

In [None]:
#df['sentiment_textblob'] = swiftapply( df['processed'].to_string(), sentiment_textblob)
df['sentiment_textblob'] = df['processed'].apply(sentiment_textblob)

In [None]:
metrics.accuracy_score(y_true=df["sentiment"], y_pred=df['sentiment_textblob'] > 0.2)

In [None]:
metrics.confusion_matrix(y_true=df["sentiment"]
                         , y_pred=df['sentiment_textblob'] > 0.2
                         )

In [None]:
X = df['review']
y = df['sentiment']
# Fix the seed so the split, and every metric computed from it, is
# reproducible from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Google Cloud Sentiment Analysis

[Demo Page](https://cloud.google.com/natural-language/)

[Pricing](https://cloud.google.com/natural-language/pricing)

- 5,000 requests cost $1 (up to 1M requests)
- 1 request can contain up to 1,000 characters

How much would our dataset count for?

In [None]:
# Get the character count of each review; every started block of 1000
# characters counts as one request, so round the quotient up.
num_requests = np.ceil(df["review"].str.len()/1000)
print (f"Row 1 will count as {num_requests.iloc[0]} docs")
# Total number of requests across the whole dataset.
total_number_of_doc_requests = num_requests.sum()
print(f"We will be making {total_number_of_doc_requests} requests")
# Cost is estimated as one unit per 5000 requests.
total_cost = total_number_of_doc_requests/5000
print(f"Project will cost {total_cost} ")

# Logistic Regression (bag-of-words + TF-IDF, trained with SGD)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier

In [None]:
# classifier
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss="log")),
])

In [None]:
pipeline.fit(X=X_train,y=y_train)

In [None]:
feature_names = pipeline.named_steps["vect"].get_feature_names()

In [None]:
coefficients = pipeline.named_steps["clf"].coef_[0]

In [None]:
analysis_df = pd.DataFrame({'feature_names':feature_names, 'coefficients':coefficients})
analysis_df['abs'] = np.abs(analysis_df['coefficients'])
analysis_df= analysis_df.sort_values(by=['abs'],ascending=False)

In [None]:
analysis_df.head()

In [None]:
len(analysis_df)

In [None]:
top_words_df=analysis_df.head(20).sort_values(by=['coefficients'])
colors = list(np.where(top_words_df["coefficients"]>0, 'g', 'r'))

In [None]:
top_words_df.plot.barh(y="coefficients",x="feature_names", color=colors)

In [None]:
predicted = pipeline.predict(X_test)

In [None]:
metrics.confusion_matrix(y_true=y_test
                         , y_pred=predicted
                         )

In [None]:
metrics.accuracy_score(y_true=y_test
                         , y_pred=predicted)

In [None]:
print(metrics.classification_report(y_test, predicted))

# Deep Learning Model

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [None]:
X = df['review']
y = df['sentiment']
# Fix the seed so the split, and every metric computed from it, is
# reproducible from one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
top_words = 100000
embedding_vecor_length = 50
max_review_length = 500
droput = 0.2
lstm_units = 10

In [None]:
def prepare_input_for_dl(tokenizer, docs, max_len=max_review_length,):
    """
    Turn raw texts into model input.

    ``tokenizer.texts_to_sequences`` converts ``docs`` to sequences, which are
    then passed through ``sequence.pad_sequences`` with ``maxlen=max_len``.
    The padded result is returned. The tokenizer is expected to be fitted
    already (the notebook fits it on ``X_train`` before calling this).
    """
    token_ids = tokenizer.texts_to_sequences(docs)
    return sequence.pad_sequences(token_ids, maxlen=max_len)

In [None]:
tokenizer = Tokenizer(num_words= top_words)
tokenizer.fit_on_texts(X_train)

In [None]:
_X_train = prepare_input_for_dl( tokenizer, X_train )
_X_test  = prepare_input_for_dl( tokenizer, X_test  )

In [None]:
model = Sequential()
model.add(Embedding(top_words, embedding_vecor_length, input_length=max_review_length))
model.add(LSTM(lstm_units, dropout=droput, recurrent_dropout=droput))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])


In [None]:
print(model.summary())

In [None]:
_X_train.shape

In [None]:
y_train.shape

In [None]:
model.fit(_X_train, y_train, epochs=3, batch_size=256)

In [None]:
predicted = model.predict_classes(_X_test)

In [None]:
metrics.confusion_matrix(y_true=y_test
                         , y_pred=predicted
                         )

In [None]:
metrics.accuracy_score(y_true=y_test
                         , y_pred=predicted)

In [None]:
# Final evaluation of the model
scores = model.evaluate(_X_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
df.head(7)

In [None]:
model.predict_proba([np.expand_dims(_X_train[6], axis=0)])

In [None]:
_X_train.shape

In [None]:
np.expand_dims(_X_train[0], axis=0).shape

In [None]:
?Tokenizer

# Predictions

In [None]:
df.sentiment.value_counts()

In [None]:
sample_sentence = "this movie is horrible"

In [None]:
TextBlob(sample_sentence).sentiment

In [None]:
pipeline.predict_proba([sample_sentence])

In [None]:
model.predict_proba(
    prepare_input_for_dl(tokenizer, [sample_sentence], max_len=max_review_length)
)