### Import the packages

In [1]:
### General Packages
import numpy as np
import pandas as pd
import os
import re
import bz2
from collections import Counter

### Packages for text processing and modelling

In [2]:
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding,LSTM,Dropout,Dense
from keras.utils import to_categorical
from sklearn.preprocessing import LabelBinarizer
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0',
 '/job:localhost/replica:0/task:0/device:GPU:1']

In [3]:
print(os.listdir('data'))

['test.ft.txt.bz2', 'train.ft.txt.bz2']


### Need to create a function to load the data, as the data is in a different encoding format we need to change it into readable text that can be worked with easily in python

In [4]:
def load_data(source):
    data = bz2.BZ2File(source)
    data_lines = data.readlines()
    data_clean = [x.decode('utf-8') for x in data_lines]
    return data_clean

In [5]:
training_data_path = 'data/train.ft.txt.bz2'

In [6]:
training_data = load_data(training_data_path)

In [7]:
train_len = len(training_data)
print(train_len)

3600000


In [8]:
test_data_path = 'data/test.ft.txt.bz2'

In [9]:
test_data = load_data(test_data_path)

In [10]:
test_len = len(test_data)
print(test_len)

400000


In [11]:
training_data[:2]

['__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n',
 "__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n"]

In [12]:
test_data[:2]

['__label__2 Great CD: My lovely Pat has one of the GREAT voices of her generation. I have listened to this CD for YEARS and I still LOVE IT. When I\'m in a good mood it makes me feel better. A bad mood just evaporates like sugar in the rain. This CD just oozes LIFE. Vocals are jusat STUUNNING and lyrics just kill. One of life\'s hidden gems. This is a desert isle CD in my book. Why she never made it big is just beyond me. Everytime I play this, no matter black, white, young, old, male, female EVERYBODY says one thing "Who was that singing ?"\n',
 "__label__2 One of the best game music soundtracks - for a game I didn't really play: Despite the fact that I have only played a small portion of the game, the music I heard (plus the connection to Chrono Trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. There is an incredible mix of fun, epic, and emotional songs. Those sad and beautiful tracks I especially like, as there's not too 

### Profile the data

### Find what a positive and what a negative review looks like based on their label markers

In [13]:
sample_postive = []
sample_negative = []
for review in training_data[:50]:
    if '__label__2' in review:
        sample_postive.append(review)
    else:
        sample_negative.append(review)

In [14]:
print("Number of postive sample reviews {}".format(len(sample_postive)))
print("Number of negative sample reviews {}".format(len(sample_negative)))

Number of postive sample reviews 27
Number of negative sample reviews 23


In [15]:
sample_postive[:2]

['__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n',
 "__label__2 The best soundtrack ever to anything.: I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny.\n"]

In [16]:
sample_negative[:2]

['__label__1 Buyer beware: This is a self-published book, and if you want to know why--read a few paragraphs! Those 5 star reviews must have been written by Ms. Haddon\'s family and friends--or perhaps, by herself! I can\'t imagine anyone reading the whole thing--I spent an evening with the book and a friend and we were in hysterics reading bits and pieces of it to one another. It is most definitely bad enough to be entered into some kind of a "worst book" contest. I can\'t believe Amazon even sells this kind of thing. Maybe I can offer them my 8th grade term paper on "To Kill a Mockingbird"--a book I am quite sure Ms. Haddon never heard of. Anyway, unless you are in a mood to send a book to someone as a joke---stay far, far away from this one!\n',
 "__label__1 The Worst!: A complete waste of time. Typographical errors, poor grammar, and a totally pathetic plot add up to absolutely nothing. I'm embarrassed for this author and very disappointed I actually paid for this book.\n"]

### We need to lower case all the text, as if we do any word profiling it will see upper and lower case words as two different occurances

In [26]:
def count_sentiment_classes(text,criteria):
    class_1_count = 0
    class_2_count = 0
    for review in text:
        if criteria in review:
            class_1_count = class_1_count + 1
        else:
            class_2_count = class_2_count + 1
    print("class one count {} {}".format(class_1_count,criteria))
    print("class two count {} other label".format(class_2_count))

In [27]:
count_sentiment_classes(training_data,"__label__2")

class one count 1800000 __label__2
class two count 1800000 other label


### Data cleaning
1. lower case all words
2. strip out stop words
3. remove special characters (to do)

In [28]:
def clean_data(text):
    data_lower = [x.lower() for x in text]
    data_stop = [word for word in data_lower if word not in stopwords.words('english')]
    return data_stop
    

In [29]:
training_clean = clean_data(training_data)

In [30]:
test_clean = clean_data(test_data)

In [31]:
training_clean[:2]

['__label__2 stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^\n',
 "__label__2 the best soundtrack ever to anything.: i'm reading a lot of reviews saying that this is the best 'game soundtrack' and i figured that i'd write a review to disagree a bit. this in my opinino is yasunori mitsuda's ultimate masterpiece. the music is timeless and i'm been listening to it for years now and its beauty simply refuses to fade.the price tag on this is pretty staggering i must say, but if you are going to buy any cd for this much money, this is the only one that i feel would be worth every penny.\n"]

In [32]:
test_clean[:2]

['__label__2 great cd: my lovely pat has one of the great voices of her generation. i have listened to this cd for years and i still love it. when i\'m in a good mood it makes me feel better. a bad mood just evaporates like sugar in the rain. this cd just oozes life. vocals are jusat stuunning and lyrics just kill. one of life\'s hidden gems. this is a desert isle cd in my book. why she never made it big is just beyond me. everytime i play this, no matter black, white, young, old, male, female everybody says one thing "who was that singing ?"\n',
 "__label__2 one of the best game music soundtracks - for a game i didn't really play: despite the fact that i have only played a small portion of the game, the music i heard (plus the connection to chrono trigger which was great as well) led me to purchase the soundtrack, and it remains one of my favorite albums. there is an incredible mix of fun, epic, and emotional songs. those sad and beautiful tracks i especially like, as there's not too 

### Build into a dataframe for ease of use (although this step can be also subsituted for loading to the a database depending on the RAM of your pc) 

### Since we want to reduce the features for the model as we will run out of memory, we will only take half of the data at this stage, in a later model I will look at using all of the data

### ok before we continue lets turn that data into a dataframe

In [33]:
def sentiment_seperator(text, splitStr='__label__'):
    review_transform = []
    for review in range(len(text)):
        reviews = str(text[review])
        split_reviews = reviews.split(splitStr)
        sentiment_score = split_reviews[1]
        review_final = sentiment_score[2:len(sentiment_score)-1]
        sentiment=sentiment_score[0]
        review_transform.append([review_final,sentiment])
    return review_transform

In [34]:
training_final = sentiment_seperator(training_clean[:train_len//2])

In [35]:
len(training_final)

1800000

In [36]:
test_final = sentiment_seperator(test_clean[:test_len//2])

In [37]:
len(test_final)

200000

In [38]:
train_df = pd.DataFrame(training_final, columns=['review','sentiment'])
test_df = pd.DataFrame(test_final, columns=['review','sentiment'])

In [39]:
train_df.head(10)

Unnamed: 0,review,sentiment
0,stuning even for the non-gamer: this sound tra...,2
1,the best soundtrack ever to anything.: i'm rea...,2
2,amazing!: this soundtrack is my favorite music...,2
3,excellent soundtrack: i truly like this soundt...,2
4,"remember, pull your jaw off the floor after he...",2
5,an absolute masterpiece: i am quite sure any o...,2
6,"buyer beware: this is a self-published book, a...",1
7,glorious story: i loved whisper of the wicked ...,2
8,a five star book: i just finished reading whis...,2
9,whispers of the wicked saints: this was a easy...,2


In [40]:
train_df.describe()

Unnamed: 0,review,sentiment
count,1800000,1800000
unique,1799872,2
top,cable only looks secure: 11/28/08 - buyer bewa...,2
freq,3,903442


In [41]:
test_df.head(10)

Unnamed: 0,review,sentiment
0,great cd: my lovely pat has one of the great v...,2
1,one of the best game music soundtracks - for a...,2
2,batteries died within a year ...: i bought thi...,1
3,"works fine, but maha energy is better: check o...",2
4,great for the non-audiophile: reviewed quite a...,2
5,dvd player crapped out after one year: i also ...,1
6,"incorrect disc: i love the style of this, but ...",1
7,dvd menu select problems: i cannot scroll thro...,1
8,unique weird orientalia from the 1930's: exoti...,2
9,"not an ""ultimate guide"": firstly,i enjoyed the...",1


In [42]:
train_df['sentiment'].value_counts()

2    903442
1    896558
Name: sentiment, dtype: int64

In [43]:
test_df['sentiment'].value_counts()

2    100565
1     99435
Name: sentiment, dtype: int64

In [44]:
X_train = train_df['review']
y_train = train_df['sentiment']
X_test = test_df['review']
y_test = test_df['sentiment']

In [45]:
def binary_y(y_data):
    y = y_data.astype('int32')
    lb=LabelBinarizer(pos_label=1,neg_label=0)
    y_binarized=lb.fit_transform(y)
    y=to_categorical(num_classes=2,y=y_binarized)
    print(y.shape)
    return y

In [46]:
y_train = binary_y(y_train)

(1800000, 2)


In [47]:
y_train

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [0., 1.]])

In [48]:
y_test = binary_y(y_test)

(200000, 2)


In [49]:
max_len = 100000
token=Tokenizer(num_words=max_len,lower=True)

In [66]:
token.fit_on_texts(X_train)

In [51]:
sequence = token.texts_to_sequences(X_train)

In [52]:
X_train_final=pad_sequences(sequence,maxlen=100)

In [53]:
X_train_final.shape,y_train.shape

((1800000, 100), (1800000, 2))

In [54]:
X_train_final

array([[    0,     0,     0, ...,  2572,     5,   304],
       [    0,     0,     0, ...,   163,   156,  2619],
       [    1,   169,     2, ...,     7,   424,   247],
       ...,
       [    0,     0,     0, ...,   196,    35, 12493],
       [    0,     0,     0, ...,     8,     5,   232],
       [    0,     0,     0, ...,     1,  1695,    53]])

In [55]:
### Hyperparameters

epochs = 3
batch_size = 512
validation_split = 0.10
Embed = X_train_final.shape

In [96]:
Embed[1]

100

In [57]:
model = Sequential()
model.add(Embedding(max_len,Embed[1]))
model.add(LSTM(256,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(512))
model.add(Dropout(0.2))
model.add(Dense(500,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(100,activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(2,activation='sigmoid'))


Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [58]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         10000000  
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 256)         365568    
_________________________________________________________________
dropout_1 (Dropout)          (None, None, 256)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 512)               1574912   
_________________________________________________________________
dropout_2 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 500)               256500    
_________________________________________________________________
dropout_3 (Dropout)          (None, 500)               0         
__________

In [59]:
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['acc'])


Instructions for updating:
keep_dims is deprecated, use keepdims instead


In [60]:
model.fit(X_train_final,y_train,
          batch_size=batch_size,
          epochs=epochs,
          verbose=1,
          validation_split=validation_split)

Train on 1620000 samples, validate on 180000 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x18d05a9fc88>

In [129]:
token.fit_on_texts(X_test)

In [130]:
X_test_sequence = token.texts_to_sequences(X_test)

In [131]:
X_test_final=pad_sequences(X_test_sequence,maxlen=100)

In [132]:
X_train_final.shape,y_test.shape,X_test_final.shape

((1800000, 100), (200000, 2), (200000, 100))

In [133]:
model.evaluate(X_test_final,y_test)



[0.514247350269556, 0.79407]

### Accuracy is 79%, which is ok but could be better with some more tweaking of the parameters, and also using more of the data - feeding it out of a database.

### However in the meantime let's do some testing with random reviews

In [141]:
def predict(text):
    print(text)
    new_text = text
    text_sequence = token.texts_to_sequences(new_text)
    new_text_final = pad_sequences(text_sequence,maxlen=100)
    prediction = model.predict(new_text_final)
    pred = np.argmax(prediction)
    print("The outcome score is {}".format(np.argmax(prediction)))
    if pred == 0:
        print("This is a negative review")
    else:
        print("This is a positive review")
    

In [142]:
test_1 = "This is a pretty horrible product"

In [143]:
predict(test_1)

This is a pretty horrible product
The outcome score is 0
This is a negative review


In [144]:
test_2 = "I would love to watch this again, it was an amazing show and I wish there was a whole series"

In [145]:
predict(test_2)

I would love to watch this again, it was an amazing show and I wish there was a whole series
The outcome score is 26
This is a positive review


In [148]:
test_3 = "I didn't really like the movie, but it wasn't totally bad as the special effects were good"

In [149]:
predict(test_3)

I didn't really like the movie, but it wasn't totally bad as the special effects were good
The outcome score is 14
This is a positive review
