# Amazon reviews for sentimental analysis

In [1]:
import pandas as pd
import numpy as np
import bz2
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
train_file = bz2.BZ2File('./train.ft.txt.bz2')
test_file = bz2.BZ2File('./test.ft.txt.bz2')

In [4]:
train_lines = train_file.readlines()
test_lines = test_file.readlines()

In [5]:
del train_file, test_file

In [6]:
train_file_lines = [x.decode('utf-8') for x in train_lines]
test_file_lines = [x.decode('utf-8') for x in test_lines]

In [7]:
del train_lines, test_lines

### Preprocessing

In [8]:
train_file_lines[0]

'__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^\n'

In [9]:
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file_lines]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file_lines]


In [10]:
train_sentences[0]

'stuning even for the non-gamer: this sound track was beautiful! it paints the senery in your mind so well i would recomend it even to people who hate vid. game music! i have played the game chrono cross but out of all of the games i have ever played it has the best music! it backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. it would impress anyone who cares to listen! ^_^'

In [11]:
import re

In [12]:
for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d','0',train_sentences[i])

In [13]:
test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file_lines]
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file_lines]

In [14]:
for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d','0',test_sentences[i])

In [15]:
del train_file_lines, test_file_lines

### Cleaning the text file

In [19]:
#testnp = np.array(train_sentences)

In [22]:
#del testnp

In [23]:
def clean_text(texts):
    texts = texts.replace('\n', ' ')
    if 'www.' in texts or 'http:' in texts or 'https:' in texts or '.com' in texts:
        texts = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "website", texts)
    return texts

In [25]:
train_sentences = [clean_text(i) for i in train_sentences]

In [26]:
#test['text'] = test['text'].apply(lambda x: clean_text(x)
test_sentences = [clean_text(i) for i in test_sentences]

### Tensorflow

In [42]:
# Hyperparamter

max_features = 20000
max_len = 120
pad_stype = 'post'
epochs = 7
OOV = '<OOV>'

In [43]:
tokenizer = Tokenizer(num_words=max_features, oov_token= OOV)
tokenizer.fit_on_texts(train_sentences)

In [29]:
train_sequence = tokenizer.texts_to_sequences(train_sentences)
test_sequence = tokenizer.texts_to_sequences(test_sentences)

In [30]:
train_text = pad_sequences(train_sequence, maxlen=max_len, padding=pad_stype)
test_text = pad_sequences(test_sequence, maxlen=max_len, padding=pad_stype)

In [44]:
word_index = tokenizer.word_index
vocabulary_size = len(word_index)
dim = 100

In [46]:
model = keras.Sequential([
    keras.layers.Embedding(max_features, dim, input_length = max_len),
    keras.layers.Dropout(0.2),
    keras.layers.Conv1D(128,5, activation = 'relu'),
    keras.layers.MaxPooling1D(4),
    keras.layers.Bidirectional(keras.layers.LSTM(64)),
    keras.layers.Dense(64, activation = 'relu'),
    keras.layers.Dense(1, activation = 'sigmoid')
])

In [47]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 120, 100)          2000000   
_________________________________________________________________
dropout_4 (Dropout)          (None, 120, 100)          0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 116, 128)          64128     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 29, 128)           0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                

In [48]:
model.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
history = model.fit(train_text, train_labels, epochs = epochs, validation_data=(test_text, test_labels), verbose = 2 )

Train on 3600000 samples, validate on 400000 samples
Epoch 1/7
3600000/3600000 - 13405s - loss: 0.1642 - accuracy: 0.9371 - val_loss: 0.1545 - val_accuracy: 0.9419
Epoch 2/7
3600000/3600000 - 11752s - loss: 0.1432 - accuracy: 0.9466 - val_loss: 0.1496 - val_accuracy: 0.9436
Epoch 3/7
3600000/3600000 - 11016s - loss: 0.1393 - accuracy: 0.9483 - val_loss: 0.1475 - val_accuracy: 0.9454
Epoch 4/7
3600000/3600000 - 10356s - loss: 0.1384 - accuracy: 0.9488 - val_loss: 0.1476 - val_accuracy: 0.9455
Epoch 5/7
3600000/3600000 - 10831s - loss: 0.1383 - accuracy: 0.9489 - val_loss: 0.1527 - val_accuracy: 0.9433
Epoch 6/7


### Tokenizing and tfidf

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
history

<tensorflow.python.keras.callbacks.History at 0x1b3520ac88>

In [None]:
cv = TfidfVectorizer(max_features=1500, stop_words= 'english')
train_tfidf = cv.fit_transform(train['text']).toarray()

In [24]:
cv.vocabulary_.get(u'best')

134

In [45]:
cv

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=1500,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [25]:
test_tfidf = cv.transform(test['text']).toarray()

In [27]:
train_tfidf_small = train_tfidf[:][0:500000,]

In [28]:
train_score_small = train['labels'][0:500000]

### Logistic regression

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
lg = LogisticRegression(penalty='l2',C=1)
lg.fit(train_tfidf_small,train_score_small)



LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [32]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [33]:
score_pred = lg.predict(test_tfidf)

In [34]:
accuracy_score(score_pred, test['labels'])

0.8829875

In [35]:
confusion_matrix(score_pred, test['labels'])

array([[176547,  23352],
       [ 23453, 176648]])

In [46]:
from sklearn.naive_bayes import MultinomialNB

In [47]:
nb = MultinomialNB()

In [48]:
nb.fit(train_tfidf_small,train_score_small)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [49]:
score_pred = nb.predict(test_tfidf)

In [50]:
accuracy_score(score_pred, test['labels'])

0.8421875