In [1]:
import os
import pickle
import re
from os import getcwd

import nltk
import numpy as np
import pandas as pd
import tensorflow as tf
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk.corpus import twitter_samples
from tensorflow import keras

In [2]:
# Fetch the NLTK `twitter_samples` corpus used below (the recorded output shows
# it is skipped when the package is already up to date).
nltk.download('twitter_samples')

[nltk_data] Downloading package twitter_samples to
[nltk_data]     C:\Users\Paulius\AppData\Roaming\nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!


True

# Importing and cleaning data
Only the positive and negative tweets are used (there is no neutral class), and the labels are encoded as 1 (positive) and 0 (negative) so that they match the output range of the sigmoid activation function.

In [3]:
# Read the raw tweet texts for each sentiment class from the NLTK corpus.
all_positive_tweets, all_negative_tweets = (
    twitter_samples.strings(corpus_file)
    for corpus_file in ('positive_tweets.json', 'negative_tweets.json')
)

In [4]:
# The first 4000 tweets of each class are used for training,
# everything after that is held out for testing.
n_train_per_class = 4000

train_pos, test_pos = (
    all_positive_tweets[:n_train_per_class],
    all_positive_tweets[n_train_per_class:],
)
train_neg, test_neg = (
    all_negative_tweets[:n_train_per_class],
    all_negative_tweets[n_train_per_class:],
)

# Positives first, negatives second -- the label arrays are built in the same order.
train_x = [*train_pos, *train_neg]
test_x = [*test_pos, *test_neg]

In [5]:
# Peek at one raw training tweet before any cleaning.
train_x[5]

'@BhaktisBanter @PallaviRuhail This one is irresistible :)\n#FlipkartFashionFriday http://t.co/EbZ0L2VENM'

In [6]:
# Column-vector labels in the same order as the texts: 1.0 for every positive
# tweet followed by 0.0 for every negative tweet.
train_y = np.concatenate(
    [np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1))],
    axis=0,
)
test_y = np.concatenate(
    [np.ones((len(test_pos), 1)), np.zeros((len(test_neg), 1))],
    axis=0,
)

In [7]:
# Sanity check: one label row per tweet.
print(f"train_y.shape = {train_y.shape}")
print(f"test_y.shape = {test_y.shape}")

train_y.shape = (8000, 1)
test_y.shape = (2000, 1)


# Preprocessing data
First we lowercase all text and strip every character that is not a letter, a digit, or whitespace. Then we map the 5000 most frequent words to integer ids and pad the resulting sequences so they all have the same length (256).

In [8]:
# Matches every character that is NOT an ASCII letter, digit or whitespace.
# The range is written A-Z (not A-z): "A-z" also spans the six ASCII characters
# between 'Z' and 'a' ([ \ ] ^ _ `), which would then survive the filter.
# A raw string is used so that \s is a regex escape, not an invalid Python
# string escape.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')

# Lowercase each tweet, then delete the unwanted characters.
train_x = [non_alnum.sub('', tweet.lower()) for tweet in train_x]
test_x = [non_alnum.sub('', tweet.lower()) for tweet in test_x]

In [9]:
# Same tweet as before, now lowercased and stripped by the regex above.
train_x[5]

'bhaktisbanter pallaviruhail this one is irresistible \nflipkartfashionfriday httptcoebz0l2venm'

In [10]:
# Build the vocabulary from the TRAINING tweets only. Fitting on the test
# tweets as well would leak test-set word statistics into the encoding and
# make the evaluation optimistic.
# With num_words=5000 only the most frequent words (ids below 5000) are kept
# when texts are converted to sequences; rarer words are dropped.
tokenizer = Tokenizer(num_words=5000, split=' ')
tokenizer.fit_on_texts(train_x)

# Encode each tweet as a list of word ids, then pad with zeros to a fixed
# length of 256 so all rows have the same shape.
train_x = tokenizer.texts_to_sequences(train_x)
train_x = pad_sequences(train_x, maxlen = 256)
test_x = tokenizer.texts_to_sequences(test_x)
test_x = pad_sequences(test_x, maxlen = 256)

In [11]:
# Same tweet again, now as a zero-padded row of word ids.
train_x[5]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

# Saving the tokenizer, because we will need it in the web app

In [17]:
# Make sure the output folder exists so this cell also works on a fresh
# checkout (open(..., "wb") does not create missing parent directories).
os.makedirs("./models", exist_ok=True)

# Persist the fitted tokenizer so the exact same word -> id mapping can be
# reloaded at inference time.
with open("./models/tokenizer.pickle", "wb") as tok:
    pickle.dump(tokenizer, tok, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# (number of training tweets, padded sequence length)
train_x.shape

(8000, 256)

In [19]:
# Binary sentiment classifier: word embeddings -> LSTM -> single sigmoid unit.
model = keras.Sequential([
    # input_dim is the vocabulary size. The tokenizer only emits ids below
    # num_words, so that is the number of embedding rows actually needed
    # (the previous hard-coded 8000 was the number of training tweets).
    keras.layers.Embedding(tokenizer.num_words, 256, input_length = train_x.shape[1]),
    # Drops whole embedding channels rather than individual values.
    keras.layers.SpatialDropout1D(0.2),
    keras.layers.LSTM(128, dropout = 0.2, recurrent_dropout = 0.2),
    # One probability in [0, 1]: close to 1 = positive, close to 0 = negative.
    keras.layers.Dense(1, activation = "sigmoid")
])

In [None]:
model.compile(loss="binary_crossentropy", optimizer = "adam", metrics=['accuracy'])

# Keras carves the validation set out of the LAST `validation_split` fraction
# of the arrays, *before* any shuffling. The training data is ordered as all
# positive tweets followed by all negative tweets, so without shuffling the
# validation set would contain only negative examples and the remaining
# training portion would be heavily skewed towards positives.
# Shuffle texts and labels with the same (seeded, reproducible) permutation.
shuffle_idx = np.random.default_rng(42).permutation(len(train_x))

model.fit(
    train_x[shuffle_idx],
    train_y[shuffle_idx],
    epochs = 15,
    validation_split = 0.33,
    batch_size = 32,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15

In [None]:
# Loss and accuracy on the held-out test tweets.
model.evaluate(x=test_x, y=test_y)

In [None]:
# Quick smoke test on a hand-written sentence.
# NOTE: this sample contains no punctuation; text that does should first get
# the same lowercasing / regex cleaning that was applied to the training tweets.
post_text = ["I hate this nonsense film"]
post_seq = tokenizer.texts_to_sequences(post_text)

# Pad to the same sequence length the model was trained on (train_x.shape[1])
# instead of an unrelated hard-coded length.
post = pad_sequences(post_seq, maxlen=train_x.shape[1], dtype='int32', value=0)
print(post)

# predict returns an array of shape (1, 1); pick the scalar explicitly rather
# than calling float() on a 2-D array (deprecated in recent NumPy versions).
prediction = model.predict(post)
round(float(prediction[0][0]))


In [None]:
# Write the trained network to disk in HDF5 format.
model_path = "./models/Model.h5"
model.save(model_path)