In [35]:
import pandas as pd
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.layers import Embedding

df = pd.read_csv('twitter.csv', encoding='latin-1')
df.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,is so sad for my APL frie...
1,2,0,I missed the New Moon trail...
2,3,1,omg its already 7:30 :O
3,4,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,5,0,i think mi bf is cheating on me!!! ...
5,6,0,or i just worry too much?
6,7,1,Juuuuuuuuuuuuuuuuussssst Chillin!!
7,8,0,Sunny Again Work Tomorrow :-| ...
8,9,1,handed in my uniform today . i miss you ...
9,10,1,hmmmm.... i wonder how she my number @-)


In [36]:
# Clean the text

# Function for clean text
def CleanTxt(text):
    text = re.sub(u'[^a-zA-Z0-9áéíóúÁÉÍÓÚâêîôÂÊÎÔãõÃÕçÇ: ]', '', text)
    return text

# Function for remove stop words
def RemoveStopWords(text):
    tokens = word_tokenize(text)
    tokens = [w.lower() for w in tokens]
    stop_words = set(stopwords.words('english'))
    words = [w for w in tokens if w not in string.punctuation and w not in stop_words]
    new_words = ' '.join(words)

    return new_words

# Cleaning the text
df['SentimentText'] = df['SentimentText'].apply(CleanTxt)
df['SentimentText'] = df['SentimentText'].apply(RemoveStopWords)

# Show the cleaned text
df.head(10)

Unnamed: 0,ItemID,Sentiment,SentimentText
0,1,0,sad apl friend
1,2,0,missed new moon trailer
2,3,1,omg already 7:30
3,4,0,omgaga im sooo im gunna cry ive dentist since ...
4,5,0,think mi bf cheating tt
5,6,0,worry much
6,7,1,juuuuuuuuuuuuuuuuussssst chillin
7,8,0,sunny work tomorrow tv tonight
8,9,1,handed uniform today miss already
9,10,1,hmmmm wonder number


In [8]:
tweet_df = df[['SentimentText','Sentiment']]
tweet_df.head(10)

Unnamed: 0,SentimentText,Sentiment
0,"[sad, apl, friend]",0
1,"[missed, new, moon, trailer]",0
2,"[omg, already, 730]",1
3,"[omgg, sooo, im, gunn, cry, dentist, since, 11...",0
4,"[thnk, bf, cheatng, tt]",0
5,"[worry, much]",0
6,"[juuuuuuuuuuuuuuuuussssst, chillin]",1
7,"[sunny, work, tomorrow, tv, tonight]",0
8,"[handed, n, unform, today, mss, already]",1
9,"[hmmmm, wonder, number]",1


In [6]:
tweet = df.SentimentText.values
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(tweet)
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(tweet)
padded_sequence = pad_sequences(encoded_docs, maxlen=200)

In [7]:
# print(tokenizer.word_index)
# print(tweet[0])
# print(encoded_docs[0])
print(padded_sequence[0])

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0  10  18 124   9
   8 266]


In [8]:
embedding_vector_length = 32
model = Sequential()
model.add(Embedding(vocab_size, embedding_vector_length,     
                                     input_length=200) )
model.add(SpatialDropout1D(0.25))
model.add(LSTM(50, dropout=0.5, recurrent_dropout=0.5))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', 
                           metrics=['accuracy'])
print(model.summary())

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 200, 32)           3807520   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 200, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 50)                16600     
_________________________________________________________________
dropout (Dropout)            (None, 50)                0         
_________________________________________________________________
dense (Dense)              

In [11]:
sentiment_label = df.Sentiment.factorize()

history = model.fit(padded_sequence,sentiment_label[0],
                  validation_split=0.2, epochs=5, batch_size=32)

Train on 79991 samples, validate on 19998 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [16]:
test_word ="This is soo sad"
tw = tokenizer.texts_to_sequences([test_word])
tw = pad_sequences(tw,maxlen=200)
prediction = int(model.predict(tw).round().item())
sentiment_label[1][prediction]

0