In [1]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 

warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [2]:
df  = pd.read_csv('train.csv',encoding='latin-1')
df.drop(['ItemID'], axis=1, inplace=True)

In [3]:
df.head()

Unnamed: 0,Sentiment,SentimentText
0,0,is so sad for my APL frie...
1,0,I missed the New Moon trail...
2,1,omg its already 7:30 :O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...
4,0,i think mi bf is cheating on me!!! ...


# Normalize Data

In [4]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt  

In [5]:
# remove twitter handles
df['CleanSenText'] = np.vectorize(remove_pattern)(df['SentimentText'], "@[\w]*")

In [6]:
df[109:115] 

# used 109:115 because includes @user text; verifies @ were removed

Unnamed: 0,Sentiment,SentimentText,CleanSenText
109,0,so whats the status on next weekend,so whats the status on next weekend
110,0,sorry @gigi4462 The Ex Husband has overdosed...,sorry The Ex Husband has overdosed on his d...
111,0,Thanks for your definition of throwbie! Edi...,Thanks for your definition of throwbie! Edi...
112,1,"Thanks, I need all the help i can get.","Thanks, I need all the help i can get."
113,1,- that explains alot.,- that explains alot.
114,1,There's going to be a Heathers sequel. Wino...,There's going to be a Heathers sequel. Wino...


In [7]:
# remove special characters, punctuations
df['CleanSenText'] = df['CleanSenText'].str.replace("[^a-zA-Z0-9#]", " ")

In [8]:
df.head()

Unnamed: 0,Sentiment,SentimentText,CleanSenText
0,0,is so sad for my APL frie...,is so sad for my APL frie...
1,0,I missed the New Moon trail...,I missed the New Moon trail...
2,1,omg its already 7:30 :O,omg its already 7 30 O
3,0,.. Omgaga. Im sooo im gunna CRy. I'...,Omgaga Im sooo im gunna CRy I ...
4,0,i think mi bf is cheating on me!!! ...,i think mi bf is cheating on me ...


Now we will tokenize all the cleaned tweets in our dataset. Tokens are individual terms or words, and tokenization is the process of splitting a string of text into tokens.

In [9]:
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer

tt = TweetTokenizer()

label=list(df.Sentiment)
text=df['CleanSenText'].apply(tt.tokenize)

X_train, X_test, y_train, y_test = train_test_split(text, label, test_size=0.2, random_state=628)

In [10]:
X_train[0:5]

18068    [oxygen, Love, the, atmosphere, http, myloc, m...
31602    [HAHAH, YES, he, s, so, cute, and, he, has, th...
76299                                [we, r, all, special]
73950    [Yes, that, s, different, Unless, they, have, ...
520      [I, feel, irresponsible, and, sort, of, horrib...
Name: CleanSenText, dtype: object

In [11]:
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)
vocab = tokenizer.word_index

X_train_word_ids = tokenizer.texts_to_sequences(X_train)
X_test_word_ids = tokenizer.texts_to_sequences(X_test)
x_train = pad_sequences(X_train_word_ids, maxlen=50)
x_test= pad_sequences(X_test_word_ids, maxlen=50)

Using TensorFlow backend.


In [12]:
x_train[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,  6447,    46,     3,
        7409,    65,  1109,    15, 31660], dtype=int32)

In [13]:
#vocab

In [14]:
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
import numpy as np
from keras.optimizers import Adam
from keras.optimizers import RMSprop
from keras.callbacks import EarlyStopping

num_words = 125000
embedding_matrix = np.zeros((num_words, 50))

e1 = Embedding(num_words, 50, input_length=50, trainable=True)
e2 = Embedding(num_words, 50, input_length=50, trainable=True)
e3 = Embedding(num_words, 50, input_length=50, trainable=True)

In [15]:
import keras.utils
y_train_cat = keras.utils.to_categorical(y_train)
y_test_cat = keras.utils.to_categorical(y_test)

In [16]:
# I did try optimizer=RMSprop(), sgd, adam
# also tried learning rate of 0.1, 0.01, 0.001 
# also tried categorizing the y
# per the instructions here: https://stackoverflow.com/questions/37213388/keras-accuracy-does-not-change

#model_cnn.reset_states() #clear previous model weights

model_cnn = Sequential()
model_cnn.add(e1)
model_cnn.add(Dropout(.5))
model_cnn.add(Conv1D(filters=100, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(Dropout(.5))
model_cnn.add(Conv1D(filters=50, kernel_size=2, padding='valid', activation='relu', strides=1))
model_cnn.add(Dropout(.5))
model_cnn.add(GlobalMaxPooling1D())
model_cnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(.5))
model_cnn.add(Dense(2, activation='sigmoid'))
adamop=Adam(lr=0.001) 
model_cnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_cnn.summary())
#Changed y to _cat below to try to fix repeating val_acc
model_cnn.fit(x_train, y_train_cat, validation_data=(x_test, y_test_cat), epochs=5, 
              batch_size=64, verbose=2, callbacks=[EarlyStopping(monitor='val_acc',min_delta=0.001)])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            6250000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 49, 100)           10100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 49, 100)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 48, 50)            10050     
_________________________________________________________________
dropout_3 (Dropout)  

<keras.callbacks.History at 0x7fd9b52b3cc0>

In [17]:
# Final evaluation of the model
score_cnn = model_cnn.evaluate(x_test, y_test_cat, verbose=2)
print("Accuracy: %.2f%%" % (score_cnn[1]*100))

Accuracy: 74.64%


In [18]:
from keras.layers import LSTM
from keras.layers import Dropout
from keras.layers import RNN
from keras.layers import SimpleRNN

In [19]:
#model_rnn.reset_states() #clear previous model weights

# create the model
model_rnn = Sequential()
model_rnn.add(e2)
model_cnn.add(Dropout(.5))
model_rnn.add(SimpleRNN(128, activation='relu'))
model_rnn.add(Dense(64, activation='relu'))
model_cnn.add(Dropout(.5))
model_rnn.add(Dense(1, activation='sigmoid'))
adamop=Adam(lr=0.001) 
model_rnn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_rnn.summary())
model_rnn.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=64, verbose=2,callbacks=[EarlyStopping(monitor='val_acc',min_delta=0.01)])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 50)            6250000   
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 128)               22912     
_________________________________________________________________
dense_3 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 65        
Total params: 6,281,233
Trainable params: 6,281,233
Non-trainable params: 0
_________________________________________________________________
None
Train on 79991 samples, validate on 19998 samples
Epoch 1/5
 - 98s - loss: 0.5096 - acc: 0.7456 - val_loss: 0.4693 - val_acc: 0.7703
Epoch 2/5
 - 97s - loss: 0.3863 - acc: 0.8268 - val_loss: 0.5067 - val_acc: 0.7695


<keras.callbacks.History at 0x7fd9b403f860>

In [20]:
# Final evaluation of the model
score_rnn = model_rnn.evaluate(x_test, y_test, verbose=2)
print("Accuracy: %.2f%%" % (score_rnn[1]*100))

Accuracy: 76.95%


In [24]:
#model_lstm.reset_states()

# create the model
model_lstm = Sequential()
model_lstm.add(e3)
model_lstm.add(Dropout(0.5))
model_lstm.add(LSTM(100))
model_lstm.add(Dropout(0.5))
model_lstm.add(Dense(1, activation='sigmoid'))
adamop=Adam(lr=0.001) 
model_lstm.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model_lstm.summary())
model_lstm.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=64, verbose=2,callbacks=[EarlyStopping(monitor='val_acc',min_delta=.01)])
#model_lstm.fit(x_train, y_train, validation_data=(x_train, y_train), validation_split = 0.2, epochs=5, batch_size=64, verbose=2,callbacks=[EarlyStopping(monitor='val_acc',min_delta=.01)])
#model_lstm.fit(x_train, y_train, epochs=5, batch_size=64, verbose=2, validation_split = 0.1)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            6250000   
_________________________________________________________________
dropout_7 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               60400     
_________________________________________________________________
dropout_8 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 101       
Total params: 6,310,501
Trainable params: 6,310,501
Non-trainable params: 0
_________________________________________________________________
None
Train on 79991 samples, validate on 19998 samples
Epoch 1/5
 - 160s - loss: 0.5225 - acc: 0.7388 - val_loss: 0.4774 - val_acc: 

<keras.callbacks.History at 0x7fd98845d668>

In [25]:
# Final evaluation of the model
score_lstm = model_lstm.evaluate(x_test, y_test, verbose=1)
print("Accuracy: %.2f%%" % (score_lstm[1]*100))

Accuracy: 76.76%
