In [55]:
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
import pickle
from keras.callbacks import TensorBoard
from sklearn.preprocessing import LabelEncoder
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

import os
os.environ['KMP_DUPLICATE_LIB_OK']='True'

In [40]:
df = pd.read_csv('Sentiment.csv')
# Keeping only the neccessary columns
df = df[['text','sentiment']]

In [41]:
df.head()

Unnamed: 0,text,sentiment
0,RT @NancyLeeGrahn: How did everyone feel about...,Neutral
1,RT @ScottWalker: Didn't catch the full #GOPdeb...,Positive
2,RT @TJMShow: No mention of Tamir Rice and the ...,Neutral
3,RT @RobGeorge: That Carly Fiorina is trending ...,Positive
4,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,Positive


In [42]:
df['text'] = df['text'].apply(lambda x: x.lower())
df['text'] = df['text'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [43]:
df.head()

Unnamed: 0,text,sentiment
0,rt nancyleegrahn how did everyone feel about t...,Neutral
1,rt scottwalker didnt catch the full gopdebate ...,Positive
2,rt tjmshow no mention of tamir rice and the go...,Neutral
3,rt robgeorge that carly fiorina is trending h...,Positive
4,rt danscavino gopdebate w realdonaldtrump deli...,Positive


In [44]:
df['text']=df.text.str.replace('rt','')

In [45]:
df.head()

Unnamed: 0,text,sentiment
0,nancyleegrahn how did everyone feel about the...,Neutral
1,scottwalker didnt catch the full gopdebate la...,Positive
2,tjmshow no mention of tamir rice and the gopd...,Neutral
3,robgeorge that carly fiorina is trending hou...,Positive
4,danscavino gopdebate w realdonaldtrump delive...,Positive


In [46]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(df['text'].values)
X = tokenizer.texts_to_sequences(df['text'].values)
X = pad_sequences(X)

In [47]:
embed_dim = 128
lstm_out = 196

In [48]:
def createmodel():
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim,input_length = X.shape[1]))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(3,activation='softmax'))
    model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
    return model

In [49]:
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(df['sentiment'])
y = to_categorical(integer_encoded)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [50]:
tensorboard = TensorBoard(log_dir="logs/icp5", histogram_freq=1, write_graph=True, write_images=False)

In [60]:
batch_size = 32
model = createmodel()
hist = model.fit(X_train, Y_train,
                 epochs=7,
                 batch_size=batch_size,
                 verbose=2,callbacks=[tensorboard])

score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
print(score)
print(acc)

Epoch 1/7
 - 31s - loss: 0.8326 - acc: 0.6363
Epoch 2/7
 - 26s - loss: 0.6911 - acc: 0.7047
Epoch 3/7
 - 25s - loss: 0.6173 - acc: 0.7382
Epoch 4/7
 - 25s - loss: 0.5761 - acc: 0.7637
Epoch 5/7
 - 27s - loss: 0.5390 - acc: 0.7797
Epoch 6/7
 - 26s - loss: 0.4999 - acc: 0.7972
Epoch 7/7
 - 26s - loss: 0.4621 - acc: 0.8122
0.9320533190715151
0.6657929226736566


In [56]:
model = KerasClassifier(build_fn=createmodel, verbose=0)
batch_size = [32, 64]
epochs = [1, 2]
param_grid = dict(batch_size=batch_size, epochs=epochs)

grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1)
grid_result = grid.fit(X_train, Y_train)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



Best: 0.678252 using {'batch_size': 64, 'epochs': 2}


In [59]:
model = createmodel()
hist = model.fit(X_train, Y_train,
                 epochs=2,
                 batch_size=64,
                 verbose=2)

score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=64)
print(score)
print(acc)

Epoch 1/2
 - 21s - loss: 0.8501 - acc: 0.6311
Epoch 2/2
 - 16s - loss: 0.6965 - acc: 0.7035
0.7470867430710699
0.6823940586710455


In [15]:
filename = 'final_model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [16]:
loaded_model = pickle.load(open(filename, 'rb'))

In [17]:
loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [27]:
new_string = ["A lot of good things are happening. We are respected again throughout the world, " \
             "and that's a great thing.@realDonaldTrump"]
# new_string = [[new_string]]
new_df = pd.DataFrame(new_string, columns=['data'])

In [28]:
new_df

Unnamed: 0,data
0,A lot of good things are happening. We are res...


In [29]:
new_df['data'] = new_df['data'].apply(lambda x: x.lower())
new_df['data'] = new_df['data'].apply((lambda x: re.sub('[^a-zA-z0-9\s]', '', x)))

In [35]:
for x in new_df['data']:
    print(x)

a lot of good things are happening we are respected again throughout the world and thats a great thingrealdonaldtrump


In [70]:
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(new_df['data'].values)
X = tokenizer.texts_to_sequences(new_df['data'].values)
# print(X)
X = pad_sequences(X, maxlen=28)

print(loaded_model.predict(X))
# print("====== the input vector")
# print(X)

[[0.51476616 0.13547854 0.34975532]]


In [66]:
t  = Tokenizer()
fit_text = ["The earth is an awesome place live"]
t.fit_on_texts(fit_text)

In [69]:
test_text1 = "The earth is an great place live"
test_text2 = "The is my program"
sequences = t.texts_to_sequences([test_text1, test_text2])

print('sequences : ',sequences,'\n')

print('word_index : ',t.word_index)

sequences :  [[1, 2, 3, 4, 6, 7], [1, 3]] 

word_index :  {'the': 1, 'earth': 2, 'is': 3, 'an': 4, 'awesome': 5, 'place': 6, 'live': 7}
