In [0]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from matplotlib import pyplot
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re
from sklearn.preprocessing import LabelEncoder

# Load the labelled tweets and keep only the necessary columns.
# .copy() detaches the two-column slice so the assignments below write to a
# standalone frame rather than to a view of the frame returned by read_csv.
data = pd.read_csv('Sentiment.csv')
data = data[['text', 'sentiment']].copy()

# Normalise the text: lower-case it, then drop every character that is not a
# letter, digit or whitespace.
#  * The class is spelled A-Z; the previous `A-z` range also matched the ASCII
#    characters that sit between 'Z' and 'a' ([ \ ] ^ _ and the backtick).
#  * The pattern is a raw string so that `\s` reaches the regex engine
#    untouched instead of being parsed as a (invalid) string escape.
data['text'] = data['text'].str.lower()
data['text'] = data['text'].str.replace(r'[^a-zA-Z0-9\s]', '', regex=True)

# Remove the retweet marker "rt".
# The earlier version looped over data.iterrows() and assigned to row[0];
# pandas documents that iterrows() may yield copies and that writing to them
# may have no effect, so the frame was not reliably updated.  It also used a
# plain substring replace, which would cut "rt" out of words such as "part".
# A vectorised, word-bounded replacement fixes both problems.
data['text'] = data['text'].str.replace(r'\brt\b', ' ', regex=True)

# Vocabulary cap: passed to Tokenizer(num_words=...) here and used as the
# Embedding input dimension in createmodel().
max_fatures = 2000
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)

# Convert every tweet to a list of word ids, then pad to a rectangular array.
# No maxlen is given, so pad_sequences pads to the longest sequence present;
# createmodel() reads that length back through X.shape[1].
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

embed_dim = 128  # second argument to the Embedding layer
lstm_out = 196   # first argument (unit count) to the LSTM layer


def createmodel():
    """Build and compile the LSTM text classifier.

    Takes no arguments; reads the module-level ``max_fatures``, ``embed_dim``,
    ``lstm_out`` and ``X`` (``X.shape[1]`` is used as the input length).  It
    is passed as ``build_fn`` to ``KerasClassifier`` elsewhere in the
    notebook.

    Returns:
        A compiled ``Sequential`` model consisting of an Embedding layer, an
        LSTM layer and a 3-unit softmax Dense layer, trained with
        categorical cross-entropy and the Adam optimizer.
    """
    net = Sequential([
        Embedding(max_fatures, embed_dim, input_length=X.shape[1]),
        LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
        Dense(3, activation='softmax'),
    ])
    net.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net
# print(model.summary())

# Map the string sentiment labels to integer ids, then to one-hot rows.
labelencoder = LabelEncoder()
integer_encoded = labelencoder.fit_transform(data['sentiment'])
y = to_categorical(integer_encoded)

# Hold out 33% of the samples for evaluation; the seed fixes the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Fit a fresh model for one epoch, then score it on the held-out split.
batch_size = 32
model = createmodel()
model.fit(X_train, Y_train, batch_size=batch_size, epochs=1, verbose=2)
score, acc = model.evaluate(X_test, Y_test, batch_size=batch_size, verbose=2)

# One value per line: loss, accuracy, then the metric names for reference.
print(score, acc, model.metrics_names, sep='\n')

# Persist the trained network so a later cell can reload it from disk.
model.save("twitter_sentiment_model.h5")


Epoch 1/1
 - 37s - loss: 0.8390 - acc: 0.6401
0.7619723086015477
0.6714722586282219
['loss', 'acc']


# Question 1

In [0]:
from keras.models import load_model

# Reload the network saved to disk by the training cell.
loaded_model = load_model('twitter_sentiment_model.h5')

# One-row frame holding the sentence to classify.  A plain list is enough;
# the previous np.array(...) wrapper relied on numpy, which this notebook
# never imports, and so raised NameError on a fresh kernel.
data = pd.DataFrame({'text': ["A lot of good things are happening. We are respected again throughout the world, and that's a great thing"]})

# Lower-case and strip everything except letters, digits and whitespace.
# The class is A-Z (not A-z, which also matches [ \ ] ^ _ and the backtick)
# and the pattern is a raw string so `\s` is not treated as a string escape.
data['text'] = data['text'].apply(lambda x: x.lower())
data['text'] = data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Encode with the already-fitted tokenizer and pad to the sequence length the
# loaded model was built with, instead of a hard-coded 28.
X2 = tokenizer.texts_to_sequences(data['text'].values)
X2 = pad_sequences(X2, maxlen=loaded_model.input_shape[1])

# The index of the largest softmax output is the predicted class id.  This
# replaces Sequential.predict_classes, which newer Keras releases removed,
# and works on old and new versions alike.
predicted_ids = loaded_model.predict(X2).argmax(axis=-1)
print(labelencoder.inverse_transform(predicted_ids)[0])

Negative


# Question 2

Here we have reduced the search grid to a single batch_size (10) and a single number of epochs (1), because searching over more or larger values took too long to execute.

In [70]:
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# Wrap the model factory so scikit-learn's search can build and fit it.
model = KerasClassifier(build_fn=createmodel, verbose=0)

# Candidate values for the search: one batch size and one epoch count.
batch_size = [10]
epochs = [1]
param_grid = {'batch_size': batch_size, 'epochs': epochs}

# Run the grid search on the training split.
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, Y_train)

# Report the best score and the parameters that produced it.
print(
    "Best: %f using %s"
    % (grid_result.best_score_, grid_result.best_params_)
)



Best: 0.663295 using {'batch_size': 10, 'epochs': 1}


# Question 3

In [0]:
# Load the spam dataset, keep the two columns used here and give them
# descriptive names.  Chaining rename() onto the slice yields a new frame, so
# the column assignments below do not write into a view of the raw frame.
spam_data = pd.read_csv('spam.csv', encoding='latin')
spam_data = spam_data[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'text'})

# Lower-case and strip everything except letters, digits and whitespace.
# The class is A-Z (not A-z, which also matches [ \ ] ^ _ and the backtick)
# and the pattern is a raw string so `\s` is not treated as a string escape.
spam_data['text'] = spam_data['text'].apply(lambda x: x.lower())
spam_data['text'] = spam_data['text'].apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x))

# Encode with the tokenizer fitted earlier in the notebook and pad to
# X.shape[1]: createmodel() uses that value as the Embedding input_length, so
# the two must agree.  (Previously a hard-coded 28.)
X3 = tokenizer.texts_to_sequences(spam_data['text'].values)
X3 = pad_sequences(X3, maxlen=X.shape[1])

new_encoder = LabelEncoder()
new_int_encoder = new_encoder.fit_transform(spam_data['label'])
# createmodel() ends in Dense(3), so the one-hot width is forced to 3 to
# match that output layer.
# NOTE(review): presumably the spam labels have fewer than 3 classes, leaving
# an always-zero column — confirm against the data.
y_c = to_categorical(new_int_encoder, num_classes=3)
X_train, X_test, Y_train, Y_test = train_test_split(X3, y_c, test_size=0.33, random_state=43)

model2 = KerasClassifier(build_fn=createmodel, verbose=0)
grid2 = GridSearchCV(estimator=model2, param_grid=param_grid)
# Fit the search object built for this dataset.  The previous code called
# grid.fit(...), i.e. the Question 2 search object, leaving grid2 unused and
# overwriting the earlier fitted search.
grid_result = grid2.fit(X_train, Y_train)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))



Best: 0.969997 using {'batch_size': 10, 'epochs': 1}
