In [1]:
# Ref: https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Conv2D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout, Conv1D,Input,MaxPooling1D,Flatten
import re
import os
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder


4
from numpy.random import seed
seed(1)
from tensorflow import set_random_seed
set_random_seed(2)## Read data

In [2]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 400000
# Max number of words in each dialogue.
MAX_SEQUENCE_LENGTH = 180
# This is fixed.
EMBEDDING_DIM = 100

# Tagging

In [3]:
data = pd.read_csv("../Database/ISEAR/isear_databank.csv")
data = data[['Field1', 'SIT']]

In [4]:
import nltk

In [5]:
sentence = "?"
text = nltk.word_tokenize(sentence)
nltk.pos_tag(text)

[('?', '.')]

In [6]:
eliminator = ['CC', 'CD', 'DT', 'EX', 'IN', 'LS', 'NNP', 'PDT', 'POS', 'PRP', 'PRP$', 'UH', 'WDT', 'WP', 'WRB','SYM','.']
sym = list('!"#$%&()*+,-./:;<=>?@[\]^_`{|}~รก\n')
for i in range(len(data['SIT'])):
    cur_tok = nltk.word_tokenize(data['SIT'][i])
    cur_tok = nltk.pos_tag(cur_tok)
    rest = []
    for word, t in cur_tok :
        if not (t in eliminator or word in sym):
            rest.append(word)
    data['SIT'][i] = rest
data.head(10)

Unnamed: 0,Field1,SIT
0,joy,"[period, falling, love, time, met, especially,..."
1,fear,"[was, involved, traffic, accident]"
2,anger,"[was, driving, home, several, days, hard, work..."
3,sadness,"[lost, person, meant, most, to]"
4,disgust,"[time, knocked, deer, down, sight, animal, inj..."
5,shame,"[did, not, speak, truth]"
6,guilt,"[caused, problems, somebody, could, not, keep,..."
7,joy,"[got, letter, offering, job, had, applied]"
8,fear,"[was, going, home, alone, night, man, came, up..."
9,anger,"[was, talking, to, party, first, time, long, w..."


In [7]:
MAX_NB_WORDS = 10000
tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~รก\n', lower=True)
tokenizer.fit_on_texts(data['SIT'].values)
word_index = tokenizer.word_index
print(len(word_index))

8583


In [8]:
X = tokenizer.texts_to_sequences(data['SIT'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
print('Shape of data tensor:', X.shape)

Shape of data tensor: (7666, 180)


# Covert to Matrix

In [9]:
Z = np.zeros((len(X), len(word_index)))
for i in range(len(X)):
    seq = np.zeros(len(word_index))
    for j in X[i]:
        seq[j-1] += 1
    Z[i] = seq
print(np.sum(Z))

1379880.0


In [10]:
Z = np.expand_dims(Z,2)
print(Z.shape)

(7666, 8583, 1)


In [11]:
#TODO: apply the column for data Y
Y = pd.get_dummies(data[ 'Field1']).values
print('Shape of label tensor:', Y.shape, Y[3])

Shape of label tensor: (7666, 7) [0 0 0 0 0 1 0]


In [12]:
K = np.array([np.where(i == 1)[0][0] for i in Y])

In [13]:
X_train, X_test, Y_train, Y_test = train_test_split(Z,K, test_size = 0.30, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(5366, 8583, 1) (5366,)
(2300, 8583, 1) (2300,)


## start DL stuff

In [None]:
epochs = 10
batch_size = 32
stop = [EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
history = model.fit(X_train, Y_train, epochs=epochs, shuffle=True, batch_size=batch_size,validation_split=0.1,callbacks=None)

In [None]:
accr = model.evaluate(X_test,Y_test)
print(accr)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show();

In [None]:
plt.title('Accuracy')
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='test')
plt.legend()
plt.show();

## TODO: actually use the trained model to experiment somehow

In [9]:
# Ref: https://towardsdatascience.com/multi-class-text-classification-with-lstm-1590bee1bd17
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras import Model
from keras.layers import Dense, Bidirectional, Embedding, LSTM, SpatialDropout1D, Input, Concatenate, GaussianNoise
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.layers import Dropout, Flatten
import os
import io
import tensorflow as tf
import sys

"""
Import scripts
"""
# Append the source code directory in to file
sys.path.append("../src") 
from preprocessing import *
from embedding import *
from model_builder import *

"""
Define Constant
"""
# Max number of words in each dialogue.
MAX_SEQUENCE_LENGTH = 30
PLOT = False
SIMPLE_CATEGORY = True
CATEGORY_NUM = 5000

"""
Read data and preprocess it
"""
data = pd.read_csv('../Dataset/Tweets.csv')
text_col = 'content'
data = preprocess_text(data,text_col)
# Shuffle data
data = data.sample(frac=1).reset_index(drop=True)

"""
Convert into 3 categories
"""
if SIMPLE_CATEGORY:
    # Categorize into 3 categories
    data.loc[data['sentiment'] == 'anger'] = 'negative'
    data.loc[data['sentiment'] == 'hate'] = 'negative'
    data.loc[data['sentiment'] == 'worry'] = 'negative'
    data.loc[data['sentiment'] == 'sadness'] = 'negative'
    data.loc[data['sentiment'] == 'boredom'] = 'negative'
    data.loc[data['sentiment'] == 'relief'] = 'positive'
    data.loc[data['sentiment'] == 'happiness'] = 'positive'
    data.loc[data['sentiment'] == 'love'] = 'positive'
    data.loc[data['sentiment'] == 'enthusiasm'] = 'positive'
    data.loc[data['sentiment'] == 'surprise'] = 'positive'
    data.loc[data['sentiment'] == 'fun'] = 'positive'
    data.loc[data['sentiment'] == 'empty'] = 'neutral'
    # divide into 3 data to maintain a evenly distributed dataset
    neutral_data = data.loc[data['sentiment'] == 'neutral']
    negative_data = data.loc[data['sentiment'] == 'negative']
    positive_data = data.loc[data['sentiment'] == 'positive']
    # Obtain 5000 from each category
    data = pd.concat([neutral_data.sample(n=CATEGORY_NUM),
                      negative_data.sample(n=CATEGORY_NUM),
                      positive_data.sample(n=CATEGORY_NUM)])
    

"""
Create Embedding Layer
"""
embeddings, dim = get_embeddings("../glove/glove.6B.100d.txt")
tokenizer = get_tokenizer([' '.join(list(embeddings.keys()))])
embedding_matrix = get_embedding_matrix(embeddings, tokenizer.word_index, dim)

"""
Create training and testing sets
"""
X = tokenizer.texts_to_sequences(data['content'].values)
X = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.50, random_state = 42)

In [10]:
from SVM import svm_classify
Z_test =[np.where(i==1)[0][0] for i in Y_test]
Z_train =[np.where(i==1)[0][0] for i in Y_train]
svm_classify(X_train, Z_train, X_test, Z_test)

SVM Accuracy Score ->  52.93333333333333


In [11]:
len(neutral_data)

9444