In [1]:
from warnings import simplefilter 
simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import time
import pickle
import os
import matplotlib.pyplot as plt
import pandas as pd
import re

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score, accuracy_score

from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Input, Conv1D, Flatten
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import GlobalMaxPooling1D, Dropout, MaxPooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [2]:
def clean_text(text):
    """Function is used to preprocess user tweets.
    Removes @username and links, and whitespaces.
    Args:
        tweet (str): Raw text of the tweet.
    Returns:
        result (str): Processed tweet, after removing unnecessary data.
    """

    # removing @username
    result = re.sub(r'@[A-Za-z0-9]+', '', text)
    # removing link
    result = re.sub(r'https?://[A-Za-z0-9./]+', '', result)
    # removing leading and trailing whitespace
    result = result.strip()

    return result


def convert_label(polarity):
    """Simple function to preprocess polarity.
    """

    if polarity == 4:
        return 1
    elif polarity == 0:
        return 0
    else:
        print('[WARNING]')


def load_data():
    """Loads the data.
    Function loads in data, preprocesses the data
    (removes @username and links, and converts sentiment
    into neg: 0 and pos: 1)
    Returns:
        texts (pd.Series): Preprocessed texts.
        sentiment (pd.Series): Preprocessed sentiments.
    """

    train_dir = '..\\data\\text\\train.csv'
    columns = ['Polarity', 'ID', 'Date', 'Query', 'User', 'Texts']

    df = pd.read_csv(train_dir, encoding='latin-1', names=columns, header=None)
    df.drop(['ID', 'Date', 'Query', 'User'], axis=1, inplace=True)

    sentiment_raw = df['Polarity']
    sentiment = sentiment_raw.apply(lambda x: convert_label(x))

    texts_raw = df['Texts']
    texts = texts_raw.apply(lambda x: clean_text(x))

    return texts, sentiment

In [4]:
VOCAB_SIZE = 20000
MAX_SEQ_LEN = 40
EMB_DIM = 100
BASE_DIR = str(int(np.ceil(time.time())))

if not os.path.exists(BASE_DIR):
    os.makedirs(BASE_DIR)
    print(f'[INFO] {BASE_DIR} created')

[INFO] 1590446290 created


### Load Data

In [35]:
print('[INFO] Loading data...')
texts, sentiments = load_data()

train_texts, val_texts, train_sent, val_sent = train_test_split(texts, sentiments, test_size=0.2)

[INFO] Loading data...


In [37]:
tk = Tokenizer(num_words=VOCAB_SIZE)
tk.fit_on_texts(train_texts)

In [38]:
with open(f'{BASE_DIR}\\tokenizer.pickle', 'wb') as f:
    pickle.dump(tk, f)

word_index = tk.word_index
print('[INFO] Number of unique tokens found (in train data):', len(word_index))

[INFO] Number of unique tokens found (in train data): 283625


In [39]:
x_train = tk.texts_to_sequences(train_texts)
x_test = tk.texts_to_sequences(val_texts)

max_length = len(max(x_train, key=len))
if max_length > MAX_SEQ_LEN:
    max_length = MAX_SEQ_LEN

In [41]:
x_train = pad_sequences(x_train, maxlen=max_length)
x_test = pad_sequences(x_test, maxlen=max_length)
y_train = np.array(train_sent).reshape(-1, 1)
y_test = np.array(val_sent).reshape(-1, 1)

print(f'[INFO] Sequence Length: {max_length}')
print(f'[INFO] Shape of x_train: {x_train.shape}')
print(f'[INFO] Shape of y_train: {y_train.shape}')
print(f'[INFO] Shape of x_test: {x_test.shape}')
print(f'[INFO] Shape of y_test: {y_test.shape}')

[INFO] Sequence Length: 40
[INFO] Shape of x_train: (1280000, 40)
[INFO] Shape of y_train: (1280000, 1)
[INFO] Shape of x_test: (320000, 40)
[INFO] Shape of y_test: (320000, 1)


### Using Glove Word Embedding
Download glove embedding from : https://nlp.stanford.edu/projects/glove/

File Name: glove.6B.100d.txt

In [12]:
print('[INFO] Indexing word vectors...')
embeddings_index = {}
embedding_path = f'.\glove.6B.{EMB_DIM}d.txt'

with open(embedding_path, encoding='utf8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

print('[INFO] Total number of word vectors in Glove Embedding:', len(embeddings_index))

[INFO] Indexing word vectors...
[INFO] Total number of word vectors in Glove Embedding: 400000


In [16]:
print('[INFO] Preparing embedding matrix...')
num_words = min(VOCAB_SIZE, len(word_index) + 1)
embeddings_matrix = np.zeros((num_words, EMB_DIM))  # initializing zeros matrix

for word, i in word_index.items():
    if i >= VOCAB_SIZE:
        continue

    embedding_vector = embeddings_index.get(word)  # vector for that word
    if embedding_vector is not None:  # if word not found, then 0
        embeddings_matrix[i] = embedding_vector

[INFO] Preparing embedding matrix...


In [17]:
embedding_layer = Embedding(num_words, EMB_DIM,
                            weights=[embeddings_matrix],
                            input_length=max_length,
                            trainable=True, name = 'Embedding')

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


### Model Architecture

In [19]:
model = Sequential()

model.add(Input(shape=(max_length,), name = 'Input'))
model.add(embedding_layer)
model.add(Dropout(0.2, name = 'Drop_1'))
model.add(Conv1D(512, 3, activation='relu', padding='same', name = 'Conv_1'))
model.add(MaxPooling1D(2, name = 'Max_1'))
model.add(Dropout(0.3, name = 'Drop_2'))

model.add(Conv1D(256, 3, activation='relu', padding='same', name = 'Conv_2'))
model.add(Dropout(0.3, name = 'Drop_3'))

model.add(Conv1D(15, 3, activation='relu', padding='same', name = 'Conv_3'))
model.add(Flatten(name='Flatten_1'))
model.add(Dense(20, name = 'Dense_2'))

model.add(Dense(1, activation='sigmoid', name = 'Output'))

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

print(model.summary())

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Embedding (Embedding)        (None, 40, 100)           2000000   
_________________________________________________________________
Drop_1 (Dropout)             (None, 40, 100)           0         
_________________________________________________________________
Conv_1 (Conv1D)              (None, 40, 512)           154112    
_________________________________________________________________
Max_1 (MaxPooling1D)         (None, 20, 512)           0         
_________________________________________________________________
Drop_2 (Dropout)             (None, 20, 512)           0         
________________________________________

### Training

In [24]:
# checkpoint
t = int(time.time())
batch_size = 512

chk_path = os.path.join(BASE_DIR, 'best_{}_{}'.format('Twitter',t))
checkpoint = ModelCheckpoint(chk_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
#tensorboard = TensorBoard(log_dir="logs/{}_{}".format('IMDB_LSTM',t))
callbacks_list = [checkpoint]

history = model.fit(x_train, y_train, 
      batch_size=batch_size,
      epochs=100, 
      verbose=1,
      shuffle=True,
      validation_data=[x_test, y_test],
      callbacks=callbacks_list)

#Saving the model
model.save(os.path.join(BASE_DIR, 'final_{}_{}'.format('Twitter',t)))


Train on 1280000 samples, validate on 320000 samples
Epoch 1/100
Epoch 00001: val_acc improved from -inf to 0.82209, saving model to 1590446290\best_Twitter_1590447138
Epoch 2/100
Epoch 00002: val_acc improved from 0.82209 to 0.82761, saving model to 1590446290\best_Twitter_1590447138
Epoch 3/100
Epoch 00003: val_acc improved from 0.82761 to 0.83019, saving model to 1590446290\best_Twitter_1590447138
Epoch 4/100
Epoch 00004: val_acc improved from 0.83019 to 0.83173, saving model to 1590446290\best_Twitter_1590447138
Epoch 5/100
Epoch 00005: val_acc improved from 0.83173 to 0.83380, saving model to 1590446290\best_Twitter_1590447138
Epoch 6/100
Epoch 00006: val_acc improved from 0.83380 to 0.83430, saving model to 1590446290\best_Twitter_1590447138
Epoch 7/100
Epoch 00007: val_acc improved from 0.83430 to 0.83513, saving model to 1590446290\best_Twitter_1590447138
Epoch 8/100
Epoch 00008: val_acc did not improve from 0.83513
Epoch 9/100
Epoch 00009: val_acc improved from 0.83513 to 0.83

Epoch 26/100
Epoch 00026: val_acc did not improve from 0.83557
Epoch 27/100
Epoch 00027: val_acc did not improve from 0.83557
Epoch 28/100
Epoch 00028: val_acc did not improve from 0.83557
Epoch 29/100
Epoch 00029: val_acc did not improve from 0.83557
Epoch 30/100
Epoch 00030: val_acc did not improve from 0.83557
Epoch 31/100
Epoch 00031: val_acc did not improve from 0.83557
Epoch 32/100
Epoch 00032: val_acc did not improve from 0.83557
Epoch 33/100
Epoch 00033: val_acc did not improve from 0.83557
Epoch 34/100
Epoch 00034: val_acc did not improve from 0.83557
Epoch 35/100
Epoch 00035: val_acc did not improve from 0.83557
Epoch 36/100
Epoch 00036: val_acc did not improve from 0.83557
Epoch 37/100
Epoch 00037: val_acc did not improve from 0.83557
Epoch 38/100
Epoch 00038: val_acc did not improve from 0.83557
Epoch 39/100
Epoch 00039: val_acc did not improve from 0.83557
Epoch 40/100
Epoch 00040: val_acc did not improve from 0.83557
Epoch 41/100
Epoch 00041: val_acc did not improve from 

Epoch 00077: val_acc did not improve from 0.83557
Epoch 78/100
Epoch 00078: val_acc did not improve from 0.83557
Epoch 79/100
Epoch 00079: val_acc did not improve from 0.83557
Epoch 80/100
Epoch 00080: val_acc did not improve from 0.83557
Epoch 81/100
Epoch 00081: val_acc did not improve from 0.83557
Epoch 82/100
Epoch 00082: val_acc did not improve from 0.83557
Epoch 83/100
Epoch 00083: val_acc did not improve from 0.83557
Epoch 84/100
Epoch 00084: val_acc did not improve from 0.83557
Epoch 85/100
Epoch 00085: val_acc did not improve from 0.83557
Epoch 86/100
Epoch 00086: val_acc did not improve from 0.83557
Epoch 87/100
Epoch 00087: val_acc did not improve from 0.83557
Epoch 88/100
Epoch 00088: val_acc did not improve from 0.83557
Epoch 89/100
Epoch 00089: val_acc did not improve from 0.83557
Epoch 90/100
Epoch 00090: val_acc did not improve from 0.83557
Epoch 91/100
Epoch 00091: val_acc did not improve from 0.83557
Epoch 92/100
Epoch 00092: val_acc did not improve from 0.83557
Epoch

NameError: name 'model_dir' is not defined

### Evaluation

In [65]:
model = load_model(f'{BASE_DIR}//best_Twitter_1590447138')

In [66]:
# Evaluate

y_pred = np.rint(model.predict(x_test))
y_true = y_test
cf_matrix = confusion_matrix(y_true, y_pred)
print(cf_matrix)
class_wise_f1 = f1_score(y_true, y_pred, average=None)
print('the mean-f1 score: {:.4f}'.format(np.mean(class_wise_f1)))
accuracy = accuracy_score(y_true, y_pred)
print('accuracy is: {:.4f}'.format(accuracy))

np.save(f'{BASE_DIR}//y_test_pred', y_pred)

[[134633  25321]
 [ 27296 132750]]
the mean-f1 score: 0.8356
accuracy is: 0.8356
