In [1]:
from collections import Counter

from sklearn.model_selection import train_test_split


import numpy as np
import pandas as pd
import re
import nltk 
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

2022-09-16 11:16:52.692632: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-09-16 11:16:52.793426: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-09-16 11:16:53.135154: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2022-09-16 11:16:53.135197: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or 

In [2]:
# Read the data
path = '/media/popo/Elements/Datasets/nlp-getting-started/'
columns = ['text', 'target']
df_train = pd.read_csv(f'{path}train.csv', usecols=columns)
df_test = pd.read_csv(f'{path}test.csv', usecols=['text'])

In [3]:
df_train['target'].unique()

array([1, 0])

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    3263 non-null   object
dtypes: object(1)
memory usage: 25.6+ KB


In [6]:
print('Train shape:', df_train.shape)
print('Test shape:', df_test.shape)

Train shape: (7613, 2)
Test shape: (3263, 1)


In [7]:
df_train.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
#train val split
train, val = train_test_split(df_train, test_size=0.2, random_state=42)

In [9]:
#preprocessing
def preprocess_text(df):
    #remove special characters
    df['text'] = df['text'].apply(lambda x: re.sub('[^a-zA-Z]', ' ', x))
    #convert to lowercase
    df['text'] = df['text'].apply(lambda x: x.lower())
    #remove stopwords
    stop_words = stopwords.words('english')
    df['text'] = df['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in stop_words]))
    #lemmatization
    lemmatizer = WordNetLemmatizer()
    df['text'] = df['text'].apply(lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()]))
    return df

In [10]:
#preprocess the data
train = preprocess_text(train)
val = preprocess_text(val)
test = preprocess_text(df_test)

In [11]:
X_train, y_train = train['text'], train['target']
X_val, y_val = val['text'], val['target']
X_test = test['text']

In [12]:
#check how many unique words we have
vocab_size = len(set(' '.join(X_train).split()))
print('Vocab size:', vocab_size)

Vocab size: 17727


In [13]:
vocab_size = 20000
encoded_docs_train = [one_hot(d, vocab_size) for d in X_train]
encoded_docs_val = [one_hot(d, vocab_size) for d in X_val]
encoded_docs_test = [one_hot(d, vocab_size) for d in X_test]

In [14]:
#pad the sequences
max_length = 100
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=max_length, padding='post')
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
padded_docs_val = pad_sequences(encoded_docs_val, maxlen=max_length, padding='post')

In [15]:
#max length
max_length = 100
#embedding size
embedding_size = 32
#number of classes
num_classes = 5
#model
model = Sequential()
model.add(Embedding(vocab_size, embedding_size, input_length=max_length))
#bidirectional LSTM
model.add(Bidirectional(LSTM(32)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()

2022-09-16 11:16:55.295522: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-16 11:16:55.313812: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-16 11:16:55.313952: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-09-16 11:16:55.314428: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 32)           640000    
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 5)                 325       
                                                                 
Total params: 656,965
Trainable params: 656,965
Non-trainable params: 0
_________________________________________________________________


In [16]:
#train the model
model.fit(padded_docs_train, y_train, epochs=10, verbose=1)

Epoch 1/10


2022-09-16 11:16:57.966587: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8500
2022-09-16 11:16:58.052814: I tensorflow/stream_executor/cuda/cuda_blas.cc:1614] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7faeb08c7ac0>

In [17]:
#evaluate the model
loss, accuracy = model.evaluate(padded_docs_val, y_val, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 72.357190


In [18]:
#make predictions
predictions = model.predict(padded_docs_test)
predictions = np.argmax(predictions, axis=1)



In [19]:
print(predictions)

[1 1 1 ... 1 1 0]
