In [1]:
import re
import pandas as pd
import matplotlib.pyplot as plt
import nltk
import string
import tensorflow
import tensorflow as tf

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score, classification_report
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras import Model
from keras.models import Sequential
from keras import models, layers, optimizers, regularizers
from keras.layers import Dense, Embedding, MaxPool1D, Conv1D, GlobalMaxPooling1D, Dropout, LSTM, GRU, Bidirectional, Dropout, MaxPooling1D, Input, Lambda, GlobalMaxPool1D
from keras.callbacks import ModelCheckpoint
from collections import Counter
from keras.optimizers import Adam

In [2]:
tensorflow.random.set_seed(42)

In [3]:
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
LEMMATIZER = WordNetLemmatizer()
PORTER_STEMMER = PorterStemmer()
ENGLISH_STOP_WORDS = set(stopwords.words('english'))

In [5]:
from google.colab import drive

In [6]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
DF_ORIG = pd.read_csv("/content/drive/MyDrive/UrfuMagistracy/natural-language-processing-III/datasets/train.csv", index_col=0)

In [8]:
DF_ORIG.head()

Unnamed: 0,Text,Sentiment
0,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Neutral
1,advice Talk to your neighbours family to excha...,Positive
2,Coronavirus Australia: Woolworths to give elde...,Positive
3,My food stock is not the only one which is emp...,Positive
4,"Me, ready to go at supermarket during the #COV...",Extremely Negative


In [9]:
DF_ORIG.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41159 entries, 0 to 41156
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       41158 non-null  object
 1   Sentiment  41155 non-null  object
dtypes: object(2)
memory usage: 964.7+ KB


In [10]:
DF_ORIG.describe()

Unnamed: 0,Text,Sentiment
count,41158,41155
unique,41158,5
top,@MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...,Positive
freq,1,11422


### Preprocessing

In [11]:
def text_preprocessing(text, stop_words=None, stem=None, lemmatizer=None):
    if isinstance(text, str):
        text = re.sub(r"http\S+", "", text)

    text = text.translate(str.maketrans("", "", string.punctuation))
    text = "".join([i for i in text if not i.isdigit()])
    text = text.lower()

    if stop_words is not None:
        words = nltk.word_tokenize(text)
        filtered_words = [word for word in words if word not in stop_words]
        text = " ".join(filtered_words)

    if stem is not None:
        words = nltk.word_tokenize(text)
        stemmed_words = [stem.stem(word) for word in words]
        text = " ".join(stemmed_words)

    if lemmatizer is not None:
        words = nltk.word_tokenize(text)
        lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
        text = " ".join(lemmatized_words)

    return text

In [12]:
DF_ORIG.isna().sum().sum()

5

In [13]:
DF_ORIG.dropna(inplace=True)

In [14]:
DF_ORIG.isna().sum().sum()

0

In [15]:
DF_ORIG["Text"] = DF_ORIG["Text"].apply(text_preprocessing, args=(ENGLISH_STOP_WORDS, PORTER_STEMMER, LEMMATIZER,))

In [16]:
DF_ORIG = DF_ORIG[DF_ORIG["Text"] != ""]

In [17]:
DF_ORIG.info()

<class 'pandas.core.frame.DataFrame'>
Index: 41139 entries, 0 to 41156
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       41139 non-null  object
 1   Sentiment  41139 non-null  object
dtypes: object(2)
memory usage: 964.2+ KB


In [18]:
DF_ORIG.head()

Unnamed: 0,Text,Sentiment
0,menyrbi philgahan chrisitv,Neutral
1,advic talk neighbour famili exchang phone numb...,Positive
2,coronaviru australia woolworth give elderli di...,Positive
3,food stock one empti plea dont panic enough fo...,Positive
4,readi go supermarket covid outbreak im paranoi...,Extremely Negative


### Tokenization

In [19]:
df_train = DF_ORIG.copy()

In [21]:
df_train.shape

(41139, 2)

In [22]:
counter = Counter()

In [23]:
VOC_LEN = len(df_train.Text.apply(counter.update))
VOC_LEN

41139

In [24]:
tokenizer = Tokenizer(num_words=VOC_LEN)

In [25]:
tokenizer.fit_on_texts(df_train.Text)

In [26]:
X_vect = tokenizer.texts_to_sequences(df_train.Text)

In [27]:
maxlen = len(max(X_vect, key=len))

In [28]:
X_vect_pads = pad_sequences(X_vect, maxlen=maxlen)

In [29]:
X_vect_pads

array([[    0,     0,     0, ..., 17378, 17379,  9380],
       [    0,     0,     0, ...,   972,  2192,    84],
       [    0,     0,     0, ...,   103,     1,    75],
       ...,
       [    0,     0,     0, ...,  1227,    25,    19],
       [    0,     0,     0, ...,     2,     1,     2],
       [    0,     0,     0, ...,   280,   709,     1]], dtype=int32)

In [30]:
label_encoder = LabelEncoder()
y_vect = label_encoder.fit_transform(df_train.Sentiment)
y_vect = to_categorical(y_vect)
y_vect

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.]], dtype=float32)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X_vect_pads, y_vect, test_size=0.25, random_state=42)

In [32]:
LOGITS_QTY = len(df_train["Sentiment"].unique())

In [33]:
LOGITS_QTY

5

In [34]:
model = Sequential([
    Embedding(input_dim=VOC_LEN, output_dim=64, input_length=maxlen),
    LSTM(64, return_sequences=True),
    Dropout(0.5),
    LSTM(32),
    Dense(LOGITS_QTY, activation='softmax')
])

In [35]:
model.compile(optimizer=Adam(learning_rate=0.002), loss="categorical_crossentropy", metrics=["accuracy"])

In [None]:
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/5