In [1]:
!nvidia-smi

Mon May 16 03:21:41 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   41C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install --upgrade -q matplotlib
!pip install -q transformers

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split

#transformers
from transformers import BertTokenizerFast
from transformers import TFBertModel

import tensorflow as tf

#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

In [4]:
seed=42
sns.set_style("whitegrid")
plt.style.use("seaborn-whitegrid")
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

In [5]:
df = pd.read_csv('/content/drive/MyDrive/NLP/Setiment_Analysis/en/dataset/covid-19/clean_train.csv', header=0)
df.head()

Unnamed: 0,text_clean,Sentiment
0,it s a confusing odd time for the shopping pub...,Negative
1,in 2019 d2c ecommerce sales reached 1428 billi...,Positive
2,chinese residents are paying exorbitant prices...,Negative
3,list of supermarkets grocery shops and vegetab...,Neutral
4,its there any wonder tesco and other supermark...,Positive


In [6]:
df_test = pd.read_csv('/content/drive/MyDrive/NLP/Setiment_Analysis/en/dataset/covid-19/clean_test.csv', header=0)
df_test.head()

Unnamed: 0,text_clean,Sentiment
0,we may not have any toilet paper in our house ...,Positive
1,really whats the downside of coronavirus for a...,Positive
2,hello everyone we made amp sell high quality m...,Extremely Positive
3,happy to report that i jumped on the panic sho...,Positive
4,just been to the supermarket why do all women ...,Neutral


# Sentiment column ananlysis

In [7]:
df['Sentiment'].value_counts()

Positive              11381
Negative               9889
Neutral                7560
Extremely Positive     6618
Extremely Negative     5475
Name: Sentiment, dtype: int64

In [8]:
df['Sentiment'] = df['Sentiment'].map({'Extremely Negative': 0, 'Negative': 0, 'Neutral': 1,
                                       'Positive': 2, 'Extremely Positive': 2})

In [9]:
df_test['Sentiment'] = df_test['Sentiment'].map({'Extremely Negative': 0, 'Negative': 0, 'Neutral': 1,
                                                 'Positive': 2, 'Extremely Positive': 2})

In [10]:
df['Sentiment'].value_counts()

2    17999
0    15364
1     7560
Name: Sentiment, dtype: int64

## Class Balanceing by RandomOverSampler

In [11]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['text_clean']).reshape(-1, 1), np.array(df['Sentiment']).reshape(-1, 1))
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns=['text_clean', 'Sentiment'])

In [12]:
train_os['Sentiment'].value_counts()

0    17999
2    17999
1    17999
Name: Sentiment, dtype: int64

## Train - Validation - Test split

In [13]:
X = train_os['text_clean'].values
y = train_os['Sentiment'].values

In [14]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, stratify=y, random_state=seed)

In [15]:
X_test = df_test['text_clean'].values
y_test = df_test['Sentiment'].values

## One hot encoding

In [16]:
y_train_le = y_train.copy()
y_val_le = y_val.copy()
y_test_le = y_test.copy()

In [17]:
ohe = preprocessing.OneHotEncoder()
y_train = ohe.fit_transform(np.array(y_train).reshape(-1, 1)).toarray()
y_val = ohe.fit_transform(np.array(y_val).reshape(-1, 1)).toarray()
y_test = ohe.fit_transform(np.array(y_test).reshape(-1, 1)).toarray()

In [18]:
print(f"TRAINING DATA: {X_train.shape[0]}\nVALIDATION DATA: {X_val.shape[0]}\nTESTING DATA: {X_test.shape[0]}" )

TRAINING DATA: 48597
VALIDATION DATA: 5400
TESTING DATA: 3787


# BERT Sentiment analysis

In [19]:
MAX_LEN=128

In [20]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [21]:
def tokenize(tokenizer, data, max_len=MAX_LEN):
    input_ids = []
    attention_masks = []
    for i in range(len(data)):
        encoded = tokenizer.encode_plus(
            data[i],
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',
            return_attention_mask=True
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    
    input_ids = np.array(input_ids)
    attention_masks = np.array(attention_masks)
    return input_ids, attention_masks

In [22]:
train_input_ids, train_attention_masks = tokenize(tokenizer, X_train, MAX_LEN)
val_input_ids, val_attention_masks = tokenize(tokenizer, X_val, MAX_LEN)
test_input_ids, test_attention_masks = tokenize(tokenizer, X_test, MAX_LEN)

# BERT Modeling

In [23]:
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [24]:
def create_model(bert_model, max_len=MAX_LEN):
    #params
    opt = tf.keras.optimizers.Adam(learning_rate=1e-5, decay=1e-7)
    loss = tf.keras.losses.CategoricalCrossentropy()
    accuracy = tf.keras.metrics.CategoricalAccuracy()

    input_ids = tf.keras.Input(shape=(max_len, ), dtype='int32')
    attention_masks = tf.keras.Input(shape=(max_len, ), dtype='int32')
    embeddings = bert_model([input_ids, attention_masks])[1]
    output = tf.keras.layers.Dense(3, activation="softmax")(embeddings)
    model = tf.keras.models.Model(inputs=[input_ids, attention_masks], outputs=output)
    model.compile(opt, loss=loss, metrics=accuracy)

    return model

In [25]:
model = create_model(bert_model, MAX_LEN)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model (TFBertModel)    TFBaseModelOutputWi  109482240   ['input_1[0][0]',                
                                thPoolingAndCrossAt               'input_2[0][0]']                
                                tentions(last_hidde                                               
                                n_state=(None, 128,                                           

In [26]:
history_bert = model.fit([train_input_ids, train_attention_masks], y_train,
                         validation_data=([val_input_ids, val_attention_masks], y_val),
                         epochs=5, batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# BERT result

In [27]:
result_bert = model.predict([test_input_ids,test_attention_masks])

In [28]:
y_pred_bert =  np.zeros_like(result_bert)
y_pred_bert[np.arange(len(y_pred_bert)), result_bert.argmax(1)] = 1

In [29]:
print('\tClassification Report for BERT:\n\n',classification_report(y_test,y_pred_bert, target_names=['Negative', 'Neutral', 'Positive']))

	Classification Report for BERT:

               precision    recall  f1-score   support

    Negative       0.91      0.89      0.90      1629
     Neutral       0.77      0.82      0.79       614
    Positive       0.91      0.90      0.90      1544

   micro avg       0.88      0.88      0.88      3787
   macro avg       0.86      0.87      0.86      3787
weighted avg       0.88      0.88      0.88      3787
 samples avg       0.88      0.88      0.88      3787

