## Importing libraries

In [1]:
!pip install sentence-transformers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
from sentence_transformers import SentenceTransformer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[K     |████████████████████████████████| 85 kB 4.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 49.1 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.97-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[K     |████████████████████████████████| 1.3 MB 50.6 MB/s 
[?25hCollecting huggingface-hub>=0.4.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 71.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 53.0 MB/s 
Building wheels for collected p

### Reading the data

In [2]:
# Mount Google Drive into the Colab filesystem; the training CSV is read from it below.
from google.colab import drive
drive.mount('/content/drive')

# NOTE(review): in the visible cells `os` is only needed by the commented-out listing below.
import os

# Uncomment to inspect what is available at the top level of the mounted Drive.
#os.listdir("drive/MyDrive")

Mounted at /content/drive


In [3]:
# Load the labelled tweets from the mounted Drive and show (rows, columns).
TRAIN_CSV_PATH = "./drive/MyDrive/train.csv"
df_tweet = pd.read_csv(TRAIN_CSV_PATH)
df_tweet.shape

(29992, 2)

In [4]:
# Integer id assigned to each emotion label.
mapper = {
    "Anxious": 0,
    "Normal": 1,
    "Stressed": 2,
    "Lonely": 3
}

# Encode only while the column still holds the raw string labels, so re-running
# this cell does not push the integer ids through `mapper` a second time (which
# would turn every label into NaN).
if not pd.api.types.is_numeric_dtype(df_tweet['labels']):
    encoded_labels = df_tweet['labels'].map(mapper)
    # Series.map yields NaN for any value missing from `mapper`; fail here with the
    # offending values instead of letting NaN labels reach training.
    unmapped = df_tweet.loc[encoded_labels.isna(), 'labels'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Labels missing from mapper: {unmapped.tolist()}")
    df_tweet['labels'] = encoded_labels.astype("int64")

### In this approach we will

- Clean the text using regex only
- NLTK `stemming` and `lemmatization` are not applied, so that the sequential, contextual information in each tweet is retained
- Use `sentence transformers` to turn each tweet into a `sentence embedding`, a vector of length 768 (bert-base-uncased)
- Train classical machine learning models
- Train an MLP feed-forward network

### Step 1: cleaning of text

In [5]:

#### URLs, hashtags and mentions are removed
def regex_clean(text):
    """Lower-case a tweet and strip URLs, hashtags and @mentions from it.

    Args:
        text: Raw tweet text (str).

    Returns:
        The lower-cased text with every URL, ``#hashtag`` and ``@mention``
        replaced by the empty string. Whitespace around the removed tokens is
        left untouched.
    """
    text = text.lower()
    # Raw strings keep the regex escapes away from Python's own string-escape
    # handling. The classes spell out `a-z` (the text is already lower-cased):
    # the range `A-z` also covers the ASCII punctuation between 'Z' and 'a'
    # ([ \ ] ^ and the backtick), which made a hashtag or mention swallow a
    # bracket or caret that directly followed it.
    url = r"https?://(?:[a-z0-9_.\-%?=&]+/)*[a-z0-9_.\-%?=&]+"
    hashtags = r"#[a-z0-9_%.\-]+"
    mentions = r"@[a-z0-9_%.\-]+"

    text = re.sub(url, "", text)
    text = re.sub(hashtags, "", text)
    text = re.sub(mentions, "", text)

    return text

In [6]:
# Replace every tweet with its cleaned, lower-cased form.
df_tweet["tweets"] = df_tweet["tweets"].map(regex_clean)

### Step 2: Generate sentence embeddings and training batches for the neural network

In [7]:
# Name of the pretrained checkpoint wrapped as the sentence encoder.
ENCODER_CHECKPOINT = "bert-base-uncased"
st = SentenceTransformer(ENCODER_CHECKPOINT)

Downloading:   0%|          | 0.00/437 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Some weights of the model checkpoint at /root/.cache/torch/sentence_transformers/bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
### SentenceEmbedding Batches
# NOTE(review): GridSearchCV is not referenced in the visible cells.
from sklearn.model_selection import train_test_split,GridSearchCV


#### Splitting the data
# 75% of the rows go to df_train and the rest to df_test; random_state pins the shuffle.
df_train, df_test = train_test_split(df_tweet,random_state=1,train_size=0.75)


#### Number of batches needed to cover `total` items
def num_batches(total,batch_size):
    """Return how many batches of `batch_size` it takes to cover `total` items.

    A trailing partial batch counts as one extra batch.
    """
    full_batches, leftover = divmod(total, batch_size)
    return full_batches + 1 if leftover else full_batches
    
#### Batch Generator
def generate_batch_feedforward(df, text_col, label_col, n_classes, batch_size=300, encoder=st):
    """Endlessly yield (embeddings, one-hot labels) batches.

    The rows of `df` are walked in positional order, `batch_size` at a time (the
    last batch of a pass holds whatever remains); after the last batch the pass
    starts again from the first row, so the generator never finishes on its own.

    Args:
        df: DataFrame holding the text and label columns. It is not modified.
        text_col: Name of the column with the texts to embed.
        label_col: Name of the column with integer class ids, used as the
            column position of the 1 in each one-hot row.
        n_classes: Width of the one-hot label vectors.
        batch_size: Maximum number of rows per batch.
        encoder: Object whose `encode(list_of_str)` returns one embedding row
            per input text.

    Yields:
        Tuple `(X_batch, y_batch)`: the embeddings returned by `encoder.encode`
        for the batch, and a float array of shape (rows_in_batch, n_classes).

    Raises:
        ValueError: If `df` has no rows (raised when the first batch is requested).
    """
    total = df.shape[0]
    if total == 0:
        # Without this guard the loop below would spin forever without yielding.
        raise ValueError("generate_batch_feedforward needs a non-empty DataFrame")

    # Pull the two columns out once. Slicing these by position gives the same
    # rows as .iloc, so the caller's index never has to be rewritten.
    texts = df[text_col].tolist()
    labels = df[label_col].to_numpy()

    while True:
        for start in range(0, total, batch_size):
            stop = start + batch_size  # slicing clips this to `total` on the last batch
            X_batch = np.asarray(encoder.encode(texts[start:stop]))
            batch_labels = labels[start:stop]
            y_batch = np.zeros((len(batch_labels), n_classes))
            # One-hot: put a 1 in the column given by each row's class id.
            y_batch[np.arange(len(batch_labels)), batch_labels] = 1
            yield X_batch, y_batch


### Step 3: Training MLP classifier

In [9]:
# Training settings used throughout this cell.
N_CLASSES = 4
BATCH_SIZE = 300
N_EPOCHS = 10

train_gen  = generate_batch_feedforward(df_train, "tweets", "labels", N_CLASSES, batch_size=BATCH_SIZE, encoder=st)
val_gen = generate_batch_feedforward(df_test, "tweets", "labels", N_CLASSES, batch_size=BATCH_SIZE, encoder=st)

#### Building the model
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout
# A single softmax layer on top of the sentence embeddings; the hidden layer
# and dropout below are left disabled.
model = Sequential()
#model.add(Dense(1000,activation="relu"))
#model.add(Dropout(0.2))
model.add(Dense(N_CLASSES,activation="softmax"))


### compile the model
model.compile(optimizer="adam",loss="categorical_crossentropy", metrics=["categorical_accuracy"])

#### fit the model
# The generators never stop on their own, so Keras has to be told how many
# batches make up one pass over each split.
train_steps = num_batches(df_train.shape[0],BATCH_SIZE)
val_steps = num_batches(df_test.shape[0],BATCH_SIZE)

# Model.fit accepts Python generators directly; fit_generator is deprecated.
history = model.fit(train_gen,steps_per_epoch=train_steps, validation_data=val_gen,validation_steps=val_steps,epochs=N_EPOCHS)



Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Show the layers, output shapes and parameter counts of the model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 4)                 3076      
                                                                 
Total params: 3,076
Trainable params: 3,076
Non-trainable params: 0
_________________________________________________________________


In [11]:
# Embed the held-out tweets once, in the row order of df_test, so predictions can
# be compared with df_test["labels"] position by position.
test_batch_size = 300
n_test_rows = df_test.shape[0]
test_data = np.zeros((n_test_rows, 768))
val_gen = generate_batch_feedforward(df_test, "tweets", "labels", 4, batch_size=test_batch_size, encoder=st)

# The generator cycles forever and never raises StopIteration, so take exactly
# one pass worth of batches rather than relying on an exception to end the loop
# (which also encoded one extra, discarded batch and hid any genuine error).
for i in range(num_batches(n_test_rows, test_batch_size)):
    x = next(val_gen)[0]
    test_data[i*test_batch_size:i*test_batch_size+len(x),:] = x

test_data

array([[-0.07562211, -0.47467816,  0.5884068 , ..., -0.40620095,
        -0.11727925,  0.12965332],
       [ 0.11971789,  0.04601855,  0.58112258, ..., -0.36822259,
         0.01272304, -0.10664078],
       [ 0.14101349, -0.06181362, -0.29519066, ..., -0.00979735,
        -0.3213127 , -0.09142604],
       ...,
       [ 0.21935265,  0.11478394,  0.91006374, ..., -0.33271286,
         0.07568053, -0.03282821],
       [ 0.10857902, -0.29791287,  0.39974594, ...,  0.10281919,
        -0.09486614, -0.53830236],
       [ 0.40743008,  0.4512569 ,  0.21083188, ..., -0.32115874,
         0.34430206,  0.00104175]])

In [12]:
from sklearn.metrics import confusion_matrix, classification_report

# Pick the most probable class for each held-out tweet and compare it with the
# true label (rows of the confusion matrix are true classes, columns predicted).
y_true = df_test["labels"]
class_probabilities = model.predict(test_data)
predictions = class_probabilities.argmax(axis=1)
cf = confusion_matrix(y_true, predictions)
print(cf)
print(classification_report(y_true, predictions))

[[1174  240  177  508]
 [ 166 1545  184   73]
 [ 172  239 1294   25]
 [ 958  181  114  448]]
              precision    recall  f1-score   support

           0       0.48      0.56      0.51      2099
           1       0.70      0.79      0.74      1968
           2       0.73      0.75      0.74      1730
           3       0.43      0.26      0.33      1701

    accuracy                           0.59      7498
   macro avg       0.58      0.59      0.58      7498
weighted avg       0.58      0.59      0.58      7498

