In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

### Reading the tweet Dataset

In [2]:
# Load the labelled tweets from the Kaggle input directory, then show (rows, columns).
df_tweet = pd.read_csv(
    "../input/tweet-mental-health-classification/train.csv"
)
df_tweet.shape

### Let us look at the class distribution

In [3]:
# Number of tweets per class label, most frequent first.
df_tweet.loc[:, "labels"].value_counts()

#### Anxious and Lonely were observed to be highly correlated, so we'll combine them into a single class

In [4]:
# Collapse the two overlapping classes into one label.
merged_classes = ["Anxious", "Lonely"]
is_merged = df_tweet["labels"].isin(merged_classes)
df_tweet.loc[is_merged, "labels"] = "Anx_Lonely"

# Rows that are identical in every column are removed (the first occurrence is kept).
df_tweet = df_tweet.drop_duplicates()
df_tweet["labels"].value_counts()

### Convert all the text to lower case and convert the labels into numeric values

In [5]:
# Lower-case the tweet text with the vectorised string accessor instead of a
# Python-level lambda; missing values stay NaN rather than raising AttributeError.
df_tweet['tweets'] = df_tweet['tweets'].str.lower()

# Integer class ids. Downstream code uses them as one-hot column indices and as
# sparse targets.
mapper = {
    "Normal": 0,
    "Anx_Lonely": 1,
    "Stressed": 2
}

# Series.map turns every value that is not a key of `mapper` into NaN. That also
# happens when this cell is run a second time (the labels are already integers),
# and the NaNs would only surface later as an indexing error. Fail here instead.
unknown_labels = [lbl for lbl in df_tweet["labels"].unique() if lbl not in mapper]
if unknown_labels:
    raise ValueError(f"Labels without a numeric id in `mapper`: {unknown_labels}")

df_tweet['labels'] = df_tweet["labels"].map(mapper)
df_tweet.shape

### Preparing training and testing Data

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, TFBertModel
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense, Dropout

# 75 % of the rows go to training, the rest to testing; the fixed seed makes the
# split repeatable.
df_train, df_test = train_test_split(
    df_tweet,
    train_size=0.75,
    random_state=2,
)

### Preparing Batch Data 

In [7]:
def num_batches(total, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``total`` items.

    A trailing partial batch counts as one additional batch.
    """
    full_batches, leftover = divmod(total, batch_size)
    return full_batches + 1 if leftover else full_batches



def generate_batch(df,text_col,label_col,batch_size=150,num_classes=None):
    """Endlessly yield tokenised mini-batches with one-hot targets for Keras training.

    Rows are visited in their existing order on every pass (no shuffling); after the
    last batch the generator starts again from the first row.

    Parameters
    ----------
    df : pandas.DataFrame
        Source rows. Must not be empty.
    text_col : str
        Name of the column holding the raw text to tokenise.
    label_col : str
        Name of the column holding integer class ids (used as one-hot column indices).
    batch_size : int, default 150
        Rows per batch; the final batch of each pass is smaller when the row count is
        not a multiple of ``batch_size``.
    num_classes : int, optional
        Width of the one-hot targets. Defaults to ``max(label) + 1``, which equals the
        number of distinct labels whenever the ids are contiguous from 0.

    Yields
    ------
    tuple
        ``((input_ids, token_type_ids, attention_mask), y_data)`` where the three
        inputs are arrays of shape ``(batch, 25)`` and ``y_data`` is a
        ``(batch, num_classes)`` one-hot float array.

    Raises
    ------
    ValueError
        If ``df`` has no rows or ``batch_size`` is not positive. Either case would
        otherwise loop forever without yielding anything.
    """
    total = df.shape[0]
    if total == 0:
        raise ValueError("generate_batch needs at least one row; got an empty DataFrame")
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive; got {batch_size}")

    # NOTE(review): padding_side='left' puts the [PAD] tokens in front of the text. The
    # Hugging Face BERT docs advise right padding. If this is changed, change every
    # tokenizer in the notebook together so training and inference stay consistent.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",
                                              padding_side='left',truncation_side='right')
    texts = df[text_col].tolist()
    labels = np.asarray(df[label_col])
    if num_classes is None:
        # Counting distinct labels is only correct when every id 0..k-1 is present.
        # max + 1 gives the same width in that case and still works when an id is missing.
        num_classes = int(labels.max()) + 1

    while True:
        for start in range(0,total,batch_size):
            # Slicing clamps at the end of the data, so the final partial batch needs
            # no separate branch.
            stop = start + batch_size
            batch_texts = texts[start:stop]
            batch_labels = labels[start:stop]
            tokens = tokenizer(batch_texts,max_length=25,padding='max_length',
                               truncation=True,return_tensors='tf')

            # One-hot encode the whole batch at once.
            y_data = np.zeros((len(batch_labels),num_classes))
            y_data[np.arange(len(batch_labels)),batch_labels] = 1

            t1 = tokens['input_ids'].numpy()
            t2 = tokens['token_type_ids'].numpy()
            t3 = tokens['attention_mask'].numpy()
            # The tuple order must match the order of the Keras model's input list.
            yield (t1,t2,t3), y_data

### Let us look at the tokens

### Building a TensorFlow BERT model
- The model will be fine-tuned

In [8]:
from tensorflow.keras.layers import Input
from tensorflow.keras.optimizers import Adam

# Pre-trained BERT encoder. It is part of the trainable graph below, so its weights
# are updated during training (fine-tuning).
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# One 25-token sequence per BERT input; 25 matches the max_length used by the tokenizers.
input_ids = Input(shape=(25,),dtype=np.int64)
token_type_ids =  Input(shape=(25,),dtype=np.int64)
attention_mask = Input(shape=(25,),dtype=np.int64)

# Pass the tensors by name. When given a plain list, TFBertModel reads it in the order
# [input_ids, attention_mask, token_type_ids], so the previous positional list
# [input_ids, token_type_ids, attention_mask] fed the segment ids in as the attention
# mask and vice versa.
bert_out = bert_model({
    "input_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": attention_mask,
})

# Classify from the hidden state at sequence position 0.
# NOTE(review): the tokenizers in this notebook are created with padding_side='left',
# so for tweets shorter than 25 tokens position 0 holds a [PAD] token rather than
# [CLS]. Confirm this is intended.
dense = Dense(3,activation='softmax')
out = dense(bert_out.last_hidden_state[:,0,:])

# The input order here must match the (input_ids, token_type_ids, attention_mask)
# tuples produced by the batch generator and by get_test_data.
model = Model([input_ids,token_type_ids,attention_mask],out)

model.compile(loss='categorical_crossentropy',optimizer=Adam(2e-6),metrics=['accuracy'])
model.summary()


### Fitting the model

In [9]:
# The generators and the step counts must use the same batch size (150), so that one
# epoch corresponds to exactly one pass over each split.
train_gen  = generate_batch(df_train,'tweets','labels',150)
val_gen = generate_batch(df_test,'tweets','labels',150)
steps_per_epoch = num_batches(df_train.shape[0],150)
val_steps = num_batches(df_test.shape[0],150)

# Model.fit accepts Python generators directly. Model.fit_generator is deprecated and
# is not available in newer Keras releases.
history = model.fit(train_gen,steps_per_epoch=steps_per_epoch,
                    validation_data=val_gen,validation_steps=val_steps,epochs=15)

In [10]:
h = history.history
# Take the epoch count from the recorded history rather than a hard-coded number, so
# the x-axis stays correct if the `epochs` argument of fit() changes or training
# stops early.
epochs = len(h['loss'])
epoch_axis = range(1,epochs+1)

plt.figure(figsize=(10,8))

# Left panel: loss. The legend tells the training curve from the validation curve.
plt.subplot(1,2,1)
plt.plot(epoch_axis,h['loss'],label="train")
plt.plot(epoch_axis,h['val_loss'],label="validation")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

# Right panel: accuracy.
plt.subplot(1,2,2)
plt.plot(epoch_axis,h['accuracy'],label="train")
plt.plot(epoch_axis,h['val_accuracy'],label="validation")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.tight_layout()

### Save the fine-tuned BERT weights
`model.layers[-3]` is the BERT model layer

In [11]:
import os
import shutil
# Start from an empty output directory. On a fresh kernel "model" does not exist yet,
# and an unconditional rmtree would raise FileNotFoundError.
if os.path.isdir("model"):
    shutil.rmtree("model")
os.makedirs("model", exist_ok=True)

# layers[-3] is the BERT layer of the classifier; the two layers after it are the
# position-0 slice and the Dense head.
b_layer = model.layers[-3]
b_layer.save_pretrained("model/bert-tuned")

### Evaluating the model on the test set

In [12]:

def get_test_data(df,text_col,label_col,batch_size=150,num_classes=None):
    """Tokenise a whole DataFrame and return model inputs plus one-hot targets.

    Unlike the training generator, this makes a single pass and returns everything as
    in-memory arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to encode, kept in their existing order.
    text_col : str
        Name of the column holding the raw text to tokenise.
    label_col : str
        Name of the column holding integer class ids (used as one-hot column indices).
    batch_size : int, default 150
        Number of texts handed to the tokenizer per call.
    num_classes : int, optional
        Width of the one-hot targets. Defaults to ``max(label) + 1`` (0 for an empty
        frame), which equals the number of distinct labels whenever the ids are
        contiguous from 0.

    Returns
    -------
    tuple
        ``((input_ids, token_type_ids, attention_mask), y_data)``: three int64 arrays
        of shape ``(len(df), 25)`` and a ``(len(df), num_classes)`` one-hot float array.
    """
    # NOTE(review): padding_side='left' puts the [PAD] tokens in front of the text. Keep
    # this identical to the tokenizer used for training if it is ever changed.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",
                                              padding_side='left',truncation_side='right')
    texts = df[text_col].tolist()
    labels = np.asarray(df[label_col])
    total = df.shape[0]
    if num_classes is None:
        # Counting distinct labels is only correct when every id 0..k-1 occurs in `df`.
        # max + 1 gives the same width in that case and still works when an id is missing.
        num_classes = int(labels.max()) + 1 if total else 0

    # Integer buffers: these hold token ids and masks, and the model inputs are int64.
    t1 = np.zeros((total,25),dtype=np.int64)
    t2 = np.zeros((total,25),dtype=np.int64)
    t3 = np.zeros((total,25),dtype=np.int64)

    for start in range(0,total,batch_size):
        # min() clamps the last window, so a partial final batch needs no separate branch.
        stop = min(start + batch_size,total)
        tokens = tokenizer(texts[start:stop],max_length=25,padding='max_length',
                           truncation=True,return_tensors='tf')
        t1[start:stop] = tokens['input_ids'].numpy()
        t2[start:stop] = tokens['token_type_ids'].numpy()
        t3[start:stop] = tokens['attention_mask'].numpy()

    # One-hot encode all labels at once.
    y_data = np.zeros((total,num_classes))
    if total:
        y_data[np.arange(total),labels] = 1

    return (t1,t2,t3), y_data

### Prediction on test set

In [13]:
# Tokenise the held-out split once, keep only the model inputs (element 0 of the
# returned pair) and score them with the fine-tuned classifier.
test_data = get_test_data(df=df_test, text_col='tweets', label_col='labels')
inputs = test_data[0]
predictions = model.predict(inputs)

In [14]:
# True class ids against the arg-max of the predicted class probabilities.
y_test = df_test['labels']
y_test_pred = predictions.argmax(axis=1)
cf = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
cf

In [15]:
# Per-class precision / recall / F1 for the fine-tuned BERT classifier.
print(classification_report(y_true=y_test, y_pred=y_test_pred))

### BERT + LSTM hybrid

In [16]:
from tensorflow.keras.layers import Bidirectional, LSTM



### Step 1 is to convert all the texts into embeddings
### The Embeddings are generated from the fine-tuned BERT weights

In [17]:
### Load saved model
hyb_model = TFBertModel.from_pretrained("model/bert-tuned")
def load_embeddings(text_data,batch_size=150,model=hyb_model,max_length=25):
    """Encode texts into per-token BERT embeddings.

    Parameters
    ----------
    text_data : list of str
        Texts to encode, processed in order.
    batch_size : int, default 150
        Number of texts pushed through the model per forward pass.
    model : TFBertModel, default ``hyb_model``
        Encoder whose ``last_hidden_state`` is collected. The default is bound when
        this function is defined.
    max_length : int, default 25
        Fixed token length; texts are padded or truncated to it.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(text_data), max_length, hidden_size)``.
    """
    # NOTE(review): padding_side='left' matches the tokenizers used for fine-tuning in
    # this notebook. Keep them in sync if either is changed.
    tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased",padding_side='left',
                                             truncation_side='right')
    total = len(text_data)
    # The encoder outputs float32, so a float64 buffer would double the memory use
    # without adding precision. The hidden size comes from the model config rather
    # than a hard-coded 768.
    features = np.zeros((total,max_length,model.config.hidden_size),dtype=np.float32)
    for start in range(0,total,batch_size):
        # min() clamps the last window, so a partial final batch needs no separate branch.
        stop = min(start + batch_size,total)
        tokens = tokenizer(text_data[start:stop],max_length=max_length,padding="max_length",
                           truncation=True, return_tensors="tf")
        features[start:stop] = model(tokens).last_hidden_state.numpy()
    return features
            

In [18]:
# Raw tweet strings for each split, in DataFrame row order.
texts_train = df_train['tweets'].to_list()
texts_test = df_test['tweets'].to_list()

# Encode both splits with the fine-tuned BERT. `test_data` is re-bound here: from now
# on it holds the embedding array, not the tokenised inputs from the earlier cell.
train_data = load_embeddings(text_data=texts_train, batch_size=50, model=hyb_model)
test_data = load_embeddings(text_data=texts_test, batch_size=50, model=hyb_model)

In [19]:
# (number of training tweets, tokens per tweet, embedding size)
np.shape(train_data)

In [22]:
from tensorflow.keras import Sequential
# Bidirectional LSTM head trained on the precomputed BERT token embeddings
# (25 tokens x 768 features per tweet), followed by light dropout and a 3-way softmax.
lstm_model = Sequential([
    Bidirectional(LSTM(256),input_shape=(25,768)),
    Dropout(0.1),
    Dense(3,activation='softmax'),
])

# Sparse loss: the targets are integer class ids, not one-hot rows.
lstm_model.compile(
    optimizer="adam",
    loss='sparse_categorical_crossentropy',
    metrics='accuracy',
)
lstm_model.summary()

In [23]:
# Integer class ids for both splits; the test split doubles as validation data.
y_train = df_train['labels']
y_test = df_test['labels']
history = lstm_model.fit(
    x=train_data,
    y=y_train,
    batch_size=150,
    epochs=10,
    validation_data=(test_data,y_test),
)

### Plotting the model

In [25]:
h = history.history
# Take the epoch count from the recorded history rather than a hard-coded number, so
# the x-axis stays correct if the `epochs` argument of fit() changes or training
# stops early.
epochs = len(h['loss'])
epoch_axis = range(1,epochs+1)

plt.figure(figsize=(10,6))

# Left panel: loss. The legend tells the training curve from the validation curve.
plt.subplot(1,2,1)
plt.plot(epoch_axis,h['loss'],label="train")
plt.plot(epoch_axis,h['val_loss'],label="validation")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()

# Right panel: accuracy.
plt.subplot(1,2,2)
plt.plot(epoch_axis,h['accuracy'],label="train")
plt.plot(epoch_axis,h['val_accuracy'],label="validation")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.tight_layout()

### No significant improvement from this experiment

### Generating the classification report

In [26]:
# Predicted class = arg-max over the softmax outputs of the BiLSTM head.
y_test_pred = np.argmax(lstm_model.predict(test_data),axis=1)
cf = confusion_matrix(y_true=y_test, y_pred=y_test_pred)
print(cf)

In [28]:
# Per-class precision / recall / F1 for the BERT + BiLSTM hybrid.
print(classification_report(y_true=y_test, y_pred=y_test_pred))