In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer, TFBertModel


### Let us compare the embeddings produced by the pretrained BERT and the fine-tuned BERT

In [2]:
# One shared tokenizer plus the two encoders under comparison: the stock
# "bert-base-uncased" checkpoint and the tuned weights from the input dataset.
# The tokenizer pads on the left and truncates from the right.
tc = AutoTokenizer.from_pretrained(
    "bert-base-uncased",
    padding_side="left",
    truncation_side="right",
    return_tensors="tf",
)
model_base = TFBertModel.from_pretrained("bert-base-uncased")
model_tuned = TFBertModel.from_pretrained("../input/model-bert-ft")

In [3]:
# Encode one probe sentence, padded/truncated to exactly 25 tokens, as TF tensors.
tokens = tc(
    "I am feeling lonely and I need someone",
    padding="max_length",
    truncation=True,
    max_length=25,
    return_tensors="tf",
)
tokens

In [4]:
# Pretrained encoder: hidden state at sequence position 0 of the probe sentence.
# NOTE(review): the tokenizer pads on the left, so for this short sentence
# position 0 is presumably a [PAD] slot rather than [CLS] - confirm the intent.
v1 = model_base(tokens).last_hidden_state[0, 0, :]

In [5]:
# Tuned encoder: hidden state at sequence position 0 of the probe sentence.
# NOTE(review): the tokenizer pads on the left, so for this short sentence
# position 0 is presumably a [PAD] slot rather than [CLS] - confirm the intent.
v2 = model_tuned(tokens).last_hidden_state[0, 0, :]

In [6]:
# Keep the whole model output this time and display the hidden states of
# every token position for the first (only) sequence in the batch.
v3 = model_base(tokens)
v3.last_hidden_state[0, :, :]

In [7]:
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Bidirectional, LSTM, GRU, Dense, Dropout

def num_batches(total, batch_size):
    """Return how many batches of ``batch_size`` are needed to cover ``total`` items.

    A trailing partial batch counts as a full one, i.e. the quotient is
    rounded up.
    """
    full, leftover = divmod(total, batch_size)
    if leftover == 0:
        return full
    return full + 1

def get_embeddings_for_lstm(texts,seq_len, model=model_tuned,tokenizer=tc,batch_size=512):
    """Encode ``texts`` into per-token hidden states for a downstream sequence model.

    The texts are tokenized in chunks of ``batch_size`` (each text padded or
    truncated to ``seq_len`` tokens), passed through ``model``, and every
    chunk's ``last_hidden_state`` is copied into one pre-allocated array.

    Args:
        texts: Sliceable sequence of strings (a ``list`` in this notebook).
        seq_len: Number of tokens each text is padded/truncated to.
        model: Encoder; calling it on the tokenizer output must return an
            object exposing ``last_hidden_state``.
        tokenizer: Callable tokenizer that can return TensorFlow tensors.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        ``numpy.ndarray`` of shape ``(len(texts), seq_len, hidden_size)``.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    total = len(texts)
    # Use the encoder's advertised width when available so encoders other than
    # BERT-base also fit; 768 remains the fallback.
    hidden_size = getattr(getattr(model, "config", None), "hidden_size", 768)
    # NOTE(review): np.zeros defaults to float64; the dtype is left unchanged
    # here, but float32 would halve the memory footprint of this array.
    embeddings = np.zeros((total, seq_len, hidden_size))

    # Slices that run past the end are clamped, so the final (possibly
    # partial) batch is handled by the same code path as the full ones.
    for start in range(0, total, batch_size):
        stop = start + batch_size
        encoded = tokenizer(
            texts[start:stop],
            padding='max_length',
            truncation=True,
            max_length=seq_len,
            return_tensors='tf',
        )
        embeddings[start:stop, :, :] = model(encoded).last_hidden_state.numpy()

    return embeddings

### Splitting the data into train and test

In [8]:
from sklearn.model_selection import train_test_split
# Class name -> integer id used as the training target.
mapper = {"Anxious": 0, "Normal": 1, "Stressed": 2, "Lonely": 3}

# Load the labelled tweets and replace the textual labels with their ids.
df_tweet = pd.read_csv("../input/tweet-mental-health-classification/train.csv")
df_tweet["labels"] = df_tweet["labels"].map(mapper)

# 75 % of the rows go to training; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(
    df_tweet,
    train_size=0.75,
    random_state=3,
)


### Now let us create the embeddings

In [9]:
# Raw tweet strings of each split as plain Python lists.
train_texts, test_texts = (
    split["tweets"].tolist() for split in (df_train, df_test)
)

In [10]:
# The embeddings below are computed with the pretrained encoder (model_base);
# pass model=model_tuned instead to embed with the tuned weights.

train_embeddings = get_embeddings_for_lstm(train_texts, seq_len=25, model=model_base)
test_embeddings = get_embeddings_for_lstm(test_texts, seq_len=25, model=model_base)

import pickle

def write_pkl(data, file):
    """Pickle ``data`` into the file at path ``file``, overwriting any existing file."""
    with open(file, mode="wb") as sink:
        pickle.dump(data, sink)

# Cache both embedding arrays on disk so a later cell can reload them.
write_pkl(data=train_embeddings, file="train-embeddings.pkl")
write_pkl(data=test_embeddings, file="test-embeddings.pkl")


In [None]:
# Bidirectional LSTM classifier: consumes (25 tokens x 768 features) embedding
# sequences and ends in a 4-way softmax over the label ids.
lstm_model = Sequential()
lstm_model.add(Bidirectional(LSTM(256), input_shape=(25, 768)))
lstm_model.add(Dense(4, activation="softmax"))
# `metrics` is documented as a list of metrics; wrapping the name in a list
# follows that contract and leaves the history key "categorical_accuracy" as is.
lstm_model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)
lstm_model.summary()

### Define GRU model

In [11]:
# Bidirectional GRU classifier: consumes (25 tokens x 768 features) embedding
# sequences and ends in a 4-way softmax over the label ids.
gru_model = Sequential()
gru_model.add(Bidirectional(GRU(256), input_shape=(25, 768)))
gru_model.add(Dense(4, activation="softmax"))
# `metrics` is documented as a list of metrics; wrapping the name in a list
# follows that contract and leaves the history key "categorical_accuracy" as is.
gru_model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["categorical_accuracy"],
)
gru_model.summary()

In [12]:
# One-hot encode the integer labels of each split.
# The category list is fixed up front so both splits always produce one column
# per class in the same order, even if a class happens to be absent from a
# split; the 4-unit softmax output of the models relies on that.
label_ids = sorted(mapper.values())
y_train = pd.get_dummies(pd.Categorical(df_train["labels"], categories=label_ids)).values
y_test = pd.get_dummies(pd.Categorical(df_test["labels"], categories=label_ids)).values

# Persist the targets next to the embeddings so they can be reloaded later.
write_pkl(y_train, "y_train.pkl")
write_pkl(y_test, "y_test.pkl")

In [14]:
# Train the bidirectional LSTM head on the cached embeddings; the held-out
# split is used as validation data.
history = lstm_model.fit(
    train_embeddings,
    y_train,
    validation_data=(test_embeddings, y_test),
    batch_size=512,
    epochs=5,
)
# The GRU training cell writes to "model.h5" as well, so whichever model is
# trained last replaces the other one there. Keep an LSTM-specific copy so
# this checkpoint survives, and still write "model.h5" for the evaluation cell.
lstm_model.save("lstm-model.h5")
lstm_model.save("model.h5")

In [13]:
# Train the bidirectional GRU head on the cached embeddings; the held-out
# split is used as validation data.
history = gru_model.fit(
    train_embeddings,
    y_train,
    validation_data=(test_embeddings, y_test),
    batch_size=512,
    epochs=10,
)
# The LSTM training cell writes to "model.h5" as well, so whichever model is
# trained last replaces the other one there. Keep a GRU-specific copy so
# this checkpoint survives, and still write "model.h5" for the evaluation cell.
gru_model.save("gru-model.h5")
gru_model.save("model.h5")

### Plotting the training history

In [None]:
#h = history.history
def plot_data(h,epochs):
    """Plot training and validation curves for loss and accuracy side by side.

    Args:
        h: Mapping holding the per-epoch sequences "loss", "val_loss",
            "categorical_accuracy" and "val_categorical_accuracy"
            (e.g. the ``history`` attribute of a Keras ``History`` object).
        epochs: Number of epochs; every sequence in ``h`` must contain
            exactly this many values.
    """
    epoch_axis = range(1, epochs + 1)
    fig, (loss_ax, acc_ax) = plt.subplots(1, 2, figsize=(16, 9))

    # Left panel: loss curves.
    loss_ax.plot(epoch_axis, h["loss"], label="train")
    loss_ax.plot(epoch_axis, h["val_loss"], label="validation")
    loss_ax.set(title="Loss", xlabel="epoch", ylabel="loss")
    loss_ax.legend()

    # Right panel: accuracy curves.
    acc_ax.plot(epoch_axis, h["categorical_accuracy"], label="train")
    acc_ax.plot(epoch_axis, h["val_categorical_accuracy"], label="validation")
    acc_ax.set(title="Categorical accuracy", xlabel="epoch", ylabel="accuracy")
    acc_ax.legend()
    
#plot_data(h,5)

In [1]:

from sklearn.metrics import confusion_matrix,classification_report
#### Avoid OOM
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
import pickle

def load_assets(file_path):
    """Return the object unpickled from ``file_path``.

    Only meant for files this notebook wrote itself: unpickling runs
    arbitrary code, so never point it at untrusted data.
    """
    with open(file_path, mode="rb") as source:
        return pickle.load(source)

# Reload the cached embeddings and one-hot targets written by the earlier cells.
train_embeddings, test_embeddings, y_train, y_test = (
    load_assets(name)
    for name in (
        "train-embeddings.pkl",
        "test-embeddings.pkl",
        "y_train.pkl",
        "y_test.pkl",
    )
)
# Restore the trained classifier saved under "model.h5".
model = keras.models.load_model("model.h5")


#train_pred = lstm_model.predict(train_embeddings,batch_size=25)
#labels = np.argmax(train_pred,axis=1)
#print(classification_report(y_train,train_pred))

In [2]:
# Per-class precision/recall on the training split: turn the softmax outputs
# and the one-hot targets back into class ids before comparing them.
train_pred = model.predict(train_embeddings, batch_size=10)
labels = train_pred.argmax(axis=1)
print(classification_report(y_train.argmax(axis=1), labels))

### Let us evaluate the model on the test data

In [3]:
# Same report for the held-out split; `labels` now holds the test predictions.
test_pred = model.predict(test_embeddings, batch_size=10)
labels = test_pred.argmax(axis=1)
print(classification_report(y_test.argmax(axis=1), labels))

In [4]:
# Confusion matrix for the test split (rows: true class id, columns: predicted).
# Relies on `labels` still holding the test-set predictions from the cell above.
true_ids = y_test.argmax(axis=1)
cf = confusion_matrix(true_ids, labels)
cf