<a href="https://colab.research.google.com/github/mindyng/mindyng.github.io/blob/master/Burnout_LSTM_Deployment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Set Up Kaggle Dependencies

In [45]:
# Install Kaggle library
!pip install -q kaggle
from google.colab import files
#upload the kaggle.json file that was downloaded
uploaded = files.upload()
# make a diectory in which kaggle.json is stored
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
#download the dataset into colab(paste API command after !)
!kaggle datasets download -d mindyng/healthcareworkersburnout
#unzip the data
!unzip healthcareworkersburnout.zip

Saving kaggle.json to kaggle (1).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading healthcareworkersburnout.zip to /content
  0% 0.00/154k [00:00<?, ?B/s]
100% 154k/154k [00:00<00:00, 72.8MB/s]
Archive:  healthcareworkersburnout.zip
  inflating: df.csv                  


# Load Dependencies

In [46]:
import numpy as np
import pandas as pd
import string
from keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

#Pre-process Data

Clear data of HTML tags, numbers, special characters

In [47]:
def remove_special_characters(text):
    '''Make text lowercase, remove text in square brackets, remove links, remove punctuation.'''
    text = text.lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    #text = re.sub('\w*\d\w*', '', text)
    return text

def remove_num(text):
 return "".join(re.sub('([0–9]+)','',text))

data = pd.read_csv('/content/df.csv')
data['Tweet Text']=data['Tweet Text'].apply(lambda x : remove_num(x))
data['Tweet Text'] = data['Tweet Text'].apply(lambda x: remove_special_characters(x))
data.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Tweet Date,Tweet ID,Tweet Text,tweet_source,tweet_retweet_count,tweet_favorite_count,word_count,char_count,word_density,punc_count,sentiment,year,month,day,unique_word_count,stop_word_count,url_count,mean_word_length,hashtag_count,mention_count,burnout
0,0,0,2021-01-19,1351590096752017416,im a big music person it speaks to me in all w...,Twitter for Android,0,6,27,142,0.230769,12,-0.0625,2021,1,19,25,15,1,4.296296,0,0,1
1,1,1,2021-01-19,1351555038863368193,johnwharris15 i adore you and your unlimited s...,Twitter for Android,0,1,9,59,0.176471,1,0.0,2021,1,19,9,4,0,5.555556,0,1,1
2,2,2,2021-01-19,1351554892926750720,bubblydncer i have replied to countless texts ...,Twitter for Android,0,1,22,142,0.180328,14,-0.125,2021,1,19,20,10,1,5.5,0,1,1
3,3,3,2021-01-19,1351554395234824192,melbeer3 ahhh i love this,Twitter for Android,0,0,5,29,0.2,2,0.625,2021,1,19,5,1,0,4.8,0,1,0
4,4,4,2021-01-19,1351554266247426050,thekimclub,Twitter for Android,0,1,2,35,0.057143,6,0.0,2021,1,19,2,0,1,17.0,0,1,1


In [54]:
#checking burnout classification is right-- it is
data[data['Tweet Text'].str.contains("im tired im cranky im emotional")] 

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Tweet Date,Tweet ID,Tweet Text,tweet_source,tweet_retweet_count,tweet_favorite_count,word_count,char_count,word_density,punc_count,sentiment,year,month,day,unique_word_count,stop_word_count,url_count,mean_word_length,hashtag_count,mention_count,burnout
99,99,110,2020-12-31,1344777500019335169,as the clock strikes the beckoning hour we all...,Twitter for Android,2,30,24,142,0.2,8,-0.3,2020,12,31,20,12,1,4.958333,0,0,1


# Word Embeddings

In [49]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(data["Tweet Text"])
X = tokenizer.texts_to_sequences(data["Tweet Text"])
X = pad_sequences(X,maxlen=500)
Y = data["burnout"]
vocab_size = len(tokenizer.word_index) + 1

In [50]:
# We can then create our train and test sets:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 24)
#We store this tokenizer in a file to use later in web app
import pickle
# saving
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

No need to encode targets since they are already in numerical format (0/1).

# Model Architecture Build

In [51]:
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=500))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(1,activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 500, 50)           287750    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               183296    
_________________________________________________________________
dropout_4 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 257       
Total params: 471,303
Trainable params: 471,303
Non-trainable params: 0
_________________________________________________________________


# Training the Model

In [52]:
from keras.callbacks import EarlyStopping
es = EarlyStopping(monitor="val_loss", mode="min", verbose=1, patience=5)
history=model.fit(X_train, Y_train,
 batch_size=128,
 epochs=20,
 validation_data=[X_test, Y_test],
 callbacks=[es])
#We save this model so that we can use in own web app
model.save('HCW_burn.h5')

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 00006: early stopping


# Assess Model Prediction Accuracy

In [53]:
test_string="Today was not a good day. I'm in a bad headspace, I cried often for good reason and no reason at all. I'm not telling this for sympathy but because I know so many of you can relate. It's ok not to be ok. There is strength in admitting weakness. That's what I'm telling myself."
x_1=tokenizer.texts_to_sequences([test_string])
x_1 = pad_sequences(x_1,maxlen=500)
model.predict(x_1)
#Output (closer to 1 = positive review and closer to 0 = negative review)
#expect negative (close to 0)

array([[0.02027655]], dtype=float32)

# Deployment Using Streamlit

In [None]:
# import streamlit as st
# import pickle
# from tensorflow.keras.models import load_model
# from tensorflow.keras.preprocessing.text import Tokenizer
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# def predict(message):
#     model=load_model("HCW_burn.h5")
#     with open('tokenizer.pickle', 'rb') as handle:
#         tokenizer = pickle.load(handle)
#         x_1 = tokenizer.texts_to_sequences([message])
#         x_1 = pad_sequences(x_1, maxlen=500)
#         predictions = model.predict(x_1)[0][0]
#         return predictions

# st.title("HCW Sentiment Analyzer")
# message = st.text_area("Enter Tweet","Type Here ..")

# if st.button("Analyze"):
#     with st.spinner("Analyzing the text…"):
#         prediction=predict(message)
#         if prediction > 0.6:
#             st.success("Positive tweet with {:.2f} confidence".format(prediction))
#             st.balloons()
#         elif prediction <0.4:
#             st.error("Negative tweet with {:.2f} confidence".format(1-prediction))
#         else:
#             st.warning("Not sure! Try to add some more words/context")