In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/mental-health-corpus/mental_health.csv


# Importing Packages

In [17]:
import matplotlib.pyplot as plt 
import seaborn as sns
plt.style.use('fivethirtyeight')

import spacy
import nltk 
import string
import regex as re 
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from tensorflow.keras.layers import TextVectorization
import tensorflow as tf 
from tensorflow.keras.layers import LSTM, Dropout, Bidirectional, Dense, Embedding, BatchNormalization
from tensorflow.keras.models import Sequential,load_model,save_model
from tensorflow.keras.metrics import Precision, Recall, CategoricalAccuracy

In [4]:
train = pd.read_csv('/kaggle/input/mental-health-corpus/mental_health.csv')

# Modules to remove the stopwords

In [5]:
nlp = spacy.load('en_core_web_lg')
sp = spacy.load('en_core_web_lg')

nltk.download('stopwords')
nltk.download('punkt')

spacy_st = nlp.Defaults.stop_words
nltk_st = stopwords.words('english')

[nltk_data] Error loading stopwords: <urlopen error [Errno -3]
[nltk_data]     Temporary failure in name resolution>
[nltk_data] Error loading punkt: <urlopen error [Errno -3] Temporary
[nltk_data]     failure in name resolution>


# Function to clean the text

In [6]:
def clean(text, http=True, punc=True, lem=True, stop_w=True):
    if http == True:
        text = re.sub('https?:\/\/t.co\/[A-Za-z0-9]*', '', text)
    if stop_w == True:
        text = [word for word in word_tokenize(text) if not word.lower() in nltk_st]
        text = ' '.join(text)
    if lem == True:
        lemmatized = [word.lemma_ for word in sp(text)]
        text = ' '.join(lemmatized)
    if punc == True:
        text = text.translate(str.maketrans('', '', string.punctuation))
        
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"im", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('a{2,}', '', text)
    text = re.sub('b{2,}', '', text)
    text = re.sub('c{2,}', '', text)
    text = re.sub('d{2,}', '', text)
    text = re.sub('e{2,}', '', text)
    text = re.sub('f{2,}', '', text)
    text = re.sub('g{2,}', '', text)
    text = re.sub('h{2,}', '', text)
    text = re.sub('i{2,}', '', text)
    text = re.sub('j{2,}', '', text)
    text = re.sub('k{2,}', '', text)
    text = re.sub('l{2,}', '', text)
    text = re.sub('m{2,}', '', text)
    text = re.sub('n{2,}', '', text)
    text = re.sub('o{2,}', '', text)
    text = re.sub('p{2,}', '', text)
    text = re.sub('q{2,}', '', text)
    text = re.sub('r{2,}', '', text)
    text = re.sub('s{2,}', '', text)
    text = re.sub('t{2,}', '', text)
    text = re.sub('u{2,}', '', text)
    text = re.sub('v{2,}', '', text)
    text = re.sub('w{2,}', '', text)
    text = re.sub('x{2,}', '', text)
    text = re.sub('y{2,}', '', text)
    text = re.sub('z{2,}', '', text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    
    return text

In [7]:
train['cleaned_text'] = train['text'].apply(lambda text: clean(text, http=True, punc=True, lem=True, stop_w=True))

In [8]:
train.drop(columns=['text'], axis=1, inplace=True)

In [9]:
X = train['cleaned_text']
y = train['label']

MAX_FEATURES = 200000

vectorizer = TextVectorization(
    max_tokens=MAX_FEATURES, 
    output_sequence_length=1000, 
    output_mode='int'
)

vectorizer.adapt(X.values)

In [10]:
vectorizerd_text = vectorizer(X.values)

In [11]:
dataset = tf.data.Dataset.from_tensor_slices((vectorizerd_text, y))
dataset = dataset.cache()
dataset = dataset.shuffle(160000)
dataset = dataset.batch(32) 
dataset = dataset.prefetch(8)

In [12]:
batch_X, batch_y = dataset.as_numpy_iterator().next()
batch_X.shape, batch_y.shape

((32, 1000), (32,))

In [13]:
train = dataset.take(int(len(dataset)*.7))
val = dataset.skip(int(len(dataset)*.7)).take(int(len(dataset)*.2))
test = dataset.skip(int(len(dataset)*.9)).take(int(len(dataset)*.1))
len(train), len(val), len(test)

(612, 175, 87)

In [14]:
model = Sequential()

model.add(Embedding(MAX_FEATURES + 1, 32))
model.add(Bidirectional(LSTM(32, activation='tanh')))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.1))
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.1))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.1))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='BinaryCrossentropy', optimizer='Adam', metrics=['accuracy'])
model.summary()
hist = model.fit(train, epochs=3, batch_size=32, validation_data=val)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, None, 32)          6400032   
                                                                 
 bidirectional (Bidirectiona  (None, 64)               16640     
 l)                                                              
                                                                 
 dense (Dense)               (None, 128)               8320      
                                                                 
 batch_normalization (BatchN  (None, 128)              512       
 ormalization)                                                   
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 256)               3

In [15]:
pre = Precision()
re = Recall()
acc = CategoricalAccuracy()

for batch in test.as_numpy_iterator():
    X_true, y_true = batch
    yhat = model.predict(X_true)
    
    y_preds = []
    for y in yhat:
        if y >= 0.5:
            y_preds.append(1)
        else:
            y_preds.append(0)
    
    y_true = y_true.flatten()
    
    pre.update_state(y_true, y_preds)
    re.update_state(y_true, y_preds)
    acc.update_state(y_true, y_preds)

print(f'Precision: {pre.result().numpy()}, Recall: {re.result().numpy()}, Accuracy: {acc.result().numpy()}')

Precision: 0.9417071342468262, Recall: 0.9869090914726257, Accuracy: 0.931034505367279


In [19]:
model.save('model.h5')

In [21]:
os.listdir()

['__notebook_source__.ipynb', '.virtual_documents', 'model.h5']