In [1]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import numpy as np

In [2]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /Users/girasen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/girasen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/girasen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/girasen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [4]:
#Loading and splitting the dataset between train and test
org_df=pd.read_csv('Mental_health.csv')
data=org_df.dropna(how='all')
data=data.dropna()
data = data.dropna().reset_index(drop=True)

In [6]:
#Preprocessing of the data
wnl=WordNetLemmatizer()
stop_words=set(stopwords.words('english'))
for i in range(len(data)):
    data.iloc[i,1]=re.sub(r'[^a-zA-Z0-9\s]','',data.iloc[i,1].lower())
    data.iloc[i,1]=re.sub(r'\s',' ',data.iloc[i,1])
    word_tokens=word_tokenize(str(data.iloc[i,1]))
    filtered_stop_words=[x for x in word_tokens if x not in stop_words]
    lemmatized_words=[wnl.lemmatize(x,pos="v") for x in filtered_stop_words]
    data.iloc[i,1]=' '.join(lemmatized_words)

In [7]:
#Building vocabulary list from the dataset
tokenize_words=[word_tokenize(i) for i in data['statement']]
vocabulary=set()
for sentence in tokenize_words:
    vocabulary.update(sentence)
vocab=sorted(list(vocabulary))


In [9]:
# Creating bag of words
def create_bow(sentence,vocab):
    vector=[0]*len(vocab)
    for i in sentence:
        if i in vocab:
            idx=vocab.index(i)
            vector[idx]+=1
    return vector

vectors=[create_bow(sentence,vocab) for sentence in tokenize_words]

In [50]:
#Using TF-IDF vectorization
merged_sentences=[' '.join(i) for i in tokenize_words]
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2),max_features=5000)
X_tfidf = tfidf_vectorizer.fit_transform(merged_sentences)
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("Array representation:") 
print(X_tfidf.toarray())

Vocabulary: ['00' '10' '10 minutes' ... 'zoloft' 'zombie' 'zone']
Array representation:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
#Creating a  numeric label column
label=[i for i in data['status']]

In [16]:
#Printing unique labels(sentiments) in the data
unique_labels=list(set(label))
print(unique_labels)

['Suicidal', 'Depression', 'Personality disorder', 'Bipolar', 'Normal', 'Anxiety', 'Stress']


In [17]:
#Adding a label column
data['label'] = pd.factorize(data['status'])[0]

In [18]:
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status,label
0,0,oh my gosh,Anxiety,0
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety,0
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety,0
3,3,I've shifted my focus to something else but I'...,Anxiety,0
4,4,"I'm restless and restless, it's been a month n...",Anxiety,0


In [47]:
#Splitting the TF-IDF data for train and test 
X=X_tfidf
y=data['label']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=40)

In [48]:
#Implementing Random forest 
rf_model=RandomForestClassifier(n_estimators=100,random_state=40)
#Fit the model for training data
rf_model.fit(X_train,y_train)
#Predict it on the test data
y_pred=rf_model.predict(X_test)

In [49]:
#Accuracy and classification report
accuracy_rep=accuracy_score(y_test,y_pred)
classification_rep=classification_report(y_test,y_pred)
print('Accuracy: ', accuracy_rep)
print('Classification report: ')
print(classification_rep)

Accuracy:  0.7120622568093385
Classification report: 
              precision    recall  f1-score   support

           0       0.83      0.63      0.72       767
           1       0.81      0.94      0.87      3263
           2       0.59      0.81      0.68      3147
           3       0.68      0.44      0.53      2040
           4       0.96      0.25      0.39       541
           5       0.97      0.52      0.68       593
           6       0.98      0.31      0.47       186

    accuracy                           0.71     10537
   macro avg       0.83      0.56      0.62     10537
weighted avg       0.74      0.71      0.70     10537



In [15]:
# Implementing count vecotrizer to check if the accuracy can be improved
count_vectorizer = CountVectorizer(ngram_range=(1,3))
X = count_vectorizer.fit_transform(merged_sentences)

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=40)

In [None]:
rf_model=RandomForestClassifier(n_estimators=100,random_state=40)
#Fit the model for training data
rf_model.fit(X_train,y_train)
#Predict it on the test data
y_pred=rf_model.predict(X_test)

In [None]:
accuracy_rep=accuracy_score(y_test,y_pred)
classification_rep=classification_report(y_test,y_pred)
print('Accuracy: ', accuracy_rep)
print('Classification report: ')
print(classification_rep)

In [41]:
#Predict a sentence
def predict_sentence(sen):
    sen=re.sub(r'[^a-zA-Z0-9\s]','',sen.lower())
    sen=re.sub(r'\s',' ',sen)
    word_tokens=word_tokenize(sen)
    filtered_stop_words=[x for x in word_tokens if x not in stop_words]
    lemmatized_words=[wnl.lemmatize(x,pos="v") for x in filtered_stop_words]
    sen=' '.join(lemmatized_words)
    return sen
sen=predict_sentence("I am feelig sad!")
count_vec_pred = tfidf_vectorizer.transform([sen])
pred_class=rf_model.predict(count_vec_pred)
print(pred_class)

[1]


In [51]:
## Implementing transformer for feature extraction and then using ML models on top of that for classifications

In [7]:
from transformers import BertTokenizer, BertModel
import torch

In [8]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [9]:
sentence = "This is a sample sentence."
inputs = tokenizer(sentence, return_tensors='pt')

In [10]:
print(inputs)

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 7099, 6251, 1012,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


In [11]:
with torch.no_grad(): outputs = model(**inputs)
sentence_embedding = outputs.pooler_output  # Size: [1, 768]
print(sentence_embedding.shape)

torch.Size([1, 768])


In [12]:
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [19]:
X=data['statement']
y=data['label']

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)


In [21]:
import numpy as np

In [22]:
def bert_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors='pt',truncation=True, padding=True, max_length=512)
    with torch.no_grad(): outputs = model(**inputs)
    sentence_embedding = outputs.pooler_output 
    return sentence_embedding.squeeze().numpy()
X_train_embeddings=[bert_embedding(x) for x in X_train]


KeyboardInterrupt

