In [71]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import pandas as pd
from sklearn.model_selection import train_test_split
import re
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

In [57]:
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download("omw-1.4")

[nltk_data] Downloading package punkt to /Users/girasen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/girasen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/girasen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/girasen/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [58]:
#Loading and splitting the dataset between train and test
org_df=pd.read_csv('Mental_health.csv')

data=org_df.dropna(how='all')
data=data.dropna()
data = data.dropna().reset_index(drop=True)
# X=org_df['statement']
# y=org_df['status']
# X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=40)
# print(data.loc[2,0])

In [59]:
#Preprocessing of the data
wnl=WordNetLemmatizer()
stop_words=set(stopwords.words('english'))
for i in range(len(data)):
    # print(data.iloc[i,1])
    data.iloc[i,1]=re.sub(r'[^a-zA-Z0-9\s]','',data.iloc[i,1].lower())
    data.iloc[i,1]=re.sub(r'\s',' ',data.iloc[i,1])
    word_tokens=word_tokenize(str(data.iloc[i,1]))
    filtered_stop_words=[x for x in word_tokens if x not in stop_words]
    lemmatized_words=[wnl.lemmatize(x,pos="v") for x in filtered_stop_words]
    data.iloc[i,1]=' '.join(lemmatized_words)
    # print(data.iloc[i,1])

In [60]:
#Building vocabulary list from the dataset
tokenize_words=[word_tokenize(i) for i in data['statement']]
vocabulary=set()
for sentence in tokenize_words:
    vocabulary.update(sentence)
vocab=sorted(list(vocabulary))
# print(len(vocab))


In [None]:
# Creating bag of words
def create_bow(sentence,vocab):
    vector=[0]*len(vocab)
    for i in sentence:
        if i in vocab:
            idx=vocab.index(i)
            vector[idx]+=1
    return vector

vectors=[create_bow(sentence,vocab) for sentence in tokenize_words]

In [61]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [62]:
merged_sentences=[' '.join(i) for i in tokenize_words]
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(merged_sentences)
print("Vocabulary:", tfidf_vectorizer.get_feature_names_out())
print("Array representation:") 
print(X_tfidf.toarray())

Vocabulary: ['00' '000' '0000' ... 'zzzz' 'zzzzzz' 'zzzzzzzzz']
Array representation:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [63]:
label=[i for i in data['status']]

In [65]:
unique_labels=list(set(label))
print(unique_labels)

['Normal', 'Stress', 'Suicidal', 'Anxiety', 'Bipolar', 'Personality disorder', 'Depression']


In [67]:
#Added label column
data['label'] = pd.factorize(data['status'])[0]

In [68]:
data.head()

Unnamed: 0.1,Unnamed: 0,statement,status,label
0,0,oh gosh,Anxiety,0
1,1,trouble sleep confuse mind restless heart tune,Anxiety,0
2,2,wrong back dear forward doubt stay restless re...,Anxiety,0
3,3,ive shift focus something else im still worry,Anxiety,0
4,4,im restless restless month boy mean,Anxiety,0


In [74]:
#Splitting the data for train and test
X=X_tfidf
y=data['label']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=40)
print(y_train.shape)

(42144,)


In [75]:
#Implementing Random forest
rf_model=RandomForestClassifier(n_estimators=100,random_state=40)
#Fit the model for training data
rf_model.fit(X_train,y_train)
#Predict it on the test data
y_pred=rf_model.predict(X_test)

In [76]:
#Accuracy and classification report
accuracy_rep=accuracy_score(y_test,y_pred)
classification_rep=classification_report(y_test,y_pred)
print('Accuracy: ', accuracy_rep)
print('Classification report: ')
print(classification_rep)

Accuracy:  0.6950744993831262
Classification report: 
              precision    recall  f1-score   support

           0       0.89      0.56      0.69       767
           1       0.80      0.95      0.87      3263
           2       0.56      0.81      0.66      3147
           3       0.68      0.41      0.51      2040
           4       1.00      0.21      0.35       541
           5       0.99      0.40      0.57       593
           6       1.00      0.29      0.45       186

    accuracy                           0.70     10537
   macro avg       0.85      0.52      0.59     10537
weighted avg       0.74      0.70      0.67     10537

