<a href="https://colab.research.google.com/github/m-adil172000/Omdena_MindGuardian/blob/main/Psych8k_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
import numpy as np
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# read the json file as pandas dataframe

data = pd.read_json('Alexander_Street_shareGPT_2.0.json')
data.head()

Unnamed: 0,instruction,input,output
0,"If you are a counsellor, please answer the que...","Lately, I've been feeling really disconnected ...",Thank you for sharing your experience with me....
1,"If you are a counsellor, please answer the que...","Lately, I've been feeling really stressed and ...",Thank you for sharing your experience with me....
2,"If you are a counsellor, please answer the que...","Lately, I've been feeling a bit off. I sometim...",I appreciate you sharing your concerns with me...
3,"If you are a counsellor, please answer the que...","Lately, I've been feeling a bit off. I've noti...",Thank you for sharing your feelings and experi...
4,"If you are a counsellor, please answer the que...","Lately, I've been feeling quite off. I have be...",I appreciate you sharing your experiences with...


In [6]:
data.iloc[0]

instruction    If you are a counsellor, please answer the que...
input          Lately, I've been feeling really disconnected ...
output         Thank you for sharing your experience with me....
Name: 0, dtype: object

In [14]:
data.dtypes

instruction    object
input          object
output         object
dtype: object

In [8]:
data.isnull().sum()

instruction    0
input          0
output         0
dtype: int64

In [10]:
class TextPreprocessing(BaseEstimator, TransformerMixin):

  def __init__(self):
    self.stop_words = set(stopwords.words('english'))
    self.lemmatizer = WordNetLemmatizer()

  def fit(self, X, y=None):
    return self

  def transform(self, text):

    #converting to lowercase
    text = text.lower()

    #removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # tokenization
    words = word_tokenize(text)

    # removing stopwords
    words = [word for word in words if word not in self.stop_words]

    #lemmatization
    words = [self.lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [16]:
text_preprocessor = TextPreprocessing()

# creating the pipeline
pipeline = Pipeline([
    ('text_preprocessing', text_preprocessor)
])

In [18]:
# applying the preprocessing pipeline to all the columns

data['preprocessed_instruction'] = data['instruction'].apply(pipeline.fit_transform)
data['preprocessed_input'] = data['input'].apply(pipeline.fit_transform)
data['preprocessed_output'] = data['output'].apply(pipeline.fit_transform)

In [19]:
data.head()

Unnamed: 0,instruction,input,output,preprocessed_instruction,preprocessed_input,preprocessed_output
0,"If you are a counsellor, please answer the que...","Lately, I've been feeling really disconnected ...",Thank you for sharing your experience with me....,counsellor please answer question based descri...,lately ive feeling really disconnected reality...,thank sharing experience sound like youve deal...
1,"If you are a counsellor, please answer the que...","Lately, I've been feeling really stressed and ...",Thank you for sharing your experience with me....,counsellor please answer question based descri...,lately ive feeling really stressed affecting l...,thank sharing experience sound like going diff...
2,"If you are a counsellor, please answer the que...","Lately, I've been feeling a bit off. I sometim...",I appreciate you sharing your concerns with me...,counsellor please answer question based descri...,lately ive feeling bit sometimes find hard foc...,appreciate sharing concern good hear still fun...
3,"If you are a counsellor, please answer the que...","Lately, I've been feeling a bit off. I've noti...",Thank you for sharing your feelings and experi...,counsellor please answer question based descri...,lately ive feeling bit ive noticed ive trouble...,thank sharing feeling experience important rec...
4,"If you are a counsellor, please answer the que...","Lately, I've been feeling quite off. I have be...",I appreciate you sharing your experiences with...,counsellor please answer question based descri...,lately ive feeling quite experiencing strange ...,appreciate sharing experience seems like youve...


In [22]:
data_preprocessed = data[['preprocessed_instruction','preprocessed_input','preprocessed_output']]
data_preprocessed.head()

Unnamed: 0,preprocessed_instruction,preprocessed_input,preprocessed_output
0,counsellor please answer question based descri...,lately ive feeling really disconnected reality...,thank sharing experience sound like youve deal...
1,counsellor please answer question based descri...,lately ive feeling really stressed affecting l...,thank sharing experience sound like going diff...
2,counsellor please answer question based descri...,lately ive feeling bit sometimes find hard foc...,appreciate sharing concern good hear still fun...
3,counsellor please answer question based descri...,lately ive feeling bit ive noticed ive trouble...,thank sharing feeling experience important rec...
4,counsellor please answer question based descri...,lately ive feeling quite experiencing strange ...,appreciate sharing experience seems like youve...


In [24]:
data_preprocessed.shape

(8187, 3)

As we can see we have more than 8 thousand data entries, so let's perform sentiment analysis on a subset of our dataset

In [25]:
subset_data = data_preprocessed.sample(n=500, random_state=42)
subset_data.shape


(500, 3)

In [26]:
subset_data.head()

Unnamed: 0,preprocessed_instruction,preprocessed_input,preprocessed_output
2126,counsellor please answer question based descri...,want quit smoking think take lot willpower thi...,great youre considering quitting smoking invol...
4270,counsellor please answer question based descri...,situation called stubborn someone mom took asi...,clear mother special person life offered encou...
4894,counsellor please answer question based descri...,im struggling understanding certain aspect lif...,seems like couple option either find way peace...
6690,counsellor please answer question based descri...,ive dealing people calling birth name doesnt r...,im glad hear developed coping strategy deal pe...
1320,counsellor please answer question based descri...,dealing disease although went disability took ...,understand dealing disease overwhelming natura...


In [27]:
from transformers import pipeline

# Load a sentiment analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis')

# Perform sentiment analysis on the 'preprocessed_input' column
sentiments = sentiment_pipeline(subset_data['preprocessed_input'].tolist())

# Add the sentiment labels and scores to the DataFrame
subset_data['sentiment_label'] = [s['label'] for s in sentiments]
subset_data['sentiment_score'] = [s['score'] for s in sentiments]

subset_data.head()


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Unnamed: 0,preprocessed_instruction,preprocessed_input,preprocessed_output,sentiment_label,sentiment_score
2126,counsellor please answer question based descri...,want quit smoking think take lot willpower thi...,great youre considering quitting smoking invol...,NEGATIVE,0.994749
4270,counsellor please answer question based descri...,situation called stubborn someone mom took asi...,clear mother special person life offered encou...,NEGATIVE,0.993274
4894,counsellor please answer question based descri...,im struggling understanding certain aspect lif...,seems like couple option either find way peace...,NEGATIVE,0.954677
6690,counsellor please answer question based descri...,ive dealing people calling birth name doesnt r...,im glad hear developed coping strategy deal pe...,NEGATIVE,0.996055
1320,counsellor please answer question based descri...,dealing disease although went disability took ...,understand dealing disease overwhelming natura...,NEGATIVE,0.996824


In [30]:
subset_data.sentiment_label.unique()

array(['NEGATIVE', 'POSITIVE'], dtype=object)

In [32]:
unmask_pipeline1 = pipeline("fill-mask", model="nlp4good/psych-search")
#unmask_pipeline2 = pipeline("fill-mask", model="mental/mental-bert-base-uncased")
classify_pipeline1 = pipeline("text-classification",model="rabiaqayyum/autotrain-mental-health-analysis-752423172")
classify_pipeline2 = pipeline("text-classification",model="edmundhui/mental_health_trainer")

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/346 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/947 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/321 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [41]:
# Using unmask_pipeline1
text = subset_data['preprocessed_input'].iloc[0]
text = text + 'the individual can be diagnosed as [MASK].'
unmask_pipeline1(text)


[{'score': 0.09809008985757828,
  'token': 9814,
  'token_str': 'schizophrenia',
  'sequence': 'want quit smoking think take lot willpower thinking speaking husband try get quit well dont knowthe individual can be diagnosed as schizophrenia.'},
 {'score': 0.09531347453594208,
  'token': 4754,
  'token_str': 'depression',
  'sequence': 'want quit smoking think take lot willpower thinking speaking husband try get quit well dont knowthe individual can be diagnosed as depression.'},
 {'score': 0.08378783613443375,
  'token': 6025,
  'token_str': 'smoking',
  'sequence': 'want quit smoking think take lot willpower thinking speaking husband try get quit well dont knowthe individual can be diagnosed as smoking.'},
 {'score': 0.03703862428665161,
  'token': 12563,
  'token_str': 'copd',
  'sequence': 'want quit smoking think take lot willpower thinking speaking husband try get quit well dont knowthe individual can be diagnosed as copd.'},
 {'score': 0.035000357776880264,
  'token': 290,
  'tok

In [51]:
data['input'].iloc[2126]

"I want to quit smoking, but I think it will take a lot of willpower. I am thinking of speaking to my husband about it and try to get him to quit as well. But I don't know how to do that."

In [48]:
text_cl = subset_data['preprocessed_input'].iloc[0]

# Using classify pipeline1
classify_pipeline1(text_cl)

[{'label': 'bipolar', 'score': 0.4525376558303833}]

In [49]:
# Using classify pipeline2
classify_pipeline2(text_cl)

[{'label': 'depression', 'score': 0.962296187877655}]

In [52]:
# Load the BERT-Emotions-Classifier
classifier = pipeline("text-classification", model="ayoubkirouane/BERT-Emotions-Classifier")

# Perform emotion classification
results_raw = classifier(data['input'].iloc[2126])
results_preprocessed = classifier(text_cl)

# Display the classification results
print(results_raw)
print(results_preprocessed)

config.json:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

[{'label': 'sadness', 'score': 0.6843137145042419}]
[{'label': 'sadness', 'score': 0.5858019590377808}]


Pretrained model on English language for text classification. Model trained from tweet_emotion_eval (roberta-base fine-tuned on emotion task of tweet_eval dataset) on psychotherapy text transcripts.

Given a sentence, this model provides a binary classification as either symptomatic or non-symptomatic where symptomatic means the sentence displays signs of anxiety and/or depression.

In [54]:
classifier_eval = pipeline(task="text-classification", model="margotwagner/roberta-psychotherapy-eval")

model_outputs = classifier_eval(text_cl)
print(model_outputs[0])

pytorch_model.bin:  63%|######3   | 315M/499M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/394 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

{'label': 'nonsymptomatic', 'score': 0.9741930961608887}


In [55]:
text_test = data['input'].iloc[7]
text_test

"Lately, I've been feeling a bit weird. I've always been an introverted person, but these past few weeks, I feel even more isolated. I've started hearing voices occasionally—whispers that I can't really understand. It's getting hard for me to concentrate on my studies and work. Last week, I was on my usual morning run when I suddenly felt like someone was following me. I got paranoid and had to abandon the run. I'm not sure what's happening to me."

In [57]:
print(f"model - rabiaqayyum/autotrain-mental-health-analysis-752423172 : {classify_pipeline1(text_test)}")
print(f"model - edmundhui/mental_health_trainer : {classify_pipeline2(text_test)}")
print(f"model - ayoubkirouane/BERT-Emotions-Classifier : {classifier(text_test)}")
print(f"model - margotwagner/roberta-psychotherapy-eval : {classifier_eval(text_test)}")

model - rabiaqayyum/autotrain-mental-health-analysis-752423172 : [{'label': 'schizophrenia', 'score': 0.6431885957717896}]
model - edmundhui/mental_health_trainer : [{'label': 'aspergers', 'score': 0.913762629032135}]
model - ayoubkirouane/BERT-Emotions-Classifier : [{'label': 'sadness', 'score': 0.9500150084495544}]
model - margotwagner/roberta-psychotherapy-eval : [{'label': 'symptomatic', 'score': 0.9849622249603271}]


In [59]:
print(f"model - rabiaqayyum/autotrain-mental-health-analysis-752423172 : {classify_pipeline1(data['input'].iloc[137])}")
print(f"model - edmundhui/mental_health_trainer : {classify_pipeline2(data['input'].iloc[137])}")
print(f"model - ayoubkirouane/BERT-Emotions-Classifier : {classifier(data['input'].iloc[137])}")
print(f"model - margotwagner/roberta-psychotherapy-eval : {classifier_eval(data['input'].iloc[137])}")

model - rabiaqayyum/autotrain-mental-health-analysis-752423172 : [{'label': 'mentalhealth', 'score': 0.5940428972244263}]
model - edmundhui/mental_health_trainer : [{'label': 'ADHD', 'score': 0.9805335998535156}]
model - ayoubkirouane/BERT-Emotions-Classifier : [{'label': 'sadness', 'score': 0.9447564482688904}]
model - margotwagner/roberta-psychotherapy-eval : [{'label': 'symptomatic', 'score': 0.9865826964378357}]
