In [2]:
# Mount Google Drive at /content/drive so files stored there can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Directory on the mounted Drive that holds the dataset files read below
# (data.json and grouped_by_topic.json).
path = "/content/drive/MyDrive/colab_data/data"

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

In [10]:
# Load the raw dataset. The context manager closes the file handle as soon as
# parsing finishes instead of leaving an open handle for the garbage collector.
with open(path + "/data.json") as f:
  data = json.load(f)

In [12]:
# Show only the first (key, record) pair so the record layout can be inspected
# without printing the whole dataset. Nothing is printed if `data` is empty.
first_item = next(iter(data.items()), None)
if first_item is not None:
  print(*first_item)

0 {'question_title': 'Do I have too many issues for counseling?', 'question_text': 'I have so many issues to address. I have a history of sexual abuse, I’m a breast cancer survivor and I am a lifetime insomniac.    I have a long history of depression and I’m beginning to have anxiety. I have low self esteem but I’ve been happily married for almost 35 years.\n   I’ve never had counseling about any of this. Do I have too many issues to address in counseling?', 'question_link': 'https://counselchat.com/questions/do-i-have-too-many-issues-for-counseling', 'topic': 'depression', 'responses': [{'therapist_info': 'Jennifer MolinariHypnotherapist & Licensed Counselor', 'therapist_url': 'https://counselchat.com/therapists/jennifer-molinari', 'answer_text': 'It is very common for\xa0people to have multiple issues that they want to (and need to) address in counseling.\xa0 I have had clients ask that same question and through more exploration, there is often an underlying fear that they\xa0 "can\'

In [20]:
# Collect the distinct "topic" values across every record in `data`.
topics = {record["topic"] for record in data.values()}

print("Number of topics: ", len(topics))
print("--")
for name in topics:
  print(name)

Number of topics:  31
--
social-relationships
professional-ethics
marriage
grief-and-loss
children-adolescents
parenting
eating-disorders
self-esteem
self-harm
depression
counseling-fundamentals
human-sexuality
sleep-improvement
military-issues
addiction
trauma
family-conflict
legal-regulatory
lgbtq
substance-abuse
diagnosis
relationships
relationship-dissolution
behavioral-change
domestic-violence
workplace-relationships
anxiety
spirituality
intimacy
anger-management
stress


## A better way to look at the data is through the lens of grouping by topic

In [21]:
# Load the topic-grouped view of the dataset; `topics` is rebound here to the
# parsed JSON object (iterated below as topic name -> list of entries).
with open(f"{path}/grouped_by_topic.json") as fh:
  topics = json.load(fh)

In [28]:
# Are there any missing values? Count entries whose question text or answer
# text is absent (None) or an empty string. The two counters are independent:
# an entry missing both fields is counted once in each.
missing_question_cnt = 0
missing_response_cnt = 0
for entries in topics.values():
  for entry in entries:
    question_text = entry['questionText']
    answer_text = entry['answerText']
    # Compare against None with `is` (identity), not `==`.
    if question_text is None or question_text == '':
      missing_question_cnt += 1
    if answer_text is None or answer_text == '':
      missing_response_cnt += 1

print('Missing questions', missing_question_cnt)
print('Missing responses', missing_response_cnt)

Missing questions 139
Missing responses 26


## Number of questions per topic

In [30]:
# Rank topics by how many entries each one holds, largest first. Each element
# is an (entry_count, topic_name) tuple, so ties fall back to the topic name.
topic_arr = sorted(
  ((len(entries), name) for name, entries in topics.items()),
  reverse=True,
)
for count_and_name in topic_arr:
  print(count_and_name)

(465, 'depression')
(358, 'anxiety')
(270, 'counseling-fundamentals')
(248, 'intimacy')
(202, 'relationships')
(191, 'parenting')
(144, 'family-conflict')
(102, 'trauma')
(100, 'self-esteem')
(98, 'relationship-dissolution')
(69, 'behavioral-change')
(65, 'lgbtq')
(52, 'marriage')
(48, 'anger-management')
(47, 'spirituality')
(45, 'substance-abuse')
(41, 'professional-ethics')
(37, 'grief-and-loss')
(36, 'workplace-relationships')
(25, 'social-relationships')
(25, 'diagnosis')
(21, 'domestic-violence')
(16, 'eating-disorders')
(14, 'legal-regulatory')
(13, 'stress')
(13, 'addiction')
(11, 'sleep-improvement')
(8, 'children-adolescents')
(7, 'human-sexuality')
(3, 'military-issues')
(1, 'self-harm')


### Training a multi-class topic classifier on this dataset will be challenging, given the severe class imbalance.

In [32]:
# Compute the number of whitespace-separated words in each question and its
# response. Entries with a missing (None or empty) question or answer are
# skipped entirely, so the two lists stay aligned index-for-index.
question_word_count = []
response_word_count = []

for entries in topics.values():
  for entry in entries:
    question_text = entry['questionText']
    # Compare against None with `is` (identity), not `==`.
    if question_text is None or question_text == '':
      continue
    answer_text = entry['answerText']
    if answer_text is None or answer_text == '':
      continue
    question_word_count.append(len(question_text.split()))
    response_word_count.append(len(answer_text.split()))

In [33]:
# Report the mean word count for questions and for responses, in that order.
for label, counts in (
  ("questions", question_word_count),
  ("responses", response_word_count),
):
  print(f"Number of words in {label}: ", np.mean(counts))

Number of words in questions:  51.90811638591118
Number of words in responses:  169.3093415007657


So the average question length (about 52 words) is short enough for a Sentence-BERT model to create embeddings for the questions.