In [1]:
## helpful packages
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import random
import re
import string

## nltk imports
import nltk
### uncomment and run these lines if you haven't downloaded relevant nltk add-ons yet
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
from nltk import pos_tag
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

## spacy imports
import spacy
### uncomment and run the below line if you haven't loaded the en_core_web_sm library yet
! python -m spacy download en_core_web_sm
import en_core_web_sm
nlp = en_core_web_sm.load()

## vectorizer
from sklearn.feature_extraction.text import CountVectorizer

## sentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

## lda
from gensim import corpora
import gensim

## repeated printouts and wide-format text
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
pd.set_option('display.max_colwidth', None)

from collections import Counter

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import pandas as pd

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/orenpoleshuckkinel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/orenpoleshuckkinel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m17.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
feis = pd.read_excel('Dartmouth FEIS Data.xlsx')

In [9]:
feis.columns

Index(['Respondent ID #  (SIRS Local ID)', 'Start Date', 'End Date',
       'What services does your family member currently receive?  Check all that apply',
       'If other, please describe.',
       'Where does your family member receive mental health services?',
       'If other, please describe', 'Your relationship',
       'If other, please describe.1',
       'Does (name of individual) continue to live with you?',
       'If no, where does he/she live now?',
       'In the past year, how difficult has it been caring for your family member?',
       'What\nis the total number of people, including yourself, presently living in your\nhousehold?  Please indicate everyone who\nlives with you at least half of the time - Parents/Step-parents',
       'What\nis the total number of people, including yourself, presently living in your\nhousehold?  Please indicate everyone who\nlives with you at least half of the time - Siblings of (name)',
       'What\nis the total number of people, incl

In [25]:
#subset data for topic modeling
advice = pd.DataFrame()
advice['feedback'] = feis['What\nadvice would you give to service planners regarding the mental health service\nneeds of persons with IDD and their families?']

# Removing NaN values from the 'feedback' column
advice = advice.dropna(subset=['feedback'])

advice

Unnamed: 0,feedback
0,None provided
1,PROVDE REFERRALS FOR EMERGENCY RESPITE SERVICES; JOB SUPPORT FOR ID CONSUMERS
5,Resommendations pertinent to treatment
7,would like consultation with family before announcing prognosis to consumer
9,nothing. they are doing the best that they can. It would be good for him to get out of the house more. we need help with transportation to help us get back into respite.
...,...
1933,I think they do a good job if the person with the disability lets them.
1934,They are doing a good job.
1936,"Medications may be helpful, keeping people busy helps a lot!"
1937,Take the time to get to know and give him a chance


In [26]:
## restrict to alpha

df_preprocessed = ''.join(char for item in advice['feedback'] for char in str(item) if char.isalpha() or char.isspace())

In [27]:
## part of speech tagging

tokens = word_tokenize(df_preprocessed) # Generate list of tokens
tokens_pos = pos_tag(tokens) # generate part of speech tags for those tokens
tokens_pos

[('None', 'NN'),
 ('providedPROVDE', 'NN'),
 ('REFERRALS', 'NNP'),
 ('FOR', 'NNP'),
 ('EMERGENCY', 'NNP'),
 ('RESPITE', 'NNP'),
 ('SERVICES', 'NNP'),
 ('JOB', 'NNP'),
 ('SUPPORT', 'NNP'),
 ('FOR', 'NNP'),
 ('ID', 'NNP'),
 ('CONSUMERSResommendations', 'NNP'),
 ('pertinent', 'NN'),
 ('to', 'TO'),
 ('treatmentwould', 'VB'),
 ('like', 'IN'),
 ('consultation', 'NN'),
 ('with', 'IN'),
 ('family', 'NN'),
 ('before', 'IN'),
 ('announcing', 'VBG'),
 ('prognosis', 'NN'),
 ('to', 'TO'),
 ('consumernothing', 'VBG'),
 ('they', 'PRP'),
 ('are', 'VBP'),
 ('doing', 'VBG'),
 ('the', 'DT'),
 ('best', 'JJS'),
 ('that', 'IN'),
 ('they', 'PRP'),
 ('can', 'MD'),
 ('It', 'PRP'),
 ('would', 'MD'),
 ('be', 'VB'),
 ('good', 'JJ'),
 ('for', 'IN'),
 ('him', 'PRP'),
 ('to', 'TO'),
 ('get', 'VB'),
 ('out', 'IN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('house', 'NN'),
 ('more', 'RBR'),
 ('we', 'PRP'),
 ('need', 'VBP'),
 ('help', 'VB'),
 ('with', 'IN'),
 ('transportation', 'NN'),
 ('to', 'TO'),
 ('help', 'VB'),
 ('us', 'PR

In [28]:
# All adjectives
all_adjectives = [one_tok[0] for one_tok in tokens_pos 
                if one_tok[1] == "JJS" or 
               one_tok[1] == "JJR" or
               one_tok[1] == "JJ"]

adjectives_count = Counter(all_adjectives)

adjectives_count

Counter({'more': 147,
         'available': 61,
         'mental': 24,
         'individual': 23,
         'other': 20,
         'better': 18,
         'many': 17,
         'sure': 15,
         'good': 12,
         'respite': 11,
         'hard': 11,
         'important': 11,
         'different': 11,
         'More': 11,
         'enough': 11,
         'helpful': 10,
         'medical': 10,
         'accessible': 9,
         'difficult': 8,
         'able': 8,
         'psychiatric': 8,
         'best': 7,
         'possible': 7,
         'willing': 7,
         'open': 7,
         'clear': 7,
         'same': 7,
         'new': 6,
         'much': 6,
         'aware': 6,
         'due': 6,
         'own': 6,
         'specific': 6,
         'social': 6,
         'inhome': 5,
         'rural': 5,
         'appropriate': 5,
         'dont': 5,
         'little': 5,
         'patient': 5,
         'high': 5,
         'unique': 4,
         'strong': 4,
         'timely': 4,
         'love

In [29]:
# Create a DataFrame with the adjectives and their counts
adjectives_df = pd.DataFrame(adjectives_count.items(), columns=['Adjective', 'Count'])

# Sort adjectives by count in descending order
adjectives_df = adjectives_df.sort_values(by='Count', ascending=False)

# Print the top 5 most frequent adjectives
print(adjectives_df.head(5))

     Adjective  Count
4         more    147
17   available     61
85      mental     24
36  individual     23
11       other     20


## Sentiment Analysis

In [30]:
advice

Unnamed: 0,feedback
0,None provided
1,PROVDE REFERRALS FOR EMERGENCY RESPITE SERVICES; JOB SUPPORT FOR ID CONSUMERS
5,Resommendations pertinent to treatment
7,would like consultation with family before announcing prognosis to consumer
9,nothing. they are doing the best that they can. It would be good for him to get out of the house more. we need help with transportation to help us get back into respite.
...,...
1933,I think they do a good job if the person with the disability lets them.
1934,They are doing a good job.
1936,"Medications may be helpful, keeping people busy helps a lot!"
1937,Take the time to get to know and give him a chance


In [31]:
analyzer = SentimentIntensityAnalyzer()

def process_advice(advice):
    advice_str = str(advice)
    advice_clean = re.sub(r'\b[A-Z][a-z]*\b', '', advice_str)
    
    sentiment_scores = analyzer.polarity_scores(advice_clean)
    
    return sentiment_scores

advice['sentiment'] = advice['feedback'].apply(process_advice)
advice

Unnamed: 0,feedback,sentiment
0,None provided,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}"
1,PROVDE REFERRALS FOR EMERGENCY RESPITE SERVICES; JOB SUPPORT FOR ID CONSUMERS,"{'neg': 0.182, 'neu': 0.629, 'pos': 0.189, 'compound': 0.0258}"
5,Resommendations pertinent to treatment,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound': 0.0}"
7,would like consultation with family before announcing prognosis to consumer,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'compound': 0.3612}"
9,nothing. they are doing the best that they can. It would be good for him to get out of the house more. we need help with transportation to help us get back into respite.,"{'neg': 0.0, 'neu': 0.694, 'pos': 0.306, 'compound': 0.9147}"
...,...,...
1933,I think they do a good job if the person with the disability lets them.,"{'neg': 0.0, 'neu': 0.818, 'pos': 0.182, 'compound': 0.4404}"
1934,They are doing a good job.,"{'neg': 0.0, 'neu': 0.58, 'pos': 0.42, 'compound': 0.4404}"
1936,"Medications may be helpful, keeping people busy helps a lot!","{'neg': 0.0, 'neu': 0.552, 'pos': 0.448, 'compound': 0.69}"
1937,Take the time to get to know and give him a chance,"{'neg': 0.0, 'neu': 0.833, 'pos': 0.167, 'compound': 0.25}"


In [40]:
advice['negative'] = advice['sentiment'].apply(lambda x: x['neg'])
advice['neutral'] = advice['sentiment'].apply(lambda x: x['neu'])
advice['positive'] = advice['sentiment'].apply(lambda x: x['pos'])
advice['compound'] = advice['sentiment'].apply(lambda x: x['compound'])

advice_wscore = advice[['feedback',  
                                'negative', 
                                'positive', 
                                'neutral', 
                                'compound']].sort_values(by='positive', ascending=False)

top_10_pos_press_releases = advice_wscore.head(10)
top_10_pos_press_releases[['feedback', 'negative', 'neutral', 'positive', 'compound']]

Unnamed: 0,feedback,negative,neutral,positive,compound
405,Not sure.,0.0,0.0,1.0,0.3182
1502,Be kind,0.0,0.0,1.0,0.5267
399,Not sure,0.0,0.0,1.0,0.3182
1220,Respite care,0.0,0.0,1.0,0.4939
787,"Be responsive, be helpful, be kind",0.0,0.187,0.813,0.8271
1284,Be more compassionate,0.0,0.223,0.777,0.5413
402,easier navigation,0.0,0.263,0.737,0.4215
835,Provide more supports,0.0,0.264,0.736,0.4201
1884,"need support, help, direction",0.0,0.27,0.73,0.6597
889,To seek help,0.0,0.27,0.73,0.4019


## Topic Modeling

In [49]:
custom_stopwords = ['sure']

In [50]:
def preprocess_text(text, custom_stopwords):
    text = str(text)
    text = text.lower()

    words = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    stop_words.update(custom_stopwords)

    stemmer = SnowballStemmer('english')
    
    processed_words = [
        stemmer.stem(word) for word in words
        if word.isalpha() and
        word not in stop_words
        and len(word) >= 4
    ]

    return ' '.join(processed_words)

In [51]:
advice_wscore['processed_text'] = advice_wscore['feedback'].apply(lambda x: preprocess_text(x, custom_stopwords))

In [52]:
advice_wscore

Unnamed: 0,feedback,negative,positive,neutral,compound,processed_text
405,Not sure.,0.0,1.000,0.000,0.3182,
1502,Be kind,0.0,1.000,0.000,0.5267,kind
399,Not sure,0.0,1.000,0.000,0.3182,
1220,Respite care,0.0,1.000,0.000,0.4939,respit care
787,"Be responsive, be helpful, be kind",0.0,0.813,0.187,0.8271,respons help kind
...,...,...,...,...,...,...
952,listen to family members as they are the most informed,0.0,0.000,1.000,0.0000,listen famili member inform
955,have patience and perservere,0.0,0.000,1.000,0.0000,patienc perserver
957,Providers feel it is on the parents or school to figure everything out,0.0,0.000,1.000,0.0000,provid feel parent school figur everyth
313,"To be aware of his behavior, What to look for",0.0,0.000,1.000,0.0000,awar behavior look


In [None]:
## Create DTM from the preprocessed feedback and to explore top words

In [54]:
def create_dtm(list_of_strings, metadata):
    vectorizer = CountVectorizer(lowercase = True)
    dtm_sparse = vectorizer.fit_transform(list_of_strings)
    dtm_dense_named = pd.DataFrame(dtm_sparse.todense(), 
        columns=vectorizer.get_feature_names_out())
    dtm_dense_named_withid = pd.concat([metadata.reset_index(), dtm_dense_named], axis = 1)
    return(dtm_dense_named_withid)

In [65]:
dtm = create_dtm(advice_wscore['processed_text'], metadata = advice_wscore[['compound', 'processed_text']])

def get_topwords(dtm, column_name, n_top = 10):
    text_concat = ' '.join(dtm[column_name])
    words = text_concat.split()
    word_counts = Counter(words)
    top_words = word_counts.most_common(n_top)
    return top_words

# Part B
top_5_percent_threshold = advice_wscore['compound'].quantile(0.95)
top_5_percent_dtm = advice_wscore[advice_wscore['compound'] >= top_5_percent_threshold]
top_5_percent_positive_words = get_topwords(top_5_percent_dtm, 'processed_text')
top_5_percent_positive_words

# Part C
bottom_5_percent_threshold = advice_wscore['compound'].quantile(0.05)
bottom_5_percent_dtm = advice_wscore[advice_wscore['compound'] <= bottom_5_percent_threshold]
bottom_5_percent_positive_words = get_topwords(bottom_5_percent_dtm, 'processed_text')
bottom_5_percent_positive_words

[('famili', 33),
 ('need', 33),
 ('help', 27),
 ('support', 25),
 ('provid', 22),
 ('care', 20),
 ('servic', 15),
 ('better', 12),
 ('would', 12),
 ('avail', 12)]

[('servic', 30),
 ('crisi', 29),
 ('famili', 21),
 ('need', 19),
 ('frustrat', 8),
 ('individu', 8),
 ('train', 7),
 ('provid', 7),
 ('intervent', 5),
 ('avail', 5)]

## Extend analysis from unigrams to bigrams

In [57]:
def create_bigram_onedoc(processed_text):
    if not isinstance(processed_text, str):
        return ""  

    words = processed_text.split()
    bigrams = ["_".join(pair) for pair in zip(words, words[1:])]
    return " ".join(bigrams)

advice_wscore['processed_text_bigrams'] = advice_wscore['processed_text'].apply(create_bigram_onedoc)

405                                                                     
1502                                                                    
399                                                                     
1220                                                         respit_care
787                                               respons_help help_kind
                                      ...                               
952                            listen_famili famili_member member_inform
955                                                    patienc_perserver
957     provid_feel feel_parent parent_school school_figur figur_everyth
313                                          awar_behavior behavior_look
0                                                            none_provid
Name: processed_text_bigrams, Length: 998, dtype: object

In [59]:
metadata = advice_wscore[['feedback', 'compound']]

list_of_strings = advice_wscore['processed_text_bigrams']

dtm_bigram = create_dtm(list_of_strings, metadata)

print("Unigram DTM dimensions:", dtm.shape)
print("Bigram DTM dimensions:", dtm_bigram.shape)

Unigram DTM dimensions: (998, 1291)
Bigram DTM dimensions: (998, 4605)


Unnamed: 0,index,feedback,compound,abil_communic,abl_come,abl_get,abl_help,abl_know,abl_meet,abl_realli,...,year_find,year_hope,year_longer,year_start,york_need,york_option,young_adult,young_child,young_children,zero_confid
0,405,Not sure.,0.3182,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1502,Be kind,0.5267,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,399,Not sure,0.3182,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1220,Respite care,0.4939,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,787,"Be responsive, be helpful, be kind",0.8271,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,952,listen to family members as they are the most informed,0.0000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
994,955,have patience and perservere,0.0000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
995,957,Providers feel it is on the parents or school to figure everything out,0.0000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
996,313,"To be aware of his behavior, What to look for",0.0000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
dtm_bigram = dtm_bigram.apply(pd.to_numeric, errors='coerce')
print(dtm_bigram.dtypes)

dtm_bigram = dtm_bigram.fillna(0)

def get_topwords(dtm, n_topwords):
    top_words = {}
    for col in dtm.columns:
        top = dtm[col].nlargest(n_topwords).index.tolist()
        top_words[col] = top
    return top_words

index               int64
feedback          float64
compound          float64
abil_communic       int64
abl_come            int64
                   ...   
york_option         int64
young_adult         int64
young_child         int64
young_children      int64
zero_confid         int64
Length: 4605, dtype: object


In [69]:
top_bigrams = get_topwords(dtm_bigram, 'processed_text_bigrams')
top_bigrams

TypeError: '<=' not supported between instances of 'str' and 'int'