In [45]:
## load packages 
import pandas as pd
import re
import numpy as np
import plotnine
from plotnine import *

## nltk imports
from nltk.tokenize import word_tokenize, wordpunct_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

## sklearn imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

## print mult things
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## random
import random

pd.set_option('display.max_colwidth', None)

In [46]:
df_sirs = pd.read_excel('Dartmouth Data Set- SIRS .xlsx')

In [47]:
xls = pd.ExcelFile('Dartmouth Data Set- SIRS .xlsx')
df1 = pd.read_excel(xls, 'Emergency Crisis Services')

In [247]:
df1.keys()
date = 'Date/Time of Contact'
hospital_diversion = "Did START response prevent higher level of care (hospital diversion)"

Index(['Individual ID', 'Date/Time of Contact', 'Source of Contact',
       'Reason for Contact', 'Type of Emergency Response',
       'Total episode length (initial call to disposition)',
       'Total response time (initial call to arrival on-site)', 'Travel Time',
       'Police Response', 'If Police Responded', 'Who Contacted Police?',
       'Mobile Crisis', 'If Mobile Crisis', 'Restraints Used',
       'If Restraints Were Used', 'Disposition', 'Disposition Information',
       'Admitted through ED/CPEP', 'Date of Admission',
       'Disposition Information (ER)',
       'Did START response prevent higher level of care (hospital diversion)',
       'RSQ Completed?'],
      dtype='object')

In [248]:
df1[hospital_diversion].value_counts()

Did START response prevent higher level of care (hospital diversion)
No     10237
Yes     7931
Name: count, dtype: int64

In [256]:
df1_clean = df1[["Individual ID", date, hospital_diversion, "Reason for Contact"]].copy()

In [257]:
df1_clean["Reason for Contact"] = df1_clean["Reason for Contact"].str.replace("Aggression (physical, verbal, property destruction, threats)", "Aggression")
df1_clean['Reason for Contact'] = df1_clean['Reason for Contact'].str.lower()
df1_clean["Reason for Contact"] = df1_clean["Reason for Contact"].str.replace(r'(other:)?\s*?(change\s*(of|in)?\s*)?mood(\s*change)?', 'mood change', regex=True)


In [258]:
df1_clean[df1_clean["Reason for Contact"].str.contains('mood')==True]

Unnamed: 0,Individual ID,Date/Time of Contact,Did START response prevent higher level of care (hospital diversion),Reason for Contact
1380,629785C,2018-05-24 12:20:00,Yes,"aggression, other: changes inmood change, leaving unexpectedly"
1629,375509,2018-03-16 18:20:00,No,"aggression, mood change"
1635,375509,2018-02-27 18:00:00,No,"mood change , self-injurious"
1638,375509,2018-04-26 16:30:00,No,"other: increase of anxiety,mood change"
1659,375509,2017-12-18 21:00:00,No,"aggression, mood change"
...,...,...,...,...
17854,906566E,2021-10-25 09:45:00,No,"aggression, family needs assistance, mood change, suicidal ideation/behaviors"
17881,6236749,2021-10-29 16:26:00,Yes,"aggression, mood change, property destruction, and threats to harm others (father)"
17895,243197C,2021-11-02 15:30:00,No,"aggression, mood change"
17933,906566E,2021-11-10 13:45:00,No,"aggression, mood change, self-injurious"


In [259]:
df1_clean['Reason for Contact'].fillna("Unknown", inplace=True)

In [260]:
def checker(df, word):
    # Create the 'is_blank' column based on whether 'Reason for Contact' contains the word
    column_name = 'is_' + str(word)
    print(column_name)
    df[column_name] = df['Reason for Contact'].str.contains(word)
    return df


In [261]:
categories = ['aggression', 'mental health symptoms', 'family needs assistance', 'suicidal', 'self-injurious', 'decrease in ability to participate in daily functions', 'risk of losing placement', 'mood change', 'medical']

df2 = df1_clean.apply(lambda row: pd.Series({f'is_{cat}': cat.lower() in row['Reason for Contact'].lower() for cat in categories}), axis=1)

In [266]:
categories = ['aggression', 'mental health symptoms', 'family needs assistance', 'suicidal', 'self-injurious', 'decrease in ability to participate in daily functions', 'risk of losing placement', 'mood', 'medical']

df2 = df1_clean.apply(lambda row: pd.Series({
    'Individual ID': row['Individual ID'],
    'Date/Time of Contact': row['Date/Time of Contact'],
    'Reason for Contact': row['Reason for Contact'],
    'Did START response prevent higher level of care (hospital diversion)': row['Did START response prevent higher level of care (hospital diversion)'],
    **{f'is_{cat}': cat.lower() in row['Reason for Contact'].lower() for cat in categories}
}), axis=1)


In [267]:
df2

Unnamed: 0,Individual ID,Date/Time of Contact,Reason for Contact,Did START response prevent higher level of care (hospital diversion),is_aggression,is_mental health symptoms,is_family needs assistance,is_suicidal,is_self-injurious,is_decrease in ability to participate in daily functions,is_risk of losing placement,is_mood,is_medical
0,11153125,2016-08-16 08:40:00,"at risk of losing placement, other, self-injurious",No,False,False,False,False,True,False,True,False,False
1,11153125,2018-04-27 15:00:00,"mental health symptoms, self-injurious",Yes,False,True,False,False,True,False,False,False,False
2,11153125,2018-06-05 16:00:00,"mental health symptoms, suicidal ideation/behaviors",Yes,False,True,False,True,False,False,False,False,False
3,11158347,2015-06-01 14:00:00,"diagnosis and treatment plan assistance, other: facilitating med ajustment w/ center",No,False,False,False,False,False,False,False,False,False
4,11158347,2015-06-01 16:00:00,"aggression, family needs assistance, mental health symptoms",No,True,True,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
18163,108482C,2021-12-03 14:05:00,"aggression, mental health symptoms",Yes,True,True,False,False,False,False,False,False,False
18164,817731C,2021-12-04 14:21:00,"family needs assistance, mental health symptoms",Yes,False,True,True,False,False,False,False,False,False
18165,471473,2021-12-19 15:00:00,mental health symptoms,No,False,True,False,False,False,False,False,False,False
18166,780879W,2021-12-17 11:00:00,"at risk of losing placement, decrease in ability to participate in daily functions, diagnosis and treatment plan assistance, mental health symptoms, self-injurious, suicidal ideation/behaviors",No,False,True,False,True,True,True,True,False,False


In [244]:
df1_clean['Reason for Contact'].str.split(', ').explode().str.lower().value_counts(normalize=True).head(30)

Reason for Contact
aggression                                                  0.325284
mental health symptoms                                      0.196120
family needs assistance                                     0.118616
suicidal ideation/behaviors                                 0.060850
self-injurious                                              0.055545
decrease in ability to participate in daily functions       0.049069
at risk of losing placement                                 0.028343
other                                                       0.020047
mood change                                                 0.019954
transition from hospital                                    0.012367
medical concerns                                            0.009345
other: leaving unexpectedly                                 0.007865
diagnosis and treatment plan assistance                     0.006415
unknown                                                     0.005706
medication eval

In [10]:
flatten
count flatten
then create or lump as other

SyntaxError: invalid syntax (2225691262.py, line 2)