In [1]:
import pandas as pd

# Load the Excel file
file_path = '../data/Dartmouth Data Set- SIRS .xlsx'
xls = pd.ExcelFile(file_path)

# Display sheet names to understand the structure of the file
sheet_names = xls.sheet_names
sheet_names

['Demographics',
 'ABC Data',
 'RSQ Data',
 'Medciation Review Data',
 'Diagnostic Data',
 'Presenting Problems',
 'Emergency Crisis Services',
 'living Situation change',
 'ED Use',
 'Law Enforcement',
 'In-patient admissions',
 'START Tool Dates',
 'Sheet12']

In [2]:
# Load the relevant sheets
demographics_df = pd.read_excel(file_path, sheet_name='Demographics')
diagnostic_data_df = pd.read_excel(file_path, sheet_name='Diagnostic Data')
presenting_problems_df = pd.read_excel(file_path, sheet_name='Presenting Problems')

# Display the first few rows of each dataframe to understand their structure and identify the ID columns
demographics_df.head(), diagnostic_data_df.head(), presenting_problems_df.head()

(   Local ID Date Enrolled in START  Status Status Date  \
 0  11128335             2010-07-01  Active  2010-07-22   
 1  11134104             2011-11-01  Active  2011-11-30   
 2  11138520             2012-05-01  Active  2012-05-21   
 3  11132094             2013-05-01  Active  2013-05-09   
 4   975134C             2014-07-01  Active  2014-07-01   
 
                        Time Enrolled in START  \
 0  Business Hours (Monday - Friday 8am - 5pm)   
 1  Business Hours (Monday - Friday 8am - 5pm)   
 2  Business Hours (Monday - Friday 8am - 5pm)   
 3  Business Hours (Monday - Friday 8am - 5pm)   
 4  Business Hours (Monday - Friday 8am - 5pm)   
 
         Source of referral to START Suitability of enrollment in START  \
 0  Case Manager/Service Coordinator                        Appropriate   
 1  Case Manager/Service Coordinator                        Appropriate   
 2  Case Manager/Service Coordinator                        Appropriate   
 3  Case Manager/Service Coordinator      

In [3]:
# Standardize ID column names
demographics_df.rename(columns={'Local ID': 'Client_ID'}, inplace=True)
diagnostic_data_df.rename(columns={'Client Local Id': 'Client_ID'}, inplace=True)
presenting_problems_df.rename(columns={'Client Local Id': 'Client_ID'}, inplace=True)

# Merge dataframes on 'Client_ID'
merged_df = demographics_df.merge(diagnostic_data_df, on='Client_ID', how='inner') \
    .merge(presenting_problems_df, on='Client_ID', how='inner')

# Display the first few rows of the merged dataframe to verify the merge
merged_df.shape

(70916, 49)

In [4]:
print(merged_df['Current living situation at enrollment to START'].nunique())
print(merged_df['Diagnosis'].nunique())
print(merged_df['Type'].nunique())

102
1319
3


In [5]:
living_sit_value_counts = merged_df['Current living situation at enrollment to START'].value_counts()
print(living_sit_value_counts)

# Examining top race categories (by proprotion)
living_sit_norm = merged_df['Current living situation at enrollment to START'].value_counts(normalize = True)

# Calculating cumulative sum of proportions and showing top 30
living_sit_norm_cumsum = living_sit_norm.cumsum().head(30)
print(living_sit_norm_cumsum)

Current living situation at enrollment to START
Family home                          37353
Group home                           16644
Alternative Family Living (AFL)       3496
Supported living                      2383
Independent living                    2013
                                     ...  
Other: Polinsky                          2
Other: Adoptive Home                     2
Other: CPEP                              1
Other: Currently living at the RC        1
Other: juvenile dentention center        1
Name: count, Length: 102, dtype: int64
Current living situation at enrollment to START
Family home                                                0.526722
Group home                                                 0.761422
Alternative Family Living (AFL)                            0.810720
Supported living                                           0.844323
Independent living                                         0.872709
Community ICF/DD                                    

In [6]:
# Top 95% of values in current living situation
def categorize_living_sit(living_sit):
    categories = {
        'Family Home': ['Family Home', 'Other: Bouncing between family homes'],
        'Group home': ['Group home', 'Other: Acute crisis group home'],
        'Alternative Family Living (AFL)': ['Alternative Family Living (AFL)'],
        'Supported Living': ['Supported Living'],
        'Independent living': ['Independent living'],
        'Community ICF/DD': ['Community ICF/DD'],
        'Homeless, sheltered': ['Homeless, sheltered'],
        'Psychiatric hospital': ['Psychiatric hospital', 'Other: In-patient', 'Other: in-patient psychiatric unit of hospital', 'Other: Mercy Hospital Behavioral Heath Unit'],
        'Supervised apartment': ['Supervised apartment'],
    }

    for category, labels in categories.items():
        if living_sit in labels:
            return category
    return 'Other'

In [7]:
# Applying function to data
merged_df['living_situation_clean'] = merged_df['Current living situation at enrollment to START'].apply(categorize_living_sit)
print(merged_df.living_situation_clean.value_counts())

living_situation_clean
Other                              45878
Group home                         16704
Alternative Family Living (AFL)     3496
Community ICF/DD                    1825
Psychiatric hospital                1487
Supervised apartment                1014
Homeless, sheltered                  512
Name: count, dtype: int64


In [8]:
# Taking top 96% of data
def categorize_race(race):
    categories = {
        'White': ['White', 'Other: Middle Eastern'],
        'Black/African American': ['Black or African American'],
        'Other/Unknown': ['Unknown, not collected', 'Other', 'Unknown, not collected, White', 'Other: Biracial'],
        'Hispanic/Latinx': ['Other: Hispanic', 'Other: Latino', 'Other: hispanic', 'Other: Mexican'],
        'Asian/Pacific Islander': ['Asian', 'Native Hawaiian or Other Pacific Islander'],
        'Mixed': ['Black or African American, White', 'Asian, White'],
        'American Indian or Alaska Native': ['American Indian or Alaska Native'],
    }

    for category, labels in categories.items():
        if race in labels:
            return category
    return 'Other/Unknown'

In [9]:
# Applying function to data
merged_df['race_clean'] = merged_df['Race'].apply(categorize_race)
merged_df.race_clean.value_counts()

race_clean
White                               45621
Black/African American              13435
Other/Unknown                        6527
Hispanic/Latinx                      2718
Asian/Pacific Islander               1552
Mixed                                 624
American Indian or Alaska Native      439
Name: count, dtype: int64

In [10]:
gender_value_counts = merged_df['Gender'].value_counts()
print(gender_value_counts)

# Examining top race categories (by proprotion)
gender_norm = merged_df['Gender'].value_counts(normalize = True)

# Calculating cumulative sum of proportions and showing top 30
gender_norm_cumsum = gender_norm.cumsum().head(30)
print(gender_norm_cumsum)

Gender
Male                                                                   45957
Female                                                                 24715
Other:                                                                    40
Other: Born male, unsure of gender identity at this time                  35
Other: not sure yet as to who Sam identifies with                         24
Other: Transgender female                                                 23
Other: currently exploring male and female gender identities              20
Other: biologically female. self describes as male                        18
Other: non binary                                                         15
Other: Born male/ he/ him, Currently identifies as female/ she/ her       15
Other: Identifies as female                                               12
Other: transgender female                                                 12
Other: Desires to transition from male to female                     

In [11]:
# Taking top 96% of data
def categorize_gender(gender):
    categories = {
        'Male': 'Male',
        'Female': 'Female'
    }

    for category, labels in categories.items():
        if gender in labels:
            return category
    return 'Other'

# Applying function to data
merged_df['gender_clean'] = merged_df['Gender'].apply(categorize_gender)
merged_df.gender_clean.value_counts()

gender_clean
Male      45957
Female    24715
Other       244
Name: count, dtype: int64

In [12]:
from datetime import datetime
import pandas as pd

def categorize_before_covid(status_date):
    # Define the cutoff date
    cutoff_date = datetime(2020, 3, 13)
    
    # Ensure status_date is in datetime format and convert to YYYY-MM-DD
    status_date_obj = pd.to_datetime(status_date).strftime('%Y-%m-%d')
    
    # Convert status_date_obj back to datetime for comparison
    status_date_obj = datetime.strptime(status_date_obj, '%Y-%m-%d')
    
    # Compare the status date with the cutoff date
    if status_date_obj < cutoff_date:
        return True
    else:
        return False

# Assuming 'Status Date' is in string format initially
merged_df['Status Date'] = pd.to_datetime(merged_df['Status Date'], errors='coerce').dt.strftime('%Y-%m-%d')

# Apply the function to the DataFrame
merged_df['before_covid'] = merged_df['Status Date'].apply(categorize_before_covid)
merged_df.before_covid.value_counts()

before_covid
False    45216
True     25700
Name: count, dtype: int64

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

regression_problem = 'Aggression'

# Convert 'Date Enrolled in START' to datetime and create new features from it
merged_df['Date Enrolled in START'] = pd.to_datetime(merged_df['Date Enrolled in START'], errors='coerce')

# Create a binary target variable for suicidal ideation
merged_df['regression_problem'] = merged_df['Presenting Problems at Enrollment'].str.contains(regression_problem, na=False).astype(int)

# Select features for the regression
features = [
    'before_covid',
    'living_situation_clean', 
    'race_clean',
    'gender_clean'
]

# One-hot encode categorical features
encoded_df = pd.get_dummies(merged_df[features], drop_first=True)

# Combine encoded features with the target variable
regression_df = pd.concat([encoded_df, merged_df['regression_problem']], axis=1)

# Split the data into training and testing sets
X = regression_df.drop(columns='regression_problem')
y = regression_df['regression_problem']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a linear regression model
linreg = LinearRegression()
linreg.fit(X_train, y_train)

# Make predictions
y_pred = linreg.predict(X_test)

# Calculate mean squared error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

# Get the coefficients and feature names
coefficients = linreg.coef_
feature_names = X.columns

# Calculate Pearson correlation coefficients
correlations = []
for feature in feature_names:
    correlation = np.corrcoef(X[feature], y)[0, 1]
    correlations.append(correlation)

# Create a DataFrame to display feature importance and correlations
feature_importance = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients,
    'R-value': correlations
}).sort_values(by='Coefficient', ascending=False)

print(f"Feature Impact on {regression_problem} in people with IDD in START program:")
feature_importance

Mean Squared Error: 0.17059778262254913
R^2 Score: 0.0020486862003067374
Feature Impact on Aggression in people with IDD in START program:


Unnamed: 0,Feature,Coefficient,R-value
1,living_situation_clean_Community ICF/DD,0.028381,0.011352
10,race_clean_Mixed,0.018627,0.003576
13,gender_clean_Male,0.016197,0.023296
2,living_situation_clean_Group home,0.013871,0.005054
0,before_covid,0.006958,0.00658
4,living_situation_clean_Other,0.005639,-0.001274
6,living_situation_clean_Supervised apartment,0.005119,-0.002312
8,race_clean_Black/African American,0.002555,0.011172
11,race_clean_Other/Unknown,0.002399,0.00675
7,race_clean_Asian/Pacific Islander,0.001147,0.002531


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Load the data
covid_cases_df = pd.read_excel('../data/New_cases_nh.xlsx')

# Convert 'Date Enrolled in START' and 'Date' to datetime
covid_cases_df['Date'] = pd.to_datetime(covid_cases_df['Date'], errors='coerce')

# Map COVID cases to merged_df based on the date
covid_cases_dict = dict(zip(covid_cases_df['Date'], covid_cases_df['New cases']))
merged_df['covid_cases'] = merged_df['Date Enrolled in START'].map(covid_cases_dict)

# Define the independent variable (COVID cases) and dependent variables
independent_variable = 'covid_cases'
dependent_variables = [
    'before_covid',
    'living_situation_clean', 
    'race_clean',
    'gender_clean'
]

# One-hot encode categorical dependent variables
encoded_df = pd.get_dummies(merged_df[dependent_variables], drop_first=True)

# Combine encoded features with the independent variable
regression_df = pd.concat([encoded_df, merged_df[[independent_variable]]], axis=1).dropna()

# Split the data into training and testing sets
X = regression_df[[independent_variable]]
y_df = regression_df.drop(columns=[independent_variable])

results = []

for dependent_var in y_df.columns:
    y = y_df[dependent_var]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Train a linear regression model
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)

    # Make predictions
    y_pred = linreg.predict(X_test)

    # Calculate mean squared error and R^2 score
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # Get the coefficient for the independent variable
    coefficient = linreg.coef_[0]

    # Calculate Pearson correlation coefficient
    correlation = np.corrcoef(X_train[independent_variable], y_train)[0, 1]

    results.append({
        'Dependent Variable': dependent_var,
        'Coefficient': coefficient,
        'Mean Squared Error': mse,
        'R^2 Score': r2,
        'R-value': correlation
    })

# Create a DataFrame to display the results
results_df = pd.DataFrame(results)

print(f"Impact of COVID cases on features in people with IDD in START program:")
results_df