## Import

In [23]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import timedelta
from tabulate import tabulate
from scipy.stats import chi2_contingency
import statsmodels.api as sm

In [25]:
# Project root: the only machine-specific value in this notebook.
# Change this one line when the project is moved or run on another machine.
PROJECT_ROOT = '/Users/nataschajademinnitt/Documents/5. Data Analysis/ipv_screening/'

# Folder paths derived from the project root. The trailing '/' is kept because
# file names are appended to these with plain string concatenation.
data_path_raw = PROJECT_ROOT + 'data/raw/'
data_path_processed = PROJECT_ROOT + 'data/processed/'
figures_path = PROJECT_ROOT + 'results/figures/'
tables_path = PROJECT_ROOT + 'results/tables/'

# Set style
sns.set_theme(style="whitegrid", context="paper")

# Load data
# low_memory=False makes pandas read the file in one pass instead of in chunks.
raw = pd.read_csv(data_path_processed + 'cleaned_data.csv', low_memory=False)

In [27]:
# Convert columns to category or date/time dtypes.
# Work on a copy so that `raw` keeps the data exactly as loaded.
clean = raw.copy()

# Columns to store as 'category', selected by position.
# NOTE(review): positional ranges depend on the column order of
# cleaned_data.csv -- confirm them if that file's layout changes.
columns_to_convert = (
    clean.columns[1:28].tolist()
    + clean.columns[30:32].tolist()
    + clean.columns[34:37].tolist()
)
clean[columns_to_convert] = clean[columns_to_convert].astype('category')

# Parse the date columns into datetime values
for date_column in ('createdDate', 'date_only'):
    clean[date_column] = pd.to_datetime(clean[date_column])

# Store 'weekday' as an ordered categorical running Monday -> Friday.
# Values not listed in `day_order` become NaN in the converted column.
day_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
clean['weekday'] = pd.Categorical(clean['weekday'], categories=day_order, ordered=True)

In [29]:
# Creating separate datasets for ease of analysis.
# Row masks are built once and reused for the combined subsets.
# NOTE(review): the apostrophe in the Mitchell's Plain label looks mis-encoded
# here; confirm it matches the values stored in `location`, otherwise `mpdh`
# and `mpdh_yes` will be empty.
at_gsh = clean['location'] == 'Groote Schuur Hospital'
at_mpdh = clean['location'] == 'Mitchell‚Äôs Plain District Hospital'
gave_consent = clean['consent'] == 'yes'
declined_consent = clean['consent'] == 'no'

# By hospital
gsh = clean[at_gsh]
mpdh = clean[at_mpdh]

# By consent
consent_yes = clean[gave_consent]
consent_no = clean[declined_consent]

# Consenting patients within each hospital
gsh_yes = clean[at_gsh & gave_consent]
mpdh_yes = clean[at_mpdh & gave_consent]

## Feeling safe and IPV outcome

### At home

In [33]:
# Keep consenting patients whose 'ipv_outcome' is anything other than 'refused'
not_refused = consent_yes['ipv_outcome'] != 'refused'
filtered_data = consent_yes[not_refused]

# Contingency table: screening outcome (rows) x feeling safe at home (columns)
home_table = pd.crosstab(filtered_data['ipv_outcome'], filtered_data['feel_safe_at_home'])

# Chi-square test of independence on the table; returns the statistic,
# p-value, degrees of freedom and the expected frequencies.
home_result = chi2_contingency(home_table)
chi2_home, p_home, dof_home, expected_home = home_result

print("\nHome Safety vs IPV Outcome:")
print(f"Chi-Square Statistic: {chi2_home}")
print(f"P-value: {p_home}")


Home Safety vs IPV Outcome:
Chi-Square Statistic: 443.7662833908494
P-value: 1.6397047012848733e-98


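These results provide very strong evidence of a significant association between feeling safe at home and IPV outcome; the corresponding test for feeling safe in the community (see "In community" below) gives equally strong evidence. Individuals who do not feel safe in their community or at home are significantly more likely to have a positive IPV screening outcome; the chi-square test itself only establishes that an association exists, and its direction and size are quantified by the logistic regressions that follow.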

In [39]:
# Logistic regression: does feeling safe at home predict a positive IPV screen?

# Keep consenting patients with a definite screening result ('pos' or 'neg')
# and a recorded answer to the home-safety question. `.copy()` makes this an
# independent frame, so the columns added below are not written onto a slice
# of `consent_yes` (which would raise pandas' SettingWithCopyWarning).
filtered_data = consent_yes[
    consent_yes['ipv_outcome'].isin(['pos', 'neg']) &
    consent_yes['feel_safe_at_home'].notna()
].copy()

# Recode ipv_outcome to binary (1 = 'pos', 0 = 'neg')
filtered_data['ipv_outcome_binary'] = (filtered_data['ipv_outcome'] == 'pos').astype(int)

# Recode feel_safe_at_home to binary (1 = 'yes', 0 = any other recorded answer)
# NOTE(review): every non-missing answer other than 'yes' is coded 0 -- confirm
# that the column holds no third category that should be excluded instead.
filtered_data['feel_safe_binary'] = (filtered_data['feel_safe_at_home'] == 'yes').astype(int)

# Define predictors and outcome (`sm` is imported in the first cell)
X = sm.add_constant(filtered_data['feel_safe_binary'])  # Add intercept column
y = filtered_data['ipv_outcome_binary']  # Binary outcome (0 = neg, 1 = pos)

# Fit logistic regression model
logit_model = sm.Logit(y, X).fit()

# Display the summary
print(logit_model.summary())

# Odds ratio = exp(coefficient). The percentage change is computed from the
# unrounded value so that rounding the odds ratio for display does not distort it.
odds_ratio_exact = np.exp(logit_model.params['feel_safe_binary'])
odds_ratio = round(odds_ratio_exact, 2)
percentage_change = round((odds_ratio_exact - 1) * 100, 1)

# Label the change by its sign: an odds ratio above 1 is an increase, not a reduction.
change_direction = 'Reduction' if percentage_change <= 0 else 'Increase'

print(f"Odds Ratio: {odds_ratio}")
print(f"Percentage {change_direction} in Odds of Positive IPV Screening: {abs(percentage_change)}%")

Optimization terminated successfully.
         Current function value: 0.295847
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:     ipv_outcome_binary   No. Observations:                14548
Model:                          Logit   Df Residuals:                    14546
Method:                           MLE   Df Model:                            1
Date:                Fri, 06 Dec 2024   Pseudo R-squ.:                 0.03558
Time:                        13:09:37   Log-Likelihood:                -4304.0
converged:                       True   LL-Null:                       -4462.8
Covariance Type:            nonrobust   LLR p-value:                 4.961e-71
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -0.9629      0.070    -13.731      0.000      -1.100      -0.825
feel_safe_b

The logistic regression results indicate a significant relationship between feeling safe at home and a positive IPV screening outcome. The odds ratio is 0.4, meaning that patients who feel safe at home have 60% lower odds of screening positive for IPV than those who do not feel safe.

Patients who feel safe where they live are significantly less likely to screen positively for IPV ($p < 0.001$). The odds of a positive IPV screening decrease by 60% for individuals who report feeling safe compared to those who do not. This finding highlights the potential predictive value of perceived community and home safety in IPV risk assessments.

### In community

In [31]:
# Keep consenting patients whose 'ipv_outcome' is anything other than 'refused'
not_refused = consent_yes['ipv_outcome'] != 'refused'
filtered_data = consent_yes[not_refused]

# Contingency table: screening outcome (rows) x feeling safe where you live (columns)
community_table = pd.crosstab(filtered_data['ipv_outcome'], filtered_data['feel_safe_where_you_live'])

# Chi-square test of independence on the table; returns the statistic,
# p-value, degrees of freedom and the expected frequencies.
community_result = chi2_contingency(community_table)
chi2_com, p_com, dof_com, expected_com = community_result

print("Community Safety vs IPV Outcome:")
print(f"Chi-Square Statistic: {chi2_com}")
print(f"P-value: {p_com}")

Community Safety vs IPV Outcome:
Chi-Square Statistic: 200.8521267437588
P-value: 1.3610707399934049e-45


In [37]:
# Logistic regression: does feeling safe where you live predict a positive IPV screen?

# Keep consenting patients with a definite screening result ('pos' or 'neg')
# and a recorded answer to the community-safety question. `.copy()` makes this
# an independent frame, so the columns added below are not written onto a slice
# of `consent_yes` (which would raise pandas' SettingWithCopyWarning).
filtered_data = consent_yes[
    consent_yes['ipv_outcome'].isin(['pos', 'neg']) &
    consent_yes['feel_safe_where_you_live'].notna()
].copy()

# Recode ipv_outcome to binary (1 = 'pos', 0 = 'neg')
filtered_data['ipv_outcome_binary'] = (filtered_data['ipv_outcome'] == 'pos').astype(int)

# Recode feel_safe_where_you_live to binary (1 = 'yes', 0 = any other recorded answer)
# NOTE(review): every non-missing answer other than 'yes' is coded 0 -- confirm
# that the column holds no third category that should be excluded instead.
filtered_data['feel_safe_binary'] = (filtered_data['feel_safe_where_you_live'] == 'yes').astype(int)

# Define predictors and outcome (`sm` is imported in the first cell)
X = sm.add_constant(filtered_data['feel_safe_binary'])  # Add intercept column
y = filtered_data['ipv_outcome_binary']  # Binary outcome (0 = neg, 1 = pos)

# Fit logistic regression model
logit_model = sm.Logit(y, X).fit()

# Display the summary
print(logit_model.summary())

# Odds ratio = exp(coefficient). The percentage change is computed from the
# unrounded value so that rounding the odds ratio for display does not distort it.
odds_ratio_exact = np.exp(logit_model.params['feel_safe_binary'])
odds_ratio = round(odds_ratio_exact, 2)
percentage_change = round((odds_ratio_exact - 1) * 100, 1)

# Label the change by its sign: an odds ratio above 1 is an increase, not a reduction.
change_direction = 'Reduction' if percentage_change <= 0 else 'Increase'

print(f"Odds Ratio: {odds_ratio}")
print(f"Percentage {change_direction} in Odds of Positive IPV Screening: {abs(percentage_change)}%")

Optimization terminated successfully.
         Current function value: 0.300880
         Iterations 6
                           Logit Regression Results                           
Dep. Variable:     ipv_outcome_binary   No. Observations:                14548
Model:                          Logit   Df Residuals:                    14546
Method:                           MLE   Df Model:                            1
Date:                Fri, 06 Dec 2024   Pseudo R-squ.:                 0.01917
Time:                        13:08:50   Log-Likelihood:                -4377.2
converged:                       True   LL-Null:                       -4462.8
Covariance Type:            nonrobust   LLR p-value:                 4.255e-39
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const               -1.5580      0.057    -27.275      0.000      -1.670      -1.446
feel_safe_b

# Export

In [None]:
# Export screening counts: one Excel workbook, one sheet per summary table.
# NOTE(review): the count_* frames and `refusal_counts` are not defined in the
# visible part of this notebook -- confirm they exist before running this cell.
output_path = tables_path + 'diagnostic tests.xlsx'

with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
    # Sheet name -> table; sheets are written in the order listed here
    tables_by_sheet = {
        'Location': count_location,
        'Site': count_site,
        'Weekday': count_weekday,
        'Month': count_month,
        'Staff': count_staff,
        'Screening days': count_days,
        'Refusal reasons': refusal_counts,
    }
    for sheet_label, table in tables_by_sheet.items():
        # index=False leaves the DataFrame index out of the sheet
        table.to_excel(writer, sheet_name=sheet_label, index=False)