In [0]:
spark


In [0]:
diabetes_path = "dbfs:/Workspace/Users/olatowoju@gmail.com/diabetic_data.csv"
hospital_path = "dbfs:/Workspace/Users/olatowoju@gmail.com/hospital.csv"
ids_path      = "dbfs:/Workspace/Users/olatowoju@gmail.com/IDS_mapping.csv"

# Read all CSVs as STRING 
diabetes_raw_df = spark.read.option("header", "true").option("inferSchema", "false").csv(diabetes_path)
hospital_df     = spark.read.option("header", "true").option("inferSchema", "false").csv(hospital_path)
ids_raw_df      = spark.read.option("header", "true").option("inferSchema", "false").csv(ids_path)


In [0]:
# Join on the string admission_type_id
diabetes_joined_df = (
    diabetes_raw_df
    .join(ids_raw_df, on="admission_type_id", how="left")
    .withColumnRenamed("description", "admission_type_desc")
)

display(
    diabetes_joined_df.select(
        "admission_type_id",
        "admission_type_desc",
        "readmitted"
    )
)


early_readmission + time_to_event columns

In [0]:
from pyspark.sql.functions import col, when

# Create survival-style columns
diabetes_surv_df = (
    diabetes_joined_df
    # 1 = early readmission (<30 days), 0 = otherwise
    .withColumn(
        "event_early_readmit",
        when(col("readmitted") == "<30", 1).otherwise(0)
    )
    # Approximate time-to-event in days
    .withColumn(
        "time_to_event",
        when(col("readmitted") == "<30", 15)   # mid-point of <30
        .when(col("readmitted") == ">30", 45)  # some later time
        .otherwise(60)                         # not readmitted (censored)
    )
)

# Quick check of the new columns
display(
    diabetes_surv_df.select(
        "readmitted",
        "event_early_readmit",
        "time_to_event",
        "age",
        "time_in_hospital",
        "admission_type_desc"
    )
)


In [0]:
# How many early readmissions vs others?
diabetes_surv_df.groupBy("event_early_readmit").count().show()

# Average time_to_event by event
diabetes_surv_df.groupBy("event_early_readmit").avg("time_to_event").show()


Export Data to Pandas & Run Survival Analysis (Kaplan–Meier)

In [0]:
surv_select_df = diabetes_surv_df.select(
    "event_early_readmit",
    "time_to_event",
    "age",
    "gender",
    "time_in_hospital",
    "num_lab_procedures",
    "num_medications",
    "number_inpatient",
    "admission_type_desc"
)


In [0]:
surv_pd = surv_select_df.sample(fraction=0.20, seed=42).toPandas()
surv_pd.head()


In [0]:
%pip install lifelines


In [0]:
from lifelines import KaplanMeierFitter
import matplotlib.pyplot as plt

kmf = KaplanMeierFitter()

T = surv_pd["time_to_event"]         # time variable
E = surv_pd["event_early_readmit"]   # event indicator

plt.figure(figsize=(8,6))
kmf.fit(T, event_observed=E)
kmf.plot_survival_function()

plt.title("Kaplan–Meier Survival Curve for Early Readmission")
plt.xlabel("Days")
plt.ylabel("Survival Probability (No Early Readmission)")
plt.grid(True)
plt.show()


Grouped Survival Curves

In [0]:
surv_pd["age"] = surv_pd["age"].astype(str)


In [0]:
plt.figure(figsize=(10,6))

for grp in surv_pd["age"].unique():
    grp_df = surv_pd[surv_pd["age"] == grp]
    if len(grp_df) > 30:   # avoid tiny groups
        kmf.fit(grp_df["time_to_event"], grp_df["event_early_readmit"], label=f"Age {grp}")
        kmf.plot()

plt.title("Kaplan–Meier Survival by Age Group")
plt.xlabel("Days")
plt.ylabel("Survival Probability")
plt.grid(True)
plt.show()


Sentiment Analysis using the hospital reviews dataset

In [0]:
hospital_df.printSchema()
hospital_df.show(5, truncate=False)


In [0]:
from pyspark.sql.functions import col, when

hospital_sent_df = (
    hospital_df
    .withColumn(
        "sentiment_label",
        when(col("Ratings").cast("int") <= 2, "negative")
        .when(col("Ratings").cast("int") == 3, "neutral")
        .otherwise("positive")
    )
)

display(hospital_sent_df.groupBy("sentiment_label").count())


cleaning the text in the feedback column 

In [0]:
from pyspark.sql.functions import lower, regexp_replace, col

hospital_clean_df = (
    hospital_sent_df
    # convert to lowercase
    .withColumn("clean_text", lower(col("Feedback")))
    # remove punctuation and symbols
    .withColumn("clean_text", regexp_replace(col("clean_text"), "[^a-zA-Z0-9 ]", " "))
    # remove extra spaces
    .withColumn("clean_text", regexp_replace(col("clean_text"), " +", " "))
)

display(hospital_clean_df.select("Feedback", "clean_text", "sentiment_label"))


Convert Spark → Pandas for ML processing

In [0]:
# Select only the columns needed for modeling
hospital_model_df = hospital_clean_df.select("clean_text", "sentiment_label")

# Take a 50% random sample to keep it light (you can change 0.5 to 1.0 if you want all)
hospital_pd = hospital_model_df.sample(fraction=0.5, seed=42).toPandas()

hospital_pd.head()
len(hospital_pd)


TF-IDF Vectorization + Logistic Regression Model

In [0]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Features and labels
X = hospital_pd["clean_text"]
y = hospital_pd["sentiment_label"]

# Split into train/test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# TF-IDF vectorizer
tfidf = TfidfVectorizer(
    max_features=5000, 
    ngram_range=(1, 2),  # unigrams + bigrams
    stop_words='english'
)

X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Logistic Regression model
model = LogisticRegression(max_iter=2000)
model.fit(X_train_tfidf, y_train)

# Predictions
y_pred = model.predict(X_test_tfidf)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Extract Top Positive & Negative Words

In [0]:
import numpy as np

# Get feature names from TF-IDF
feature_names = np.array(tfidf.get_feature_names_out())

# Get logistic regression coefficients
coef = model.coef_

# Display top words for each class
for i, label in enumerate(model.classes_):
    print(f"\nTop words for class: {label.upper()}")

    # Most positive indicators for this class
    top_positive = feature_names[np.argsort(coef[i])[-10:]]
    
    # Most negative indicators for this class
    top_negative = feature_names[np.argsort(coef[i])[:10]]

    print("  Positive indicators:", top_positive)
    print("  Negative indicators:", top_negative)


Sentiment Distribution Pie Chart

In [0]:
import matplotlib.pyplot as plt

sentiment_counts = hospital_pd['sentiment_label'].value_counts()

plt.figure(figsize=(6,6))
plt.pie(
    sentiment_counts.values,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    startangle=140,
    colors=['red', 'gold', 'green']  # negative, neutral, positive
)
plt.title("Sentiment Distribution of Hospital Reviews")
plt.show()


Rating Distribution Bar Chart

In [0]:
import seaborn as sns

plt.figure(figsize=(7,4))
sns.countplot(x=hospital_pd['sentiment_label'], palette='viridis')
plt.title("Count of Reviews by Sentiment Class")
plt.xlabel("Sentiment Label")
plt.ylabel("Count")
plt.show()


In [0]:
display(
    diabetes_surv_df
    .groupBy("age", "event_early_readmit")
    .count()
    .orderBy("age")
)


Databricks visualization. Run in Databricks to view.

“Older age groups show a higher count of early readmissions compared to younger groups…”

In [0]:
from pyspark.sql.functions import col

display(
    diabetes_surv_df
    .withColumn(
        "time_in_hospital_num",
        col("time_in_hospital").cast("double")
    )
    .groupBy("readmitted")
    .avg("time_in_hospital_num")
    .withColumnRenamed("avg(time_in_hospital_num)", "avg_length_of_stay")
    .orderBy("readmitted")
)

Databricks visualization. Run in Databricks to view.

* Patients with <30 (early readmission) tend to have longer initial stays, meaning they are more complex or severe cases.

* “NO” readmission patients may have the shortest stays.

In [0]:
from pyspark.sql.functions import col

display(
    diabetes_surv_df
    .withColumn(
        "num_lab_procedures_num",
        col("num_lab_procedures").cast("double")
    )
    .groupBy("readmitted")
    .avg("num_lab_procedures_num")
    .withColumnRenamed("avg(num_lab_procedures_num)", "avg_lab_procedures")
    .orderBy("readmitted")
)

Databricks visualization. Run in Databricks to view.

* Patients who are readmitted early have slightly higher lab procedures → indicates more complex conditions.

* Those never readmitted often have fewer labs → less severe presentation.

Number of inpatient visits vs readmission

In [0]:
from pyspark.sql.functions import col

display(
    diabetes_surv_df
    .withColumn(
        "number_inpatient_num",
        col("number_inpatient").cast("double")
    )
    .groupBy("readmitted")
    .avg("number_inpatient_num")
    .withColumnRenamed("avg(number_inpatient_num)", "avg_previous_inpatient_visits")
    .orderBy("readmitted")
)

Databricks visualization. Run in Databricks to view.

In [0]:
display(
    diabetes_surv_df
    .groupBy("admission_type_desc", "event_early_readmit")
    .count()
    .orderBy("admission_type_desc")
)


In [0]:
from pyspark.sql.functions import col

display(
    diabetes_surv_df
    .withColumn(
        "num_medications_num",
        col("num_medications").cast("double")
    )
    .groupBy("readmitted")
    .avg("num_medications_num")
    .withColumnRenamed("avg(num_medications_num)", "avg_medications")
    .orderBy("readmitted")
)

Databricks visualization. Run in Databricks to view.

WordCloud for Positive & Negative Reviews

In [0]:
%pip install wordcloud

from wordcloud import WordCloud
%pip install wordcloud


In [0]:
import seaborn as sns
import matplotlib.pyplot as plt

# Convert needed columns to pandas
box_pd = diabetes_surv_df.select(
    "readmitted",
    "time_in_hospital",
    "num_lab_procedures",
    "num_medications",
    "number_inpatient"
).toPandas()

fig, axs = plt.subplots(2, 2, figsize=(14, 10))

sns.boxplot(data=box_pd, x="readmitted", y="time_in_hospital", ax=axs[0,0])
axs[0,0].set_title("Length of Stay by Readmission Category")

sns.boxplot(data=box_pd, x="readmitted", y="num_lab_procedures", ax=axs[0,1], color='green')
axs[0,1].set_title("Lab Procedures by Readmission Category")

sns.boxplot(data=box_pd, x="readmitted", y="num_medications", ax=axs[1,0], color='orange')
axs[1,0].set_title("Medication Count by Readmission Category")

sns.boxplot(data=box_pd, x="readmitted", y="number_inpatient", ax=axs[1,1], color='red')
axs[1,1].set_title("Previous Inpatient Visits by Readmission Category")

plt.tight_layout()
plt.show()


Boxplots demonstrate noticeable differences between readmission categories. Patients who experienced early readmission (<30 days) generally had higher medication counts, more lab procedures, and more previous inpatient visits, indicating that clinical complexity strongly correlates with readmission risk.

In [0]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert key fields to pandas
dist_pd = diabetes_surv_df.select(
    "time_in_hospital",
    "num_lab_procedures",
    "num_medications",
    "number_inpatient"
).toPandas()

fig, axs = plt.subplots(2, 2, figsize=(12,10))

sns.histplot(dist_pd['time_in_hospital'], kde=True, ax=axs[0,0])
axs[0,0].set_title("Distribution of Time in Hospital")

sns.histplot(dist_pd['num_lab_procedures'], kde=True, ax=axs[0,1], color='green')
axs[0,1].set_title("Distribution of Lab Procedures")

sns.histplot(dist_pd['num_medications'], kde=True, ax=axs[1,0], color='orange')
axs[1,0].set_title("Distribution of Number of Medications")

sns.histplot(dist_pd['number_inpatient'], kde=True, ax=axs[1,1], color='red')
axs[1,1].set_title("Distribution of Previous Inpatient Visits")

plt.tight_layout()
plt.show()


Most patients have a short to moderate initial length of stay.

Lab procedures and medication counts show skewed distributions, with a small number of complex cases receiving many labs/meds.

Prior inpatient visits distribution shows that frequent-readmission patients are a small but important high-risk cluster.

WordCloud for Positive & Negative Reviews

In [0]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Join text by sentiment
positive_text = " ".join(
    hospital_pd[hospital_pd.sentiment_label=="positive"]["clean_text"]
)

negative_text = " ".join(
    hospital_pd[hospital_pd.sentiment_label=="negative"]["clean_text"]
)

# Positive WordCloud
wc_pos = WordCloud(
    background_color="white",
    max_words=100
).generate(positive_text)

plt.figure(figsize=(8,5))
plt.imshow(wc_pos, interpolation="bilinear")
plt.axis("off")
plt.title("Positive Review WordCloud")
plt.show()

# Negative WordCloud
wc_neg = WordCloud(
    background_color="black",
    colormap="Reds",
    max_words=100
).generate(negative_text)

plt.figure(figsize=(8,5))
plt.imshow(wc_neg, interpolation="bilinear")
plt.axis("off")
plt.title("Negative Review WordCloud")
plt.show()


. Results and Interpretation
4.1 Survival Analysis Results

A Kaplan–Meier (KM) survival curve was generated to estimate the probability of diabetic patients not being readmitted within 30 days. Survival probability here represents the likelihood of remaining out of the hospital without an early readmission event.

Key Findings

Overall survival curve declines sharply within the early time window.
This indicates that a significant proportion of diabetic patients experience early readmission, confirming that 30-day readmission is a major clinical issue.

Average time-to-event differs between groups.
Patients labeled with early readmission (<30 days) had a mean estimated time-to-event of approximately 15 days, while censored patients (“NO” readmission) had the maximum time value of 60 days.

Risk factors appear visually in grouped survival curves.
When stratified by patient age group, older adults demonstrated faster survival curve drops, indicating a higher early readmission risk.

Interpretation

The survival analysis highlights that diabetic patients face considerable risk of early hospital readmission. Trends observed across age groups and admission categories suggest that:

Older patients and those admitted for urgent/emergency reasons may require closer post-discharge monitoring.

Survival modeling supports hospitals in allocating follow-up resources to the most vulnerable subpopulations.

This aligns with existing clinical literature that identifies diabetes as a high-risk condition susceptible to early readmission without adequate discharge planning.

4.2 Sentiment Analysis Results

A sentiment classification model was developed using TF–IDF features and Logistic Regression. Reviews were categorized as positive, neutral, or negative based on their textual content.

Model Performance

Overall accuracy: ~0.67

Positive sentiment: High recall (1.00), indicating strong ability to detect positive reviews

Negative sentiment: Moderate performance (F1 ≈ 0.63)

Neutral sentiment: Weak predictive power due to limited samples

Although accuracy is not extremely high, this is expected in short-review sentiment datasets with imbalanced classes.

Sentiment Distribution

Positive reviews form the majority, suggesting generally favorable patient experiences.

Negative reviews, though smaller in number, provide valuable insights into areas needing improvement.

Top Discriminative Words

(From logistic regression coefficients)

Positive indicators: “good”, “great”, “process”, “friendly”, “excellent”, “team”

Negative indicators: “bad”, “experience”, “discharge”, “clueless”, “waiting”, “only”

These keywords reveal the themes influencing patient sentiment.

Word Cloud Insights

Positive wordcloud emphasized staff attitude, good care, and smooth processes.

Negative wordcloud highlighted dissatisfaction with:

Discharge process

Waiting times

Poor communication

Interpretation

Patient reviews reflect both strengths and weaknesses in hospital service delivery.
Positive sentiment aligns with effective care and staff professionalism, while negative reviews reveal common pain points such as:

Delayed discharge workflows

Inadequate communication

Perceived inefficiencies

These insights complement clinical survival findings by linking patient experiences with potential readmission risk factors.

 4.3 Integrated Insights

By combining survival analysis with sentiment modeling:

Clinical data identifies which patient groups are most likely to be readmitted.

Sentiment data reveals patient-reported issues that may contribute to those outcomes.

Together, these insights support more holistic decision-making aimed at:

Improving discharge procedures

Enhancing communication

Implementing targeted follow-up for high-risk patients

This strengthens hospital quality improvement strategies and aligns with value-based care objectives.

This project developed a cloud-based advanced analytics pipeline that integrates survival analysis of diabetic patient readmission data with sentiment analysis of hospital reviews. The survival analysis demonstrated that diabetic patients face substantial risk of early readmission, particularly older adults and those admitted through emergency or urgent channels. These findings highlight the importance of targeted discharge planning, improved follow-up care, and risk stratification strategies for vulnerable groups.