<a href="https://colab.research.google.com/github/nagken/medpredictml/blob/main/medpredict1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd

url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/patient_data.csv"
df = pd.read_csv(url)

print(df.head())  # Confirm it's correct


   age admission_type     diagnosis medications  length_of_stay  \
0   45      emergency      diabetes     insulin               5   
1   60        routine  hypertension   metformin               3   
2   30         urgent        asthma   albuterol               2   

   previous_admissions  readmitted  
0                    2           1  
1                    1           0  
2                    0           1  


In [7]:
import pandas as pd

# Load Diabetic Data
diabetic_url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df_diabetic = pd.read_csv(diabetic_url)

# Load Patient Data
patient_url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/patient_data.csv"
df_patient = pd.read_csv(patient_url)

# Display first few rows
print("Diabetic Data Sample:")
print(df_diabetic.head())

print("\nPatient Data Sample:")
print(df_patient.head())


Diabetic Data Sample:
   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      N

In [9]:
# Check column names and missing values
print("\nDiabetic Data Info:")
print(df_diabetic.info())

print("\nPatient Data Info:")
print(df_patient.info())

# Check for missing values
print("\nMissing Values in Diabetic Data:")
print(df_diabetic.isnull().sum())

print("\nMissing Values in Patient Data:")
print(df_patient.isnull().sum())



Diabetic Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101766 entries, 0 to 101765
Data columns (total 50 columns):
 #   Column                    Non-Null Count   Dtype 
---  ------                    --------------   ----- 
 0   encounter_id              101766 non-null  int64 
 1   patient_nbr               101766 non-null  int64 
 2   race                      101766 non-null  object
 3   gender                    101766 non-null  object
 4   age                       101766 non-null  object
 5   weight                    101766 non-null  object
 6   admission_type_id         101766 non-null  int64 
 7   discharge_disposition_id  101766 non-null  int64 
 8   admission_source_id       101766 non-null  int64 
 9   time_in_hospital          101766 non-null  int64 
 10  payer_code                101766 non-null  object
 11  medical_specialty         101766 non-null  object
 12  num_lab_procedures        101766 non-null  int64 
 13  num_procedures            101766 non-n

In [10]:
from sklearn.preprocessing import OneHotEncoder

# Selecting categorical columns from Patient Data
categorical_cols = ["admission_type", "diagnosis", "medications"]
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")

# Encoding categorical variables
encoded_features = encoder.fit_transform(df_patient[categorical_cols])
encoded_df = pd.DataFrame(encoded_features)

# Concatenating with original dataset
df_patient_processed = pd.concat([df_patient.drop(columns=categorical_cols), encoded_df], axis=1)

print("\nProcessed Patient Data Sample:")
print(df_patient_processed.head())



Processed Patient Data Sample:
   age  length_of_stay  previous_admissions  readmitted    0    1    2    3  \
0   45               5                    2           1  1.0  0.0  0.0  0.0   
1   60               3                    1           0  0.0  1.0  0.0  0.0   
2   30               2                    0           1  0.0  0.0  1.0  1.0   

     4    5    6    7    8  
0  1.0  0.0  0.0  1.0  0.0  
1  0.0  1.0  0.0  0.0  1.0  
2  0.0  0.0  1.0  0.0  0.0  


In [11]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define features and target variable
X = df_patient_processed.drop(columns=["readmitted"])  # Assuming 'readmitted' is the target column
y = df_patient_processed["readmitted"]

# Split data into train-test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train an XGBoost model
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Check accuracy
print("\nModel Accuracy:", accuracy_score(y_test, y_pred))


Parameters: { "use_label_encoder" } are not used.




Model Accuracy: 0.0


In [12]:
print(df_patient_processed["readmitted"].value_counts())


readmitted
1    2
0    1
Name: count, dtype: int64


In [13]:
print(df_patient_processed["readmitted"].unique())


[1 0]


In [15]:
df_patient_processed["readmitted"] = df_patient_processed["readmitted"].map({"<30": 1, ">30": 1, "NO": 0})


In [16]:
print(X.dtypes)


age                      int64
length_of_stay           int64
previous_admissions      int64
0                      float64
1                      float64
2                      float64
3                      float64
4                      float64
5                      float64
6                      float64
7                      float64
8                      float64
dtype: object


In [17]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Define features and target variable
X = df_patient_processed.drop(columns=["readmitted"])
y = df_patient_processed["readmitted"]

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train XGBoost with scale_pos_weight
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_test)
print("\nFixed Model Accuracy:", accuracy_score(y_test, y_pred))


ValueError: Input y contains NaN.

In [18]:
print(y.isnull().sum())  # Check how many NaNs exist
print(y.unique())        # Check all unique values


3
[nan]


In [20]:
df_patient_processed = df_patient_processed.dropna(subset=["readmitted"])
X = df_patient_processed.drop(columns=["readmitted"])
y = df_patient_processed["readmitted"]


In [21]:
df_patient_processed["readmitted"].fillna(df_patient_processed["readmitted"].mode()[0], inplace=True)


KeyError: 0

In [22]:
print(y.isnull().sum())  # Should print 0


0


In [23]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/patient_data.csv"
df = pd.read_csv(url)

# 🔍 Step 1: Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# 🔍 Step 2: Handle missing values (Drop or Impute)
df = df.dropna(subset=["readmitted"])  # Drop rows with missing target variable

# 🔍 Step 3: Feature Engineering - Convert categorical variables
categorical_cols = ["admission_type", "diagnosis", "medications"]
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# 🔍 Step 4: Standardize numerical features
numerical_cols = ["age", "length_of_stay", "previous_admissions"]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# 🔍 Step 5: Prepare final dataset
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)  # Convert target variable to integer

# 🔍 Step 6: Train-Test Split (Fix NaN issues in `y`)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 🔍 Step 7: Train XGBoost Model
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
model.fit(X_train, y_train)

# 🔍 Step 8: Make Predictions
y_pred = model.predict(X_test)

# 🔍 Step 9: Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Model Training Completed!")
print("🎯 Model Accuracy:", accuracy)


Missing values in each column:
 age                    0
admission_type         0
diagnosis              0
medications            0
length_of_stay         0
previous_admissions    0
readmitted             0
dtype: int64


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [24]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df = pd.read_csv(url)

# 🔍 Check the first few rows
print(df.head())

# 🔍 Check for missing values
print("Missing values in each column:\n", df.isnull().sum())


   encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No                   No

In [25]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/patient_data.csv"
df = pd.read_csv(url)

# 🔍 Step 1: Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# 🔍 Step 2: Handle missing values (Drop or Impute)
df = df.dropna(subset=["readmitted"])  # Drop rows with missing target variable

# 🔍 Step 3: Feature Engineering - Convert categorical variables
categorical_cols = ["admission_type", "diagnosis", "medications"]
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# 🔍 Step 4: Standardize numerical features
numerical_cols = ["age", "length_of_stay", "previous_admissions"]
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# 🔍 Step 5: Prepare final dataset
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)  # Convert target variable to integer

# 🔍 Step 6: Train-Test Split (Fix NaN issues in `y`)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 🔍 Step 7: Train XGBoost Model
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
model.fit(X_train, y_train)

# 🔍 Step 8: Make Predictions
y_pred = model.predict(X_test)

# 🔍 Step 9: Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Model Training Completed!")
print("🎯 Model Accuracy:", accuracy)


Missing values in each column:
 age                    0
admission_type         0
diagnosis              0
medications            0
length_of_stay         0
previous_admissions    0
readmitted             0
dtype: int64


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [26]:
# 📌 Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# 📌 Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/patient_data.csv"
df = pd.read_csv(url)

# 🔍 Step 1: Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# 🔍 Step 2: Convert categorical `readmitted` column to numeric (Binary Classification)
df["readmitted"] = df["readmitted"].astype(int)  # Ensure it's an integer

# 🔍 Step 3: Ensure `readmitted` has more than 1 instance per class
print("Class distribution in 'readmitted':\n", df["readmitted"].value_counts())

# 🔍 Fix for `ValueError`: Remove classes with only 1 instance
df = df[df["readmitted"].map(df["readmitted"].value_counts()) > 1]

# 📌 Step 4: Define Features (X) & Target (y)
categorical_cols = ["admission_type", "diagnosis", "medications"]
numerical_cols = ["age", "length_of_stay", "previous_admissions"]

# 🔍 Encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# 🔍 Standardize numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# 📌 Combine all features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"]

# 🔍 Step 5: Fix Train-Test Split Issue
if len(y.unique()) < 2:
    raise ValueError("Not enough unique classes in `readmitted`. Check dataset balance.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# 📌 Step 6: Train XGBoost Model
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
model.fit(X_train, y_train)

# 📌 Step 7: Make Predictions & Evaluate Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n✅ Patient Readmission Model Training Completed!")
print("🎯 Model Accuracy:", accuracy)


Missing values in each column:
 age                    0
admission_type         0
diagnosis              0
medications            0
length_of_stay         0
previous_admissions    0
readmitted             0
dtype: int64
Class distribution in 'readmitted':
 readmitted
1    2
0    1
Name: count, dtype: int64


ValueError: Not enough unique classes in `readmitted`. Check dataset balance.

In [27]:
# 📌 Import required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# 📌 Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/patient_data.csv"
df = pd.read_csv(url)

# 🔍 Step 1: Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# 🔍 Step 2: Convert `readmitted` column to integer
df["readmitted"] = df["readmitted"].astype(int)

# 🔍 Step 3: Check Class Distribution
print("\nClass distribution in 'readmitted':\n", df["readmitted"].value_counts())

# 🔍 Fix: Ensure at least 10 instances per class
min_class_size = 10
if df["readmitted"].value_counts().min() < min_class_size:
    print("\n🚨 Not enough data for training! Adding synthetic samples...\n")

    # Duplicate minority class
    df = df.append(df[df["readmitted"] == 0].sample(min_class_size, replace=True))
    df = df.append(df[df["readmitted"] == 1].sample(min_class_size, replace=True))

# 🔍 Step 4: Define Features (X) & Target (y)
categorical_cols = ["admission_type", "diagnosis", "medications"]
numerical_cols = ["age", "length_of_stay", "previous_admissions"]

# 🔍 Encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# 🔍 Standardize numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# 📌 Combine all features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"]

# 🔍 Step 5: Fix Train-Test Split Issue
if len(y.unique()) < 2:
    raise ValueError("🚨 Not enough unique classes in `readmitted`. Check dataset balance.")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 📌 Step 6: Train XGBoost Model
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# 📌 Step 7: Make Predictions & Evaluate Model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print("\n✅ Patient Readmission Model Training Completed!")
print("🎯 Model Accuracy:", accuracy)


Missing values in each column:
 age                    0
admission_type         0
diagnosis              0
medications            0
length_of_stay         0
previous_admissions    0
readmitted             0
dtype: int64

Class distribution in 'readmitted':
 readmitted
1    2
0    1
Name: count, dtype: int64

🚨 Not enough data for training! Adding synthetic samples...



AttributeError: 'DataFrame' object has no attribute 'append'

In [29]:
# Drop unnecessary columns (if needed)
df = df.drop(columns=["encounter_id", "patient_nbr"], errors="ignore")  # These don't impact prediction

# Replace "?" with NaN for proper missing value handling
df.replace("?", np.nan, inplace=True)

# 🔍 Drop rows where the target column `readmitted` is missing
df = df.dropna(subset=["readmitted"])

# 🔍 Encode target variable (`readmitted` as binary: 1 = readmitted, 0 = not readmitted)
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x in ["<30", ">30"] else 0)


In [30]:
# Select categorical & numerical features
categorical_cols = ["race", "gender", "admission_type_id", "discharge_disposition_id", "admission_source_id"]
numerical_cols = ["age", "time_in_hospital", "num_lab_procedures", "num_medications", "number_diagnoses"]

# One-Hot Encode Categorical Columns
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Standardize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# Combine Processed Features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)  # Convert target variable to integer


KeyError: "None of [Index(['race', 'gender', 'admission_type_id', 'discharge_disposition_id',\n       'admission_source_id'],\n      dtype='object')] are in the [columns]"

In [31]:
# 📌 Step 1: Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# 📌 Step 2: Load Dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df = pd.read_csv(url)

# 🔍 Check for missing values
print("Missing values in each column:\n", df.isnull().sum())

# 📌 Step 3: Preprocessing & Handling Missing Data
df = df.drop(columns=["encounter_id", "patient_nbr"], errors="ignore")  # Drop unnecessary columns
df.replace("?", np.nan, inplace=True)  # Convert "?" to NaN
df = df.dropna(subset=["readmitted"])  # Drop rows with missing target column

# 🔍 Encode `readmitted` as binary (1 = readmitted, 0 = not readmitted)
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x in ["<30", ">30"] else 0)

# 📌 Step 4: Feature Engineering
categorical_cols = ["race", "gender", "admission_type_id", "discharge_disposition_id", "admission_source_id"]
numerical_cols = ["age", "time_in_hospital", "num_lab_procedures", "num_medications", "number_diagnoses"]

# One-Hot Encoding for Categorical Variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Standardize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# Combine Processed Features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)

# 📌 Step 5: Train & Evaluate XGBoost Model
# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train XGBoost Classifier with class imbalance handling
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print("\n✅ Diabetes Model Training Completed!")
print("🎯 Model Accuracy:", accuracy)


Missing values in each column:
 encounter_id                    0
patient_nbr                     0
race                            0
gender                          0
age                             0
weight                          0
admission_type_id               0
discharge_disposition_id        0
admission_source_id             0
time_in_hospital                0
payer_code                      0
medical_specialty               0
num_lab_procedures              0
num_procedures                  0
num_medications                 0
number_outpatient               0
number_emergency                0
number_inpatient                0
diag_1                          0
diag_2                          0
diag_3                          0
number_diagnoses                0
max_glu_serum               96420
A1Cresult                   84748
metformin                       0
repaglinide                     0
nateglinide                     0
chlorpropamide                  0
glimepiride     

ValueError: could not convert string to float: '[0-10)'

In [32]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df = pd.read_csv(url)

# Drop unnecessary columns
df = df.drop(columns=["encounter_id", "patient_nbr"], errors="ignore")

# Replace "?" with NaN for proper handling
df.replace("?", np.nan, inplace=True)

# Drop columns with excessive missing values
df = df.drop(columns=["max_glu_serum", "A1Cresult"], errors="ignore")

# Convert `age` from ranges like "[10-20)" to numerical midpoints
age_mapping = {
    "[0-10)": 5, "[10-20)": 15, "[20-30)": 25, "[30-40)": 35, "[40-50)": 45,
    "[50-60)": 55, "[60-70)": 65, "[70-80)": 75, "[80-90)": 85, "[90-100)": 95
}
df["age"] = df["age"].map(age_mapping)

# Drop rows where target column `readmitted` is missing
df = df.dropna(subset=["readmitted"])

# Encode target variable (`readmitted` as binary: 1 = readmitted, 0 = not readmitted)
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x in ["<30", ">30"] else 0)

# Define categorical and numerical columns
categorical_cols = ["race", "gender", "admission_type_id", "discharge_disposition_id", "admission_source_id"]
numerical_cols = ["age", "time_in_hospital", "num_lab_procedures", "num_medications", "number_diagnoses"]

# One-Hot Encode Categorical Variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Standardize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# Combine Processed Features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Train XGBoost Classifier with class imbalance handling
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
model.fit(X_train, y_train)

# Make Predictions
y_pred = model.predict(X_test)

# Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", accuracy)


Model Accuracy: 0.5945760047165176


In [33]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

# Load dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df = pd.read_csv(url)

# Drop unnecessary columns
df = df.drop(columns=["encounter_id", "patient_nbr"], errors="ignore")

# Replace "?" with NaN
df.replace("?", np.nan, inplace=True)

# Drop columns with excessive missing values
df = df.drop(columns=["max_glu_serum", "A1Cresult"], errors="ignore")

# Convert `age` from categorical ranges to numerical values
age_mapping = {
    "[0-10)": 5, "[10-20)": 15, "[20-30)": 25, "[30-40)": 35, "[40-50)": 45,
    "[50-60)": 55, "[60-70)": 65, "[70-80)": 75, "[80-90)": 85, "[90-100)": 95
}
df["age"] = df["age"].map(age_mapping)

# Drop rows where target column `readmitted` is missing
df = df.dropna(subset=["readmitted"])

# Encode target variable (`readmitted` as binary: 1 = readmitted, 0 = not readmitted)
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x in ["<30", ">30"] else 0)

# Define categorical and numerical columns
categorical_cols = ["race", "gender", "admission_type_id", "discharge_disposition_id", "admission_source_id"]
numerical_cols = ["age", "time_in_hospital", "num_lab_procedures", "num_medications", "number_diagnoses"]

# One-Hot Encode Categorical Variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Standardize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# Combine Processed Features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)

# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

# Hyperparameter tuning for XGBoost
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'scale_pos_weight': [1, y_train.value_counts()[0] / y_train.value_counts()[1]]
}

xgb = XGBClassifier(eval_metric='logloss')
grid_search = GridSearchCV(xgb, param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Train the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Evaluate Model
accuracy = accuracy_score(y_test, y_pred)
print("Optimized Model Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Optimized Model Accuracy: 0.6239405814271394
Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.60      0.61     10997
           1       0.62      0.65      0.63     10949

    accuracy                           0.62     21946
   macro avg       0.62      0.62      0.62     21946
weighted avg       0.62      0.62      0.62     21946



In [34]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df = pd.read_csv(url)

# Drop unnecessary columns
df = df.drop(columns=["encounter_id", "patient_nbr"], errors="ignore")

# Replace "?" with NaN
df.replace("?", np.nan, inplace=True)

# Drop rows with missing target values
df = df.dropna(subset=["readmitted"])

# Encode target variable
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x in ["<30", ">30"] else 0)

# Define categorical and numerical features
categorical_cols = ["race", "gender", "admission_type_id", "discharge_disposition_id", "admission_source_id"]
numerical_cols = ["age", "time_in_hospital", "num_lab_procedures", "num_medications", "number_diagnoses"]

# One-Hot Encoding
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Standardize numerical features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# Combine processed features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)

# Apply SMOTE to balance the classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Train XGBoost Classifier
model = XGBClassifier(eval_metric='logloss')
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Model Evaluation
accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy after SMOTE:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


ValueError: could not convert string to float: '[0-10)'

In [35]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Load the dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df = pd.read_csv(url)

# Display first few rows
print("Sample Data:\n", df.head())

# Check for missing values
print("\nMissing values in each column:\n", df.isnull().sum())

# 🔹 Step 1: Drop unnecessary columns
df = df.drop(columns=["encounter_id", "patient_nbr"], errors="ignore")  # These IDs are not useful for prediction

# 🔹 Step 2: Convert "?" values to NaN
df.replace("?", np.nan, inplace=True)

# 🔹 Step 3: Handle Age Column (Convert Categorical Ranges to Numeric Midpoints)
age_mapping = {
    "[0-10)": 5, "[10-20)": 15, "[20-30)": 25, "[30-40)": 35, "[40-50)": 45,
    "[50-60)": 55, "[60-70)": 65, "[70-80)": 75, "[80-90)": 85, "[90-100)": 95
}
df["age"] = df["age"].map(age_mapping)

# 🔹 Step 4: Handle Missing Data
# Drop rows where `readmitted` is missing (our target variable)
df = df.dropna(subset=["readmitted"])

# Encode the target variable (`readmitted`: 1 if readmitted, 0 if not)
df["readmitted"] = df["readmitted"].apply(lambda x: 1 if x in ["<30", ">30"] else 0)

# 🔹 Step 5: Feature Engineering
categorical_cols = ["race", "gender", "admission_type_id", "discharge_disposition_id", "admission_source_id"]
numerical_cols = ["age", "time_in_hospital", "num_lab_procedures", "num_medications", "number_diagnoses"]

# One-Hot Encode Categorical Features
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Standardize Numerical Features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# Combine Processed Features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].astype(int)  # Convert target variable to integer

# 🔹 Step 6: Handle Class Imbalance (Check Distribution)
print("\nClass distribution in 'readmitted':\n", y.value_counts())

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Handle Class Imbalance Using scale_pos_weight
scale_pos_weight = y_train.value_counts()[0] / y_train.value_counts()[1]

# 🔹 Step 7: Train XGBoost Model
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=scale_pos_weight)
model.fit(X_train, y_train)

# 🔹 Step 8: Make Predictions
y_pred = model.predict(X_test)

# 🔹 Step 9: Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Sample Data:
    encounter_id  patient_nbr             race  gender      age weight  \
0       2278392      8222157        Caucasian  Female   [0-10)      ?   
1        149190     55629189        Caucasian  Female  [10-20)      ?   
2         64410     86047875  AfricanAmerican  Female  [20-30)      ?   
3        500364     82442376        Caucasian    Male  [30-40)      ?   
4         16680     42519267        Caucasian    Male  [40-50)      ?   

   admission_type_id  discharge_disposition_id  admission_source_id  \
0                  6                        25                    1   
1                  1                         1                    7   
2                  1                         1                    7   
3                  1                         1                    7   
4                  1                         1                    7   

   time_in_hospital  ... citoglipton insulin  glyburide-metformin  \
0                 1  ...          No      No       

In [38]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE  # To handle class imbalance

# Load dataset
url = "https://raw.githubusercontent.com/nagken/medpredictml/main/data/diabetic_data.csv"
df = pd.read_csv(url)

# Step 1: Drop unnecessary columns
df = df.drop(columns=["encounter_id", "patient_nbr"], errors="ignore")

# Step 2: Handle Missing Values
df.replace("?", np.nan, inplace=True)

# Drop max_glu_serum and A1Cresult (too many missing values)
df = df.drop(columns=["max_glu_serum", "A1Cresult"], errors="ignore")

# Fill missing categorical values with "Unknown"
for col in df.select_dtypes(include=["object"]).columns:
    df[col] = df[col].fillna("Unknown")

# Step 3: Convert Categorical Age to Numeric Midpoints
age_mapping = {
    "[0-10)": 5, "[10-20)": 15, "[20-30)": 25, "[30-40)": 35, "[40-50)": 45,
    "[50-60)": 55, "[60-70)": 65, "[70-80)": 75, "[80-90)": 85, "[90-100)": 95
}
df["age"] = df["age"].map(age_mapping)

# Step 4: Encode Categorical Columns
categorical_cols = ["race", "gender", "admission_type_id", "discharge_disposition_id", "admission_source_id",
                    "insulin", "change", "diabetesMed"]

# Convert "No", "Steady", "Up" to numerical values
insulin_mapping = {"No": 0, "Steady": 1, "Up": 2, "Down": 3}
df["insulin"] = df["insulin"].map(insulin_mapping)

# Convert "No" / "Ch" in 'change' column
df["change"] = df["change"].map({"No": 0, "Ch": 1})

# Convert "Yes" / "No" in 'diabetesMed' column
df["diabetesMed"] = df["diabetesMed"].map({"No": 0, "Yes": 1})

# One-Hot Encode Other Categorical Variables
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
encoded_features = encoder.fit_transform(df[categorical_cols])
encoded_df = pd.DataFrame(encoded_features, columns=encoder.get_feature_names_out())

# Step 5: Standardize Numerical Features
numerical_cols = ["age", "time_in_hospital", "num_lab_procedures", "num_medications", "number_diagnoses"]

scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[numerical_cols])
scaled_df = pd.DataFrame(scaled_features, columns=numerical_cols)

# Step 6: Combine All Features
X = pd.concat([scaled_df, encoded_df], axis=1)
y = df["readmitted"].apply(lambda x: 1 if x in ["<30", ">30"] else 0)  # Convert target to binary (1 = Readmitted)

# Handle Class Imbalance using SMOTE
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 7: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled)

# Step 8: Train Optimized XGBoost Model
model = XGBClassifier(eval_metric='logloss', scale_pos_weight=y_train.value_counts()[0] / y_train.value_counts()[1])
model.fit(X_train, y_train)

# Step 9: Make Predictions
y_pred = model.predict(X_test)

# Step 10: Evaluate Model Performance
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Model Accuracy: 0.6279048573771986

Classification Report:
               precision    recall  f1-score   support

           0       0.63      0.61      0.62     10973
           1       0.62      0.65      0.63     10973

    accuracy                           0.63     21946
   macro avg       0.63      0.63      0.63     21946
weighted avg       0.63      0.63      0.63     21946

