In [3]:
import pandas as pd

# Read the raw hospital readmissions CSV and preview the first rows.
data_file = 'hospital_readmissions.csv'
df = pd.read_csv(data_file)
df.head(5)

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,diag_2,diag_3,glucose_test,A1Ctest,change,diabetes_med,readmitted
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,Respiratory,Other,no,no,no,yes,no
1,[70-80),3,34,2,13,0,0,0,Other,Other,Other,Other,no,no,no,yes,no
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,Circulatory,Circulatory,no,no,yes,yes,yes
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,Other,Diabetes,no,no,yes,yes,yes
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,Circulatory,Respiratory,no,no,no,yes,no


In [4]:
# Count null entries per column (isna is the pandas alias of isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

age                  0
time_in_hospital     0
n_lab_procedures     0
n_procedures         0
n_medications        0
n_outpatient         0
n_inpatient          0
n_emergency          0
medical_specialty    0
diag_1               0
diag_2               0
diag_3               0
glucose_test         0
A1Ctest              0
change               0
diabetes_med         0
readmitted           0
dtype: int64


In [5]:
# List the distinct values of every column whose name mentions readmission.
target_keywords = ('readmit', 'readmission')
readmission_cols = [
    name for name in df.columns
    if any(keyword in name.lower() for keyword in target_keywords)
]
for col in readmission_cols:
    print(f"{col}: {df[col].unique()}")

readmitted: ['no' 'yes']


In [6]:
# Show the class balance of the target, first as counts and then as percentages.
readmitted_col = df['readmitted']
class_counts = readmitted_col.value_counts()
class_percentages = readmitted_col.value_counts(normalize=True) * 100

print(class_counts)
print("\nPercentages:")
print(class_percentages)

readmitted
no     13246
yes    11754
Name: count, dtype: int64

Percentages:
readmitted
no     52.984
yes    47.016
Name: proportion, dtype: float64


In [7]:
# Encode the target as numeric: 0 for 'no', 1 for 'yes'.
target_mapping = {'no': 0, 'yes': 1}
readmitted_encoded = df['readmitted'].map(target_mapping)

# Series.map turns any label that is missing from the mapping into NaN.
# Fail loudly here rather than letting NaN targets flow into the model.
unmapped = readmitted_encoded.isna()
if unmapped.any():
    unexpected = sorted(df.loc[unmapped, 'readmitted'].astype(str).unique())
    raise ValueError(f"Unexpected values in 'readmitted': {unexpected}")

df['readmitted_encoded'] = readmitted_encoded
print("Target variable encoded:")
print(df['readmitted_encoded'].value_counts())

Target variable encoded:
readmitted_encoded
0    13246
1    11754
Name: count, dtype: int64


In [8]:
# Print the dtype of every column under a heading.
print("Data types:", df.dtypes, sep="\n")

Data types:
age                   object
time_in_hospital       int64
n_lab_procedures       int64
n_procedures           int64
n_medications          int64
n_outpatient           int64
n_inpatient            int64
n_emergency            int64
medical_specialty     object
diag_1                object
diag_2                object
diag_3                object
glucose_test          object
A1Ctest               object
change                object
diabetes_med          object
readmitted            object
readmitted_encoded     int64
dtype: object


In [9]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical column into a parallel '<column>_encoded'
# column on a copy of the data, leaving the original frame untouched.
categorical_cols = ['age', 'medical_specialty', 'diag_1', 'diag_2', 'diag_3',
                    'glucose_test', 'A1Ctest', 'change', 'diabetes_med']

df_encoded = df.copy()

# Keep each fitted encoder, keyed by its source column. Without them the
# category -> integer mapping is lost, so new patient records could not be
# encoded consistently and the codes could not be translated back to labels.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col + '_encoded'] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le

print("Categorical columns encoded!")

Categorical columns encoded!


In [10]:
# Preview the first five rows of the frame, including the new *_encoded columns.
df_encoded.head(n=5)

Unnamed: 0,age,time_in_hospital,n_lab_procedures,n_procedures,n_medications,n_outpatient,n_inpatient,n_emergency,medical_specialty,diag_1,...,readmitted_encoded,age_encoded,medical_specialty_encoded,diag_1_encoded,diag_2_encoded,diag_3_encoded,glucose_test_encoded,A1Ctest_encoded,change_encoded,diabetes_med_encoded
0,[70-80),8,72,1,18,2,0,0,Missing,Circulatory,...,0,3,4,0,7,6,1,1,0,1
1,[70-80),3,34,2,13,0,0,0,Other,Other,...,0,3,5,6,6,6,1,1,0,1
2,[50-60),5,45,0,18,0,0,0,Missing,Circulatory,...,1,1,4,0,0,0,1,1,1,1
3,[70-80),2,36,0,12,1,0,0,Missing,Circulatory,...,1,3,4,0,6,1,1,1,1,1
4,[60-70),1,42,0,7,0,0,0,InternalMedicine,Other,...,0,2,3,6,0,7,1,1,0,1


In [11]:
# Assemble the model inputs: the raw numeric columns plus the integer-encoded
# categorical columns, in that order.
numeric_feature_cols = [
    'time_in_hospital', 'n_lab_procedures', 'n_procedures',
    'n_medications', 'n_outpatient', 'n_inpatient', 'n_emergency',
]
encoded_feature_cols = [
    'age_encoded', 'medical_specialty_encoded', 'diag_1_encoded',
    'diag_2_encoded', 'diag_3_encoded', 'glucose_test_encoded',
    'A1Ctest_encoded', 'change_encoded', 'diabetes_med_encoded',
]
feature_cols = numeric_feature_cols + encoded_feature_cols

# Feature matrix and numeric target vector.
y = df_encoded['readmitted_encoded']
X = df_encoded[feature_cols]

print("Final feature matrix shape:", X.shape)

Final feature matrix shape: (25000, 16)


In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

for label, subset in (("Training set shape:", X_train), ("Test set shape:", X_test)):
    print(label, subset.shape)

Training set shape: (20000, 16)
Test set shape: (5000, 16)


In [13]:
from sklearn.ensemble import RandomForestClassifier

# Build a 100-tree random forest with a fixed seed and fit it on the training
# split (fit returns the estimator itself, so the call can be chained).
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

print("Model trained successfully!")

Model trained successfully!


In [14]:
from sklearn.metrics import accuracy_score, classification_report

# Predict on the held-out rows and score the predictions.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Report the accuracy both as a fraction and as a percentage.
print(
    f"Model Accuracy: {accuracy:.3f}",
    f"Model Accuracy: {accuracy*100:.1f}%",
    sep="\n",
)

Model Accuracy: 0.602
Model Accuracy: 60.2%


In [15]:
# Per-class precision, recall, and F1 on the test split.
class_labels = ['No Readmission', 'Readmission']
report_text = classification_report(y_test, y_pred, target_names=class_labels)

print("Detailed Classification Report:")
print(report_text)

Detailed Classification Report:
                precision    recall  f1-score   support

No Readmission       0.61      0.69      0.65      2658
   Readmission       0.59      0.50      0.54      2342

      accuracy                           0.60      5000
     macro avg       0.60      0.60      0.60      5000
  weighted avg       0.60      0.60      0.60      5000



In [40]:
# Pair each feature name with the importance score from the fitted forest.
feature_names = X.columns
feature_importance = model.feature_importances_

# Tabulate the scores, then rank them from most to least important.
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importance})
importance_df = importance_df.sort_values(by='importance', ascending=False)

print("Top 10 Most Important Features:")
print(importance_df.iloc[:10])

Top 10 Most Important Features:
                      feature  importance
1            n_lab_procedures    0.176605
3               n_medications    0.142970
0            time_in_hospital    0.101701
7                 age_encoded    0.071964
9              diag_1_encoded    0.069869
8   medical_specialty_encoded    0.067477
10             diag_2_encoded    0.066020
11             diag_3_encoded    0.065834
2                n_procedures    0.059166
5                 n_inpatient    0.055138


In [17]:
# Spot-check the model on the first few patients of the held-out test set,
# comparing each prediction with the true outcome.
n_preview = 5
real_patients = X_test.head(n_preview)
real_predictions = model.predict(real_patients)
real_probabilities = model.predict_proba(real_patients)

# Loop over the rows actually selected rather than a fixed count, so a test
# set with fewer than n_preview rows does not raise an IndexError.
for i in range(len(real_patients)):
    actual = y_test.iloc[i]
    predicted = real_predictions[i]
    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (the label 1, i.e. readmission, for this 0/1 target).
    prob = real_probabilities[i][1]

    print(f"Patient {i+1}:")
    print(f"  Actual: {'Readmission' if actual == 1 else 'No Readmission'}")
    print(f"  Predicted: {'Readmission' if predicted == 1 else 'No Readmission'}")
    print(f"  Readmission Probability: {prob:.3f}")
    print()

Patient 1:
  Actual: Readmission
  Predicted: No Readmission
  Readmission Probability: 0.470

Patient 2:
  Actual: No Readmission
  Predicted: No Readmission
  Readmission Probability: 0.260

Patient 3:
  Actual: No Readmission
  Predicted: No Readmission
  Readmission Probability: 0.420

Patient 4:
  Actual: No Readmission
  Predicted: Readmission
  Readmission Probability: 0.550

Patient 5:
  Actual: Readmission
  Predicted: No Readmission
  Readmission Probability: 0.380



In [18]:
from sklearn.metrics import confusion_matrix
import numpy as np

# Build the confusion matrix with the label order pinned to [0, 1]. The
# printout below hard-codes a 2x2 No/Yes layout, and without `labels` the
# matrix shape and order are inferred from whichever labels occur in the data.
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
print("Confusion Matrix:")
print("                 Predicted")
print("               No    Yes")
# Rows are the actual class, columns the predicted class.
print(f"Actual No   {cm[0,0]:5d}  {cm[0,1]:5d}")
print(f"Actual Yes  {cm[1,0]:5d}  {cm[1,1]:5d}")

Confusion Matrix:
                 Predicted
               No    Yes
Actual No    1832    826
Actual Yes   1162   1180


In [19]:
import pickle

# Persist the fitted estimator to disk with pickle.
# NOTE: only `model` is written here; any preprocessing objects fitted earlier
# in the notebook are not part of this file. Only unpickle files you trust.
model_path = 'hospital_readmission_model.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model, model_file)

print("Model saved as 'hospital_readmission_model.pkl'")
print("You can now use this model anytime without retraining!")

Model saved as 'hospital_readmission_model.pkl'
You can now use this model anytime without retraining!


In [20]:
# Print a bannered, one-screen summary of the project for the portfolio.
banner = "=" * 50
summary_lines = [
    banner,
    "HOSPITAL READMISSION PREDICTION PROJECT",
    banner,
    f"Dataset: {len(df)} patients",
    f"Features: {len(feature_cols)} variables",
    "Model: Random Forest Classifier",
    f"Accuracy: {accuracy*100:.1f}%",
    "Key Finding: Lab procedures and medications are top predictors",
    banner,
]
print("\n".join(summary_lines))

HOSPITAL READMISSION PREDICTION PROJECT
Dataset: 25000 patients
Features: 16 variables
Model: Random Forest Classifier
Accuracy: 60.2%
Key Finding: Lab procedures and medications are top predictors
