In [4]:
import pandas as pd

In [5]:
# Read the prepared appointments dataset into a DataFrame.
csv_path = '/content/healthcare_updated.csv'
df = pd.read_csv(csv_path)

In [6]:
# Report the (rows, columns) dimensions of the loaded frame.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(110526, 17)


In [7]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,PatientId,AppointmentID,Gender,ScheduledDay,AppointmentDay,Age,Neighbourhood,Scholarship,Hipertension,Diabetes,Alcoholism,Handcap,SMS_received,No-show,waitingDays,AppointmentDayOfWeek,IsWeekend
0,29872500000000.0,5642903,F,2016-04-29 18:38:08,2016-04-29,62,JARDIM DA PENHA,0,1,0,0,0,0,0,-1,Friday,0
1,558997800000000.0,5642503,M,2016-04-29 16:08:27,2016-04-29,56,JARDIM DA PENHA,0,0,0,0,0,0,0,-1,Friday,0
2,4262962000000.0,5642549,F,2016-04-29 16:19:04,2016-04-29,62,MATA DA PRAIA,0,0,0,0,0,0,0,-1,Friday,0
3,867951200000.0,5642828,F,2016-04-29 17:29:31,2016-04-29,8,PONTAL DE CAMBURI,0,0,0,0,0,0,0,-1,Friday,0
4,8841186000000.0,5642494,F,2016-04-29 16:07:23,2016-04-29,56,JARDIM DA PENHA,0,1,1,0,0,0,0,-1,Friday,0


### Data Preparation For Modelling

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 1. Drop the identifier and raw date columns; they are not used as features
df_model = df.drop(['PatientId','AppointmentID', 'ScheduledDay', 'AppointmentDay'], axis=1)

# 2. Encode categorical variables
# Each column gets its own LabelEncoder, kept in `encoders`, so the integer
# codes can be mapped back with encoders[col].inverse_transform(...).
# Re-fitting a single shared encoder would keep only the last column's mapping.
# LabelEncoder assigns codes in sorted class order, so Gender becomes F=0, M=1
# for the F/M values shown in df.head().
# NOTE(review): the codes are arbitrary integers for nominal columns such as
# Neighbourhood; a linear model will treat them as ordered -- consider one-hot
# encoding if that matters for the analysis.
encoders = {}
for col in ['Gender', 'Neighbourhood', 'AppointmentDayOfWeek']:
    encoders[col] = LabelEncoder()
    df_model[col] = encoders[col].fit_transform(df_model[col])
# `le` still ends up as the encoder fitted on AppointmentDayOfWeek.
le = encoders['AppointmentDayOfWeek']
df_model['IsWeekend'] = df_model['IsWeekend'].astype(int)

# 3. Separate features and target
X = df_model.drop('No-show', axis=1)
y = df_model['No-show']

# 4. Train-test split (80% train / 20% test, fixed seed for reproducibility)
# NOTE(review): the split is not stratified although the target is imbalanced;
# passing stratify=y would keep the class ratio equal in both parts, but it
# changes the split and therefore every metric reported further down.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shape of data
print("Training Features:",X_train.shape)
print("Testing Features:",X_test.shape)

Training Features: (88420, 12)
Testing Features: (22106, 12)


### Logistic Regression

### Fix the class imbalance

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Baseline: logistic regression fitted on the original (non-resampled)
# training split. fit() returns the estimator itself, so construction and
# training are chained into one statement.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Hard class predictions for the held-out test rows
y_pred = model.predict(X_test)

# Per-class precision / recall / F1, followed by overall accuracy
print("classification_report:")
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

classification_report:
              precision    recall  f1-score   support

           0       0.80      0.99      0.89     17715
           1       0.33      0.02      0.03      4391

    accuracy                           0.80     22106
   macro avg       0.57      0.50      0.46     22106
weighted avg       0.71      0.80      0.72     22106

Accuracy: 0.797701981362526


In [10]:
! pip install imbalanced-learn



In [11]:
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# Oversample using the training split only; X_test / y_test are left untouched
# so the evaluation below still runs on the original test rows.
# NOTE(review): SMOTE creates synthetic rows by interpolating between
# neighbours, and the features here include label-encoded categorical codes
# (Gender, Neighbourhood, AppointmentDayOfWeek) -- synthetic rows may carry
# in-between codes. imbalanced-learn's SMOTENC is designed for mixed
# categorical/numeric data; confirm whether it is the better fit.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Logistic regression with max_iter=1000 and a fixed seed, trained on the
# resampled data (fit() returns the estimator, so the calls are chained)
model_smote = LogisticRegression(max_iter=1000, random_state=42).fit(
    X_train_resampled, y_train_resampled
)

# Class predictions for the original test features
y_pred_smote = model_smote.predict(X_test)

# Report per-class metrics and overall accuracy
print(
    "Classification Report  after SMOTE:\n",
    classification_report(y_true=y_test, y_pred=y_pred_smote),
)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_smote))


Classification Report  after SMOTE:
               precision    recall  f1-score   support

           0       0.83      0.59      0.69     17715
           1       0.24      0.52      0.33      4391

    accuracy                           0.58     22106
   macro avg       0.54      0.56      0.51     22106
weighted avg       0.71      0.58      0.62     22106

Accuracy: 0.5785307156428119


### Random Forest Model

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Random forest with library-default hyperparameters and a fixed seed, fitted
# on the original (non-resampled) training split.
rf_model = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Class predictions for the held-out test rows
y_pred_rf = rf_model.predict(X_test)

# Per-class metrics, then overall accuracy
print(
    "Classification Report for Random Forest:\n",
    classification_report(y_true=y_test, y_pred=y_pred_rf),
)
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred_rf))

Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       0.83      0.92      0.87     17715
           1       0.41      0.21      0.28      4391

    accuracy                           0.78     22106
   macro avg       0.62      0.57      0.58     22106
weighted avg       0.74      0.78      0.76     22106

Accuracy: 0.7833619831719895


In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Hand-picked hyperparameters (no search is run in this cell).
# NOTE(review): this cell rebinds `rf_model` and `y_pred`, replacing the
# default-parameter forest and the baseline logistic-regression predictions
# created in earlier cells; distinct names would keep all results available
# for comparison.
# NOTE(review): the model is fitted on the SMOTE-resampled training data, so
# the change in metrics versus the previous forest reflects both the
# resampling and the new hyperparameters.
rf_model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=5,
).fit(X_train_resampled, y_train_resampled)

# Class predictions for the original test features
y_pred = rf_model.predict(X_test)

# Per-class metrics, then overall accuracy
print("Classification Report for Tuned Random Forest:")
print(classification_report(y_true=y_test, y_pred=y_pred))
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))


Classification Report for Tuned Random Forest:
              precision    recall  f1-score   support

           0       0.87      0.70      0.77     17715
           1       0.32      0.56      0.41      4391

    accuracy                           0.67     22106
   macro avg       0.59      0.63      0.59     22106
weighted avg       0.76      0.67      0.70     22106

Accuracy: 0.6733013661449381
