In [19]:
import pandas as pd
import joblib
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


In [20]:
# Read the NPI activity workbook into the working DataFrame.
# The path is relative, so the file is looked up in the current working directory.
df = pd.read_excel(io="dummy_npi_data.xlsx")

In [21]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,NPI,State,Login Time,Logout Time,Usage Time (mins),Region,Speciality,Count of Survey Attempts
0,1000000000,NY,2025-03-08 06:09:00,2025-03-08 06:28:00,19,Northeast,Cardiology,3
1,1000000001,MI,2025-03-08 12:28:00,2025-03-08 13:10:00,42,Midwest,Oncology,5
2,1000000002,CA,2025-03-08 15:11:00,2025-03-08 15:37:00,26,West,Oncology,8
3,1000000003,TX,2025-03-08 14:17:00,2025-03-08 15:36:00,79,Northeast,Orthopedics,9
4,1000000004,GA,2025-03-08 15:59:00,2025-03-08 17:37:00,98,West,Oncology,0


In [22]:
# Parse both session timestamp columns to datetime.
# errors='coerce' turns values that cannot be parsed into NaT instead of raising.
for ts_col in ('Login Time', 'Logout Time'):
    df[ts_col] = pd.to_datetime(df[ts_col], errors='coerce')

In [23]:

# Derive the model features from the parsed session timestamps.
login_ts = df['Login Time']
session_length = df['Logout Time'] - login_ts

# Hour of day (0-23) at which the session started.
df['Login Hour'] = login_ts.dt.hour
# Session duration expressed in hours (seconds / 3600).
df['Active Hours'] = session_length.dt.total_seconds() / 3600



In [24]:
# Impute missing values in numeric columns with that column's median.
# Columns that have no entry in `numeric_medians` (non-numeric ones) are left as they are.
numeric_medians = df.median(numeric_only=True)
df.fillna(value=numeric_medians, inplace=True)


In [25]:
# Integer-encode the categorical feature columns, keeping one fitted encoder per
# column so the same mapping can be persisted and reused later.
label_encoders = {col: LabelEncoder() for col in ['Speciality', 'Region']}
for col, encoder in label_encoders.items():
    # Cast to str first so every value (including missing ones) is encoded as a string label.
    df[col] = encoder.fit_transform(df[col].astype(str))

In [26]:

# Rescale the numeric features with min-max scaling; the fitted scaler is kept
# in `scaler` so it can be saved alongside the model.
# NOTE(review): the scaler is fit on the full frame, before the train/test split
# performed in a later cell, so test rows influence the scaling range.
scaled_cols = ['Login Hour', 'Active Hours']
scaler = MinMaxScaler()
df[scaled_cols] = scaler.fit_transform(df[scaled_cols])


In [27]:
# Build the binary target column 'Attended Survey' (1 = attended, 0 = not).
# Prefer an explicit 'Survey Completed' flag when the dataset provides one;
# otherwise fall back to treating more than 2 survey attempts as attendance.
if 'Survey Completed' in df.columns:
    attended = df['Survey Completed'].eq("Yes")
else:
    attended = df['Count of Survey Attempts'].gt(2)
df['Attended Survey'] = attended.astype('int64')


In [28]:
# Assemble the feature matrix and target vector, then hold out 20% of the rows
# for evaluation (fixed seed so the split is reproducible).
feature_columns = ['Login Hour', 'Active Hours', 'Speciality', 'Region']
X = df[feature_columns]
y = df['Attended Survey']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [29]:
# Fit a 100-tree random forest on the training split (seeded for reproducibility).
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X=X_train, y=y_train)

In [30]:

# Score the fitted model on the held-out split and report accuracy as a percentage.
y_pred = model.predict(X_test)
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print(f"Model Accuracy: {accuracy_pct:.2f}%")


Model Accuracy: 63.00%


In [31]:

# (Disabled) Save model & encoders -- superseded by the XGBoost cell, which writes the same three files.
# joblib.dump(model, "doctor_prediction_model.pkl")
# joblib.dump(scaler, "scaler.pkl")
# joblib.dump(label_encoders, "label_encoders.pkl")

In [32]:
from xgboost import XGBClassifier

# Train an XGBoost classifier on the same train/test split used for the random forest.
# `joblib` and `accuracy_score` are already imported in the notebook's first cell,
# so they are not re-imported here.
# `use_label_encoder` is intentionally not passed: XGBoost does not use it and only
# reports it as an unused parameter, so passing it has no effect on training.
model = XGBClassifier(n_estimators=100, random_state=42, eval_metric='logloss')
model.fit(X_train, y_train)

# Evaluate the model on the held-out split and report accuracy as a percentage.
y_pred = model.predict(X_test)
print(f"Model Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

# Persist the fitted model together with the preprocessing objects (scaler and
# per-column label encoders) so the same transformations can be reapplied later.
joblib.dump(model, "doctor_prediction_model.pkl")
joblib.dump(scaler, "scaler.pkl")
joblib.dump(label_encoders, "label_encoders.pkl")


Model Accuracy: 65.50%


Parameters: { "use_label_encoder" } are not used.



['label_encoders.pkl']