In [1]:
import os
import re
import time

import numpy as np
import pandas as pd
from joblib import dump, load
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("placeholder.csv")

In [3]:
# Column names used later in the notebook to subset the feature frame
# (X[selected_features]) before it is passed to each model's predict().
# NOTE(review): presumably this must match the feature set and order the saved
# models were trained with -- confirm against the training notebook.
selected_features = ['Total Time','General Physician','Specialization Count','City','Pediatrician','Urologist','Neurologist',
                     'Total_Reviews', 'No_of_qualifications', 'Doctor Qualification', 'Experience_Years', 'Experience_Group',
                     'Patient_Satisfaction_Rate', 'Wait_Time', 'Titles', 'Doctors Link', 'Avg_time_per_Patient',
                     'Hospital Address', 'Gastroenterologist']

In [4]:
# Specializations that keep their own one-hot column; the encoding section
# rewrites every other 'Specialization' value to 'Others' before get_dummies.
top_15_specializations = ['General Physician','Gynecologist','Pediatrician','Orthopedic Surgeon','Dermatologist','Gastroenterologist','Pulmonologist',
                          'Neuro Surgeon','Andrologist','Urologist','Neurologist','Nephrologist','Ent Specialist','Eye Surgeon','Ophthalmologist']

In [None]:
# Default values intended for filling missing data, one per column.
# The same literals are repeated in the `replacements` dict of the
# "Handling nulls" section; these names are not referenced by any other
# code visible in this notebook (Specialization_replace has no counterpart
# in that dict).
# NOTE(review): the numeric defaults look like training-set means -- confirm.
city_replace = 'Lahore'
qualification_replace = 'MBBS'
Specialization_replace = 'General Physician'
Hospital_Address_replace = 'No Address Available'
Doctors_Link_replace = 'No Link Available'
Titles_replace = 'Dr'
Region_replace = 'Punjab Region'
Fee_Category_replace = 'Medium-Priced'
Experience_replace = 11.7
Total_Reviews_replace = 91.6
Patient_Satisfaction_Rate_replace = 96.6
Avg_time_per_Patient_replace = 14.09
Wait_Time_replace = 11.26
Specialization_Count_replace = 1
No_of_qualifications_replace = 2
Total_Time_replace = 25.351875

In [5]:
len(top_15_specializations)

15

# Handling nulls

In [38]:
df.columns

Index(['City', 'Doctor Qualification', 'Experience_Years', 'Total_Reviews',
       'Patient_Satisfaction_Rate', 'Avg_time_per_Patient', 'Wait_Time',
       'Hospital Address', 'Doctors Link', 'Fee Category', 'Titles', 'Region',
       'Specialization Count', 'No_of_qualifications', 'Experience_Group',
       'Total Time', 'Andrologist', 'Dermatologist', 'Ent Specialist',
       'Eye Surgeon', 'Gastroenterologist', 'General Physician',
       'Gynecologist', 'Nephrologist', 'Neuro Surgeon', 'Neurologist',
       'Ophthalmologist', 'Orthopedic Surgeon', 'Others', 'Pediatrician',
       'Pulmonologist', 'Urologist'],
      dtype='object')

In [37]:
def fillna_with_replacements(df, replacements):
    """Fill missing values column by column using fixed default values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update. Its columns are reassigned, so the caller's object
        is modified.
    replacements : dict
        Maps a column name to the value that replaces NaN/None in that column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.

    Raises
    ------
    KeyError
        If a key of ``replacements`` is not a column of ``df``.
    """
    for column, replace_value in replacements.items():
        # Assign the filled column back instead of calling
        # `df[column].fillna(..., inplace=True)`: the in-place call operates on
        # a selected column object, which pandas deprecates (chained
        # assignment) and which leaves `df` untouched under Copy-on-Write.
        df[column] = df[column].fillna(replace_value)
    return df

# Replacements
# Column name -> value used to fill missing entries in that column.
# Every key must already be a column of `df` when the call below runs,
# otherwise fillna_with_replacements raises KeyError.
# NOTE(review): the execution counts of this section (In [37]/In [38]) show it
# was last run after the encoding cells, although it sits near the top of the
# notebook -- confirm the intended position before relying on Run All.
replacements = {
    'City': 'Lahore',
    'Doctor Qualification': 'MBBS',
    'Experience_Years': 11.7,
    'Total_Reviews': 91.6,
    'Patient_Satisfaction_Rate': 96.6,
    'Avg_time_per_Patient': 14.09,
    'Wait_Time': 11.26,
    'Hospital Address': 'No Address Available',
    'Doctors Link': 'No Link Available',
    'Fee Category': 'Medium-Priced',
    'Titles': 'Dr',
    'Region': 'Punjab Region',
    'Specialization Count': 1,
    'No_of_qualifications': 2,
    'Total Time': 25.351875
}

# Fills the columns listed above and rebinds `df` to the returned frame.
df = fillna_with_replacements(df, replacements)

# Loading

In [6]:
#load all encoders
# Folder that holds the fitted artifacts. The default is the original
# location; set the MODELS_DIR environment variable to run the notebook on
# another machine without editing every path.
MODELS_DIR = os.environ.get('MODELS_DIR', 'C:\\Users\\noura\\Downloads\\final ms2 ml\\Models')

# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load artifacts that come from a trusted source.
file_path = os.path.join(MODELS_DIR, 'fee_encoder.joblib')
fee_encoder = load(file_path)

file_path = os.path.join(MODELS_DIR, 'qualification_encoder.joblib')
qualification_encoder = load(file_path)

file_path = os.path.join(MODELS_DIR, 'Experience_Group_encoder.joblib')
Experience_Group_encoder = load(file_path)

file_path = os.path.join(MODELS_DIR, 'titles_encoder.joblib')
titles_encoder = load(file_path)

file_path = os.path.join(MODELS_DIR, 'city_encoder.joblib')
city_encoder = load(file_path)

file_path = os.path.join(MODELS_DIR, 'Region_encoder.joblib')
Region_encoder = load(file_path)

In [7]:
#load models
# Folder that holds the trained models. Defined in this cell so it can run on
# its own; the default is the original location and the MODELS_DIR
# environment variable overrides it.
MODELS_DIR = os.environ.get('MODELS_DIR', 'C:\\Users\\noura\\Downloads\\final ms2 ml\\Models')

# NOTE: joblib.load unpickles the file, which can execute arbitrary code --
# only load models that come from a trusted source.
file_path = os.path.join(MODELS_DIR, 'LightGBM_model.joblib')
LightGBM_model = load(file_path)

file_path = os.path.join(MODELS_DIR, 'XGBoost_model.joblib')
XGBoost_model = load(file_path)

file_path = os.path.join(MODELS_DIR, 'Random Forest_model.joblib')
Random_Forest_model = load(file_path)

# Defining Functions

In [8]:
#0
def rename_cols(df):
    """Rename the raw, unit-suffixed column headers to the short names used downstream.

    The frame is modified in place and nothing is returned. Source columns
    that are absent from ``df`` are silently skipped.
    """
    short_names = {
        'Fee(PKR)': 'Fee',
        'Patient Satisfaction Rate(%age)': 'Patient_Satisfaction_Rate',
        'Experience(Years)': 'Experience_Years',
        'Avg Time to Patients(mins)': 'Avg_time_per_Patient',
        'Wait Time(mins)': 'Wait_Time',
    }
    df.rename(columns=short_names, inplace=True)

# # Sample call
rename_cols(df)

In [9]:
#1
def extract_titles_and_clean_name(name):
    """Split a doctor's display name into a title label and the bare name.

    Every title matched by the pattern is removed from ``name``. The matched
    titles are joined with ', ' and stripped of periods; if the resulting
    label is not one of the accepted labels it is reported as 'others'
    (this includes names that carry no title at all).

    Returns a ``(title_label, cleaned_name)`` tuple.
    """
    title_regex = r'(Dr\.|Prof\.|Mr\.|Ms\.|Colonel|Assoc\. Prof\. Dr\.|Asst\. Prof\. Dr\.|Prof\. Dr\.)'
    accepted_titles = ("Dr", "Asst Prof Dr", "Prof, Dr", "Assoc Prof Dr")

    title_pattern = re.compile(title_regex)
    found = title_pattern.findall(name)

    # Name with every matched title cut out, outer whitespace trimmed.
    cleaned_name = title_pattern.sub('', name).strip()

    # e.g. ['Prof.', 'Dr.'] -> 'Prof, Dr'
    label = ', '.join(found).replace('.', '').strip()
    title_str = label if label in accepted_titles else 'others'

    return title_str, cleaned_name

# # Sample call
df[['Titles', 'Doctor Name']] = df['Doctor Name'].apply(lambda x: pd.Series(extract_titles_and_clean_name(x)))

In [10]:
#2
def assign_region_clean_city(df):
    """Normalise the 'City' column and derive a 'Region' column from it.

    'City' has hyphens replaced by spaces and is title-cased. 'Region' is then
    looked up from the normalised city name; cities that are not listed below
    get NaN. The frame is modified in place and also returned.
    """
    cities_by_region = {
        'Punjab Region': [
            'Lahore', 'Islamabad', 'Multan', 'Sahiwal', 'Okara', 'Faisalabad', 'Sargodha',
            'Gujranwala', 'Rawalakot', 'Gujrat', 'Sialkot', 'Sheikhupura', 'Kasur', 'Narowal',
            'Jhang', 'Khanewal', 'Toba Tek Singh', 'Chiniot', 'Pakpattan', 'Burewala', 'Vehari',
            'Rahim Yar Khan', 'Bahawalpur', 'Bahawalnagar', 'Lodhran', 'Layyah', 'Mianwali',
            'Muzaffar Garh', 'Dera Ghazi Khan', 'Bhakkar', 'Khushab', 'Mian Channu', 'Chichawatni',
            'Gojra', 'Shorkot', 'Samundri', 'Tando Muhammad Khan', 'Talagang', 'Kamoke', 'Shahkot',
            'Dinga', 'Bhalwal', 'Chakwal', 'Kharian', 'Daska', 'Hafizabad', 'Sadiqabad', 'Nankana Sahib',
            'Pattoki', 'Alipur', "Rajan Pur", "Jhelum", "Attock", "Lalamusa", "Wah Cantt",
            "Dunyapur", "Khanpur", "Kot Addu", "Mandi Bahauddin", "Renala Khurd",
            "Taxila", "Jauharabad", "Gujar Khan", "Wazirabad", "Pasrur",
            "Muridke", "Chishtian", "Kabirwala", "Jaranwala", "Dijkot",
        ],
        'Sindh Region': [
            'Karachi', 'Hyderabad', 'Mirpur Khas', 'Sukkur', 'Nawabshah', 'Larkana', 'Jacobabad',
            'Khairpur', 'Thatta', 'Jamshoro', 'Ghotki', 'Shikarpur', 'Badin', 'Dadu', 'Khairpur Nathan Shah',
            'Moro', 'Hala', "Kandiaro", "Umarkot", "Kashmor", "Mithi", "Matiari", "Shahdadpur", "Baden",
        ],
        'KPK Region': [
            'Peshawar', 'Abbottabad', 'Nowshera', 'Swabi', 'Mardan', 'Mansehra', 'Haripur', 'Bannu',
            'Kohat', 'Dera Ismail Khan', 'Mingora', 'Charsadda', 'Timergara', 'Buner', 'Chitral', 'Dargai',
            "Hangu", "Swat", "Malakand", "Bajaur Agency",
        ],
        'Balochistan Region': [
            'Quetta', 'Turbat', 'Chaman', 'Khuzdar', 'Gwadar', 'Loralai', 'Zhob', 'Sibi', 'Nushki',
            'Barkhan', 'Mastung', 'Duki',
        ],
        'International Region': [
            "Istanbul", "Riyadh", "Izmir",
        ],
        'Kashmir Region': [
            'Gilgit', 'Kotli', 'Mirpur', 'Skardu',
        ],
    }

    # Invert the grouping into a flat city -> region lookup.
    city_regions = {}
    for region_name, region_cities in cities_by_region.items():
        for city_name in region_cities:
            city_regions[city_name] = region_name

    # 'dera-ismail-khan' -> 'Dera Ismail Khan'
    df['City'] = df['City'].str.replace('-', ' ').str.title()

    # Unlisted cities are left as NaN by map().
    df['Region'] = df['City'].map(city_regions)

    return df
# # Sample call
df = assign_region_clean_city(df)

In [11]:
#3
def all_specialization_preprocessing(df):
    """Normalise the 'Specialization' column in place (returns None).

    Step 1: treat '/' as ',', trim each item, drop repeated items while
    keeping first-seen order, and re-join with ',' (no spaces).
    Step 2: if any key of the mapping below occurs as a substring of the
    normalised value, replace the whole value with the mapped label; the
    first matching key in dict order wins.
    """
    specialization_mapping = {
        "Pediatrician,Pediatric": "Pediatrician",
        "Lung Specialist": "Pulmonologist",
        "Eye Surgeon,Eye Specialist": "Ophthalmologist",
        "Sexologist": "Andrologist",
        "Cosmetic Surgeon,Dermatologist": "Cosmetic Dermatologist",
        "Internal Medicine Specialist,General Physician,Infectious Diseases": "Infectious Disease Specialist",
        # The two keys below contain ', ' (comma + space); step 1 never produces
        # that sequence, so they cannot match a normalised value.
        'Dermatologist, Dermatologist, Allergy Specialist': 'Dermatologist, Allergy Specialist',
        'Plastic Surgeon, Cosmetic Surgeon, Plastic Surgeon, Dermatologist': "Cosmetic Dermatologist",
    }

    def process_specialization(entry):
        # dict.fromkeys keeps the first occurrence of each item, in order.
        items = (piece.strip() for piece in entry.replace('/', ',').split(','))
        return ','.join(dict.fromkeys(items))

    def map_specialization(specialization):
        matches = (label for key, label in specialization_mapping.items()
                   if key in specialization)
        return next(matches, specialization)

    cleaned = df['Specialization'].apply(process_specialization)
    df['Specialization'] = cleaned
    df['Specialization'] = df['Specialization'].apply(map_specialization)

# # Sample call
all_specialization_preprocessing(df)

In [12]:
#4
def calc_Specialization_count(df):
    """Add 'Specialization Count': number of comma-separated items in 'Specialization'.

    Modifies ``df`` in place and returns None.
    """
    comma_total = df['Specialization'].str.count(',')
    df['Specialization Count'] = comma_total.add(1)

# # Sample call
calc_Specialization_count(df)

In [13]:
#5
def clean_qualifications(df):
    """Normalise the free-text 'Doctor Qualification' column.

    Steps, in order:
      1. apply the regex replacement table below (abbreviation spelling,
         typo fixes, splitting of glued-together degrees, removal of every
         character that is not a letter or a comma);
      2. apply a second table for concatenations that only exist after step 1;
      3. remove all whitespace;
      4. decode '&amp;', abbreviate one long diploma name, then sort the
         comma-separated parts and drop duplicates.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string column 'Doctor Qualification'; the column is
        overwritten.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.

    NOTE(review): step 3 previously called ``.str.replace(r'\\s+', '')``
    without ``regex=True``; on pandas >= 2.0 that is a literal match and
    removes nothing. If the saved qualification encoder was fitted on output
    produced that way, re-check that its categories still line up.
    """
    # Combine and update all replacements into a single dictionary
    replacements = {
        r'\bPhD\b': 'PHD', r'\bM\.D\.\b': 'MD', r'\bD\.M\.S\b': 'DMS',
        r'\bB\.Sc\.\b': 'BSC', r'\bM\.S\.\b': 'MS', r'\bM\.Phil\b': 'MPHIL',
        r'\bG\.A\.M\.S\b': 'GAMS', r'\(D\.H\.B\)': 'DHB', r'\(D\.Ac\)': 'PHD',
        r'Ophtamology': 'Ophthalmology', r'Gastroentrology': 'Gastroenterology',
        r'OtoRhinoLaryngology': 'Otorhinolaryngology', r'Paediatrics': 'Pediatrics',
        r'Pulmonology': 'Pulmonary', r'ENT': 'Otolaryngology', r'OrthopedicSurgery': 'Orthopedic Surgery',
        r'NeuroSurgery': 'Neurosurgery', r'Medicine': 'Internal Medicine',
        r'OBSTETRICS&GYNAECOLOGY': 'Obstetrics&Gynecology', r'Gynecology&amp;Obstetrics': 'Gynecology and Obstetrics',
        r'Genecology&amp;Obstetrics': 'Gynecology and Obstetrics', r'OtorhinolaryngologicENT': 'Otorhinolaryngologic,ENT',
        r'MasterOfSurgery': 'Master of Surgery', r'MD\d*': 'MD', r'MDGastroenterology': 'MD,Gastroenterology',
        r'FCPSPediatrics': 'FCPS,Pediatrics', r'MBBSMD': 'MBBS,MD', r'FRCSOrthopedics': 'FRCS,Orthopedics',
        r'MCPSGynae/Obs': 'MCPS(Gynecology/Obs)', r'MD-RMP': 'MD, RMP', r'Masters\(NeuroSurgeon\)': 'Masters, Neurosurgery',
        r'\(|\)': '', r'[^a-zA-Z,]': '', r'Ophthalmologist': 'Ophthalmology', r'GASTROENTEROLOGY': 'Gastroenterology',
        r'MCPS,': 'MCPS', r'M\.D': 'MD', 'MD 1': 'MD'
    }

    # Apply all replacements
    df['Doctor Qualification'] = df['Doctor Qualification'].replace(replacements, regex=True)

    # Additional replacements to handle specific concatenations
    concatenations = {
        r'FCPSOBSTETRICSampGYNAECOLOGY': 'FCPS,Obstetrics&Gynecology',
        r'FCPSOtolaryngology': 'FCPS,Otolaryngology',
        r'MCPSFCPS': 'MCPS,FCPS'
    }
    df['Doctor Qualification'] = df['Doctor Qualification'].replace(concatenations, regex=True)

    # Remove all whitespace. regex=True is required: without it pandas >= 2.0
    # searches for the literal text '\s+' and the call does nothing.
    df['Doctor Qualification'] = df['Doctor Qualification'].str.replace(r'\s+', '', regex=True)
    # Kept for safety; after the line above no whitespace surrounds commas.
    df['Doctor Qualification'] = df['Doctor Qualification'].str.replace(r'\s*,\s*', ',', regex=True)

    # Enhanced cleaning function
    def enhance_cleaning(qualification):
        # Replace HTML entities and correct specific cases.
        # (A former re.sub here replaced every upper-case run with itself and
        # therefore changed nothing; it has been removed.)
        qualification = qualification.replace('&amp;', '&')
        qualification = qualification.replace('DiplomainTBandChestDiseases', 'DTBCD')

        # Split, sort, and remove duplicates
        parts = sorted(set(qualification.split(',')))
        return ','.join(parts)

    # Apply the enhanced cleaning function
    df['Doctor Qualification'] = df['Doctor Qualification'].apply(enhance_cleaning)

    return df
# # Sample Call
df = clean_qualifications(df)

In [14]:
#6
def calc_No_of_qualifications(df):
    """Add 'No_of_qualifications': number of comma-separated parts in 'Doctor Qualification'.

    Modifies ``df`` in place and returns None. Every value must be a string.
    """
    part_counts = [len(text.split(",")) for text in df['Doctor Qualification']]
    df['No_of_qualifications'] = part_counts

# # Sample Call
calc_No_of_qualifications(df)

In [15]:
#7
def binning_Experience(df):
    """Round 'Experience_Years' and add the categorical 'Experience_Group'.

    Groups (years, right edge inclusive):
        Novice 0-5, Beginner 6-10, Competent 11-15, Proficient 16-20,
        Expert 21-25, Master above 25.

    Modifies ``df`` in place and returns None. Negative or missing experience
    yields NaN in 'Experience_Group'.
    """
    df['Experience_Years'] = df['Experience_Years'].round()
    bins = [0, 5, 10, 15, 20, 25, float('inf')]
    labels = ['Novice', 'Beginner', 'Competent', 'Proficient', 'Expert', 'Master']
    # include_lowest=True closes the first interval on the left; without it the
    # bins are (0, 5], (5, 10], ... and exactly 0 years falls outside every bin.
    df['Experience_Group'] = pd.cut(
        df['Experience_Years'], bins=bins, labels=labels, right=True, include_lowest=True
    )

# # Sample Call
binning_Experience(df)

In [16]:
#8
def calc_Total_time(df):
    """Add 'Total Time' as 'Avg_time_per_Patient' plus 'Wait_Time'.

    Modifies ``df`` in place and returns None.
    """
    consultation = df['Avg_time_per_Patient']
    waiting = df['Wait_Time']
    df['Total Time'] = consultation.add(waiting)

# # Sample Call
calc_Total_time(df)

In [17]:
#9
cols_minmax = ['Experience_Years','Total_Reviews','Total Time']
def apply_log_transform(df, columns):
    """Overwrite each listed column with log(1 + x) of its values.

    Modifies ``df`` in place and returns None. Columns not listed are left
    untouched.
    """
    for name in columns:
        # Series.pipe(f) is f(series), i.e. np.log1p applied to the whole column.
        df[name] = df[name].pipe(np.log1p)

# columns_to_transform = ['Experience_Years', 'Patient_Satisfaction_Rate',
#                        'Avg_time_per_Patient', 'Wait_Time', 'Total Time','Total_Reviews']

# # Sample Call
apply_log_transform(df, cols_minmax)

In [None]:
def process_data(column_name, column_data, df):
    """Replace values unseen in a reference column with that column's mode.

    Parameters
    ----------
    column_name : str
        Column of ``df`` that defines the known values.
    column_data : list, numpy.ndarray or pandas.Series
        Values to check. Modified in place: every element that does not occur
        in ``df[column_name]`` (missing values included) is overwritten with
        the most frequent value of ``df[column_name]``.
    df : pandas.DataFrame
        Reference frame.

    Returns
    -------
    None

    Notes
    -----
    Prints the column name and the incoming data before replacing anything.
    Fails if ``df[column_name]`` has no non-missing value, because no mode
    exists in that case.
    """
    print(f"Processing column '{column_name}':")
    print("Original column data:")
    print(column_data)

    # Calculate mode of the column
    mode_value = df[column_name].mode()[0]

    # Build the lookup once instead of scanning the whole column per element.
    known_values = set(df[column_name].dropna().unique())

    # Index a Series by position: plain `column_data[i]` is label-based and
    # fails (or hits the wrong row) when the index is not 0..n-1.
    is_series = isinstance(column_data, pd.Series)

    for i in range(len(column_data)):
        value = column_data.iloc[i] if is_series else column_data[i]
        # Missing values never count as known, matching the original `in` test.
        if pd.isna(value) or value not in known_values:
            if is_series:
                column_data.iloc[i] = mode_value
            else:
                column_data[i] = mode_value

# Encoding

In [19]:
df['Fee Category'] = fee_encoder.transform(df[['Fee Category']])

In [20]:
# Collapse every specialization outside the top 15 into a single 'Others' bucket.
df['Specialization'] = df['Specialization'].apply(lambda x: x if x in top_15_specializations else 'Others')

# get_dummies only emits columns for categories present in this frame, so a
# dataset lacking one of the specializations would be missing a column that
# selected_features asks for later. Reindex to the full fixed category list
# (alphabetical, which is also how get_dummies orders its columns) and fill
# the absent categories with 0.
specialization_columns = sorted(set(top_15_specializations) | {'Others'})
onehot_encoded = (
    pd.get_dummies(df['Specialization'])
    .reindex(columns=specialization_columns, fill_value=0)
    .astype(int)
)

# Drop dummy columns left by a previous run of this cell so that re-running it
# does not produce duplicate column names.
df = pd.concat([df.drop(columns=specialization_columns, errors='ignore'), onehot_encoded], axis=1)

In [21]:
df['Doctor Qualification'] = qualification_encoder.transform(df['Doctor Qualification'])

In [22]:
df.columns

Index(['Doctor Name', 'City', 'Specialization', 'Doctor Qualification',
       'Experience_Years', 'Total_Reviews', 'Patient_Satisfaction_Rate',
       'Avg_time_per_Patient', 'Wait_Time', 'Hospital Address', 'Doctors Link',
       'Fee Category', 'Titles', 'Region', 'Specialization Count',
       'No_of_qualifications', 'Experience_Group', 'Total Time', 'Andrologist',
       'Dermatologist', 'Ent Specialist', 'Eye Surgeon', 'Gastroenterologist',
       'General Physician', 'Gynecologist', 'Nephrologist', 'Neuro Surgeon',
       'Neurologist', 'Ophthalmologist', 'Orthopedic Surgeon', 'Others',
       'Pediatrician', 'Pulmonologist', 'Urologist'],
      dtype='object')

In [23]:
df['Experience_Group'] = Experience_Group_encoder.transform(df['Experience_Group'])

In [24]:
df['Titles'] = titles_encoder.transform(df['Titles'])

In [25]:
df['Hospital Address'] = df['Hospital Address'].apply(lambda x: 0 if x == 'No Address Available' else 1)

In [26]:
df['Doctors Link'] = df['Doctors Link'].apply(lambda x: 0 if x == 'No Link Available' else 1)

In [27]:
df['City'] = city_encoder.transform(df['City'])

In [28]:
df['Region'] = Region_encoder.transform(df['Region'])

In [29]:
df.drop(columns=["Doctor Name","Specialization"], inplace=True)

# Splitting and testing

In [30]:
X = df.drop(columns="Fee Category")
y = df["Fee Category"]

In [31]:
X

Unnamed: 0,City,Doctor Qualification,Experience_Years,Total_Reviews,Patient_Satisfaction_Rate,Avg_time_per_Patient,Wait_Time,Hospital Address,Doctors Link,Titles,...,Gynecologist,Nephrologist,Neuro Surgeon,Neurologist,Ophthalmologist,Orthopedic Surgeon,Others,Pediatrician,Pulmonologist,Urologist
0,1.155479,1.062318,1.945910,2.484907,100,19,6,1,1,1.002109,...,0,0,0,0,0,0,0,0,0,0
1,0.903322,0.396594,0.693147,0.000000,94,14,11,0,0,1.002109,...,0,0,0,0,0,0,0,0,0,0
2,1.057468,1.192427,1.945910,2.302585,100,10,0,1,1,1.002109,...,0,0,0,0,0,0,0,0,0,0
3,1.168368,0.932210,2.484907,4.276666,96,18,10,1,1,1.002109,...,1,0,0,0,0,0,0,0,0,0
4,1.152318,1.192427,2.564949,5.298317,100,16,2,1,1,1.002109,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2381,1.152318,1.192427,2.397895,2.484907,100,14,11,1,1,1.002109,...,0,0,0,0,0,0,1,0,0,0
2382,1.132140,1.168242,2.397895,2.079442,100,14,11,1,1,1.002109,...,0,0,0,0,0,0,0,1,0,0
2383,0.741365,1.203328,2.397895,4.644391,95,15,12,1,1,1.002109,...,0,0,0,0,0,0,0,0,0,0
2384,1.211507,1.185591,2.564949,3.178054,100,14,12,1,1,1.457490,...,0,1,0,0,0,0,0,0,0,0


In [32]:
y_test = pd.DataFrame(y, columns=['Fee Category'])

In [33]:
y_test

Unnamed: 0,Fee Category
0,1.0
1,0.0
2,2.0
3,0.0
4,2.0
...,...
2381,2.0
2382,2.0
2383,2.0
2384,2.0


In [34]:
X_test_selected = X[selected_features]

In [35]:
# Predict using all models
# Score every loaded model on the same feature matrix in one loop instead of
# three copy-pasted stanzas; the printed lines are unchanged.
models_to_score = {
    'LightGBM_model': LightGBM_model,
    'XGBoost_model': XGBoost_model,
    'Random_Forest_model': Random_Forest_model,
}

for model_name, model in models_to_score.items():
    y_test_pred = model.predict(X_test_selected)
    # Fraction of rows whose predicted fee category equals the encoded label.
    test_accuracy = accuracy_score(y_test, y_test_pred)
    print(f"{model_name} accuracy: {test_accuracy}")

LightGBM_model accuracy: 0.932523051131601
XGBoost_model accuracy: 0.9430008382229673
Random_Forest_model accuracy: 0.9492875104777871
