# Import necessary Libraries

In [5]:
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler,SMOTE
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder

In [6]:
pd.set_option('display.max_columns', None)

# Load Dataset

In [7]:
df = pd.read_csv('E:/data science & python/mid project mahmoud sayed/patient_dataset.csv')
df

Unnamed: 0,Visit_Date,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Total_Cost,Registration time,Nursing time,Laboratory time,Consultation time,Pharmacy time
0,1/1/2019,4174,28,Male,Type 1 Diabetes,Yes,Downtown Dubai,711.975905,21.6,21.1,29.6,25.8,28.2
1,1/1/2019,4507,62,Female,Type 2 Diabetes,Yes,Downtown Dubai,507.680147,13.0,50.9,34.8,26.3,23.2
2,1/1/2019,1860,47,Male,Prediabetes,Yes,Downtown Dubai,317.582770,27.3,20.9,23.9,35.1,14.8
3,1/1/2019,2294,10,Female,Type 2 Diabetes,Yes,Downtown Dubai,484.387822,19.2,30.8,22.9,20.6,21.9
4,1/1/2019,2130,66,Male,Type 2 Diabetes,Yes,Downtown Dubai,118.649105,13.3,29.1,19.5,34.2,36.6
...,...,...,...,...,...,...,...,...,...,...,...,...,...
504995,9/12/2024,3161,69,Male,,,Downtown Dubai,722.692827,1.3,35.8,20.1,37.2,41.9
504996,8/8/2021,4417,49,Male,,,Jumeirah,1030.457617,20.2,17.9,29.4,23.7,19.1
504997,11/22/2021,1910,53,Male,,,Dubai Marina,232.914917,22.4,46.0,31.3,30.6,34.9
504998,8/26/2020,3759,30,Male,,,Downtown Dubai,251.977863,16.3,15.2,24.8,24.3,40.6


# Data Exploration

In [8]:
# check data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 505000 entries, 0 to 504999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Visit_Date         505000 non-null  object 
 1   Patient_ID         505000 non-null  int64  
 2   Age                505000 non-null  int64  
 3   Gender             494798 non-null  object 
 4   Diagnosis          472288 non-null  object 
 5   Has_Insurance      451050 non-null  object 
 6   Area               505000 non-null  object 
 7   Total_Cost         505000 non-null  float64
 8   Registration time  505000 non-null  float64
 9   Nursing time       505000 non-null  float64
 10  Laboratory time    505000 non-null  float64
 11  Consultation time  505000 non-null  float64
 12  Pharmacy time      505000 non-null  float64
dtypes: float64(6), int64(2), object(5)
memory usage: 50.1+ MB


In [9]:
# Check summary statistics fro numerical
df.describe().round(2)

Unnamed: 0,Patient_ID,Age,Total_Cost,Registration time,Nursing time,Laboratory time,Consultation time,Pharmacy time
count,505000.0,505000.0,505000.0,505000.0,505000.0,505000.0,505000.0,505000.0
mean,2999.23,34.81,431.17,32.02,36.74,36.78,36.7,36.69
std,1154.15,19.26,278.66,58.94,58.24,58.22,58.02,58.07
min,1000.0,0.0,-2039.94,0.0,0.0,0.0,0.0,0.0
25%,2000.0,21.0,241.26,14.3,19.3,19.4,19.3,19.3
50%,2997.0,34.0,386.13,21.8,26.8,26.9,26.8,26.8
75%,3999.0,48.0,576.77,30.0,35.0,35.1,35.0,35.0
max,4999.0,100.0,2853.43,499.0,499.0,499.0,499.0,499.0


In [10]:
# Check summary statistics fro categorical
df.describe(include= 'object')

Unnamed: 0,Visit_Date,Gender,Diagnosis,Has_Insurance,Area
count,505000,494798,472288,451050,505000
unique,2192,3,7,2,9
top,8/31/2024,Male,Type 2 Diabetes,Yes,Downtown Dubai
freq,237,303448,327064,380038,218636


In [11]:
# check missing values
df.isna().sum()

Visit_Date               0
Patient_ID               0
Age                      0
Gender               10202
Diagnosis            32712
Has_Insurance        53950
Area                     0
Total_Cost               0
Registration time        0
Nursing time             0
Laboratory time          0
Consultation time        0
Pharmacy time            0
dtype: int64

In [12]:
# check missing values percentage
(df.isna().mean()*100).round(2)

Visit_Date            0.00
Patient_ID            0.00
Age                   0.00
Gender                2.02
Diagnosis             6.48
Has_Insurance        10.68
Area                  0.00
Total_Cost            0.00
Registration time     0.00
Nursing time          0.00
Laboratory time       0.00
Consultation time     0.00
Pharmacy time         0.00
dtype: float64

In [13]:
# check duplicates
df.duplicated().sum()

np.int64(4243)

In [14]:
df[df.duplicated()]

Unnamed: 0,Visit_Date,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Total_Cost,Registration time,Nursing time,Laboratory time,Consultation time,Pharmacy time
500000,4/1/2020,4284,19,Female,,,Jumeirah,456.054694,29.8,29.7,26.1,25.9,19.7
500001,5/24/2021,4431,92,Female,,,Al Barsha,389.295801,20.9,34.0,27.9,32.8,38.3
500002,9/6/2020,2733,0,Female,,,Al Qusais,578.583086,31.6,33.5,26.2,30.3,41.8
500003,8/4/2020,1970,16,Male,,,Al Barsha,227.433144,27.3,41.0,34.8,27.9,10.5
500004,11/26/2023,2310,31,Male,,,Dubai Silicon Oasis,392.196215,15.2,8.9,19.5,32.6,36.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...
504994,9/25/2024,2452,53,Male,,,Dubai Marina,145.817529,24.4,24.4,36.6,25.6,19.7
504995,9/12/2024,3161,69,Male,,,Downtown Dubai,722.692827,1.3,35.8,20.1,37.2,41.9
504996,8/8/2021,4417,49,Male,,,Jumeirah,1030.457617,20.2,17.9,29.4,23.7,19.1
504997,11/22/2021,1910,53,Male,,,Dubai Marina,232.914917,22.4,46.0,31.3,30.6,34.9


In [15]:
# delete duplicates 
df.drop_duplicates(inplace= True , ignore_index= True)

In [16]:
# check after delete duplicates 
df.duplicated().sum()

np.int64(0)

In [17]:
# check missing values percentage
(df.isna().mean()* 100).round(2)

Visit_Date           0.00
Patient_ID           0.00
Age                  0.00
Gender               2.02
Diagnosis            5.69
Has_Insurance        9.93
Area                 0.00
Total_Cost           0.00
Registration time    0.00
Nursing time         0.00
Laboratory time      0.00
Consultation time    0.00
Pharmacy time        0.00
dtype: float64

# Data Cleaning

In [18]:
df.columns = df.columns.str.strip().str.replace(' ' , '_')
df.columns

Index(['Visit_Date', 'Patient_ID', 'Age', 'Gender', 'Diagnosis',
       'Has_Insurance', 'Area', 'Total_Cost', 'Registration_time',
       'Nursing_time', 'Laboratory_time', 'Consultation_time',
       'Pharmacy_time'],
      dtype='object')

In [19]:
# rename columns
df = df.rename(columns= {'Registration_time': 'Registration_Duration' , 'Nursing_time':'Nursing_Duration','Laboratory_time': 'Laboratory_Duration'
               ,'Consultation_time':'Consultation_Duration','Pharmacy_time':'Pharmacy_Duration'} )


In [20]:
df.columns

Index(['Visit_Date', 'Patient_ID', 'Age', 'Gender', 'Diagnosis',
       'Has_Insurance', 'Area', 'Total_Cost', 'Registration_Duration',
       'Nursing_Duration', 'Laboratory_Duration', 'Consultation_Duration',
       'Pharmacy_Duration'],
      dtype='object')

### In Depth Check For categorical columns 

In [21]:
cat_col =df.select_dtypes(include='O').columns
cat_col

Index(['Visit_Date', 'Gender', 'Diagnosis', 'Has_Insurance', 'Area'], dtype='object')

In [22]:
for col in cat_col:
    print(col)
    print(df[col].unique())
    print(df[col].nunique())
    print('-' * 100)

Visit_Date
['1/1/2019' '1/2/2019' '1/3/2019' ... '12/29/2024' '12/30/2024'
 '12/31/2024']
2192
----------------------------------------------------------------------------------------------------
Gender
['Male' 'Female' nan 'Other']
3
----------------------------------------------------------------------------------------------------
Diagnosis
['Type 1 Diabetes' 'Type 2 Diabetes' 'Prediabetes' 'Type 3 Diabetes'
 'Gestational Diabetes' nan 'Unknown' ' ']
7
----------------------------------------------------------------------------------------------------
Has_Insurance
['Yes' nan 'No']
2
----------------------------------------------------------------------------------------------------
Area
['Downtown Dubai' 'Bur Dubai' 'Al Barsha' 'Jumeirah' 'Deira' 'Al Qusais'
 'Dubai Marina' 'International City' 'Dubai Silicon Oasis']
9
----------------------------------------------------------------------------------------------------


In [23]:
df['Diagnosis'].value_counts()

Diagnosis
Type 2 Diabetes         327064
Type 1 Diabetes          46752
Prediabetes              46466
Gestational Diabetes     23273
                         14003
Type 3 Diabetes          10100
Unknown                   4630
Name: count, dtype: int64

In [24]:
# replace drop_Diagnosis_empty to np.nan

df['Diagnosis'].replace(' ', np.nan, inplace= True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Diagnosis'].replace(' ', np.nan, inplace= True)


In [25]:
df['Diagnosis'].value_counts()

Diagnosis
Type 2 Diabetes         327064
Type 1 Diabetes          46752
Prediabetes              46466
Gestational Diabetes     23273
Type 3 Diabetes          10100
Unknown                   4630
Name: count, dtype: int64

### In Depth Check For numerical columns 

In [26]:
num_col = df.select_dtypes(include= 'number').columns
num_col

Index(['Patient_ID', 'Age', 'Total_Cost', 'Registration_Duration',
       'Nursing_Duration', 'Laboratory_Duration', 'Consultation_Duration',
       'Pharmacy_Duration'],
      dtype='object')

In [27]:
#for col in num_col:
#    px.histogram(data_frame= df , x= col).show()

In [28]:
# delete value 0 in age 
age_0 = df[df.Age == 0].index
df.drop(age_0 , inplace=True )

In [29]:
df.head(5)

Unnamed: 0,Visit_Date,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Total_Cost,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration
0,1/1/2019,4174,28,Male,Type 1 Diabetes,Yes,Downtown Dubai,711.975905,21.6,21.1,29.6,25.8,28.2
1,1/1/2019,4507,62,Female,Type 2 Diabetes,Yes,Downtown Dubai,507.680147,13.0,50.9,34.8,26.3,23.2
2,1/1/2019,1860,47,Male,Prediabetes,Yes,Downtown Dubai,317.58277,27.3,20.9,23.9,35.1,14.8
3,1/1/2019,2294,10,Female,Type 2 Diabetes,Yes,Downtown Dubai,484.387822,19.2,30.8,22.9,20.6,21.9
4,1/1/2019,2130,66,Male,Type 2 Diabetes,Yes,Downtown Dubai,118.649105,13.3,29.1,19.5,34.2,36.6


In [30]:
# delete value negative & zero  in total cost 
total_cost_less0 = df[df.Total_Cost  <= 0].index
df.drop(total_cost_less0 , inplace=True ,)

In [31]:
df.reset_index(drop= True, inplace= True)
df.head(5)

Unnamed: 0,Visit_Date,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Total_Cost,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration
0,1/1/2019,4174,28,Male,Type 1 Diabetes,Yes,Downtown Dubai,711.975905,21.6,21.1,29.6,25.8,28.2
1,1/1/2019,4507,62,Female,Type 2 Diabetes,Yes,Downtown Dubai,507.680147,13.0,50.9,34.8,26.3,23.2
2,1/1/2019,1860,47,Male,Prediabetes,Yes,Downtown Dubai,317.58277,27.3,20.9,23.9,35.1,14.8
3,1/1/2019,2294,10,Female,Type 2 Diabetes,Yes,Downtown Dubai,484.387822,19.2,30.8,22.9,20.6,21.9
4,1/1/2019,2130,66,Male,Type 2 Diabetes,Yes,Downtown Dubai,118.649105,13.3,29.1,19.5,34.2,36.6


In [32]:
df.isna().sum()

Visit_Date                   0
Patient_ID                   0
Age                          0
Gender                    9577
Diagnosis                40560
Has_Insurance            46286
Area                         0
Total_Cost                   0
Registration_Duration        0
Nursing_Duration             0
Laboratory_Duration          0
Consultation_Duration        0
Pharmacy_Duration            0
dtype: int64

In [33]:
# drop missing valu in gender 
df.dropna(subset= ['Gender'], inplace= True , ignore_index= True )

In [34]:
# check gender after dropna

df['Gender'].isna().sum()

np.int64(0)

In [35]:
df.head(5)

Unnamed: 0,Visit_Date,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Total_Cost,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration
0,1/1/2019,4174,28,Male,Type 1 Diabetes,Yes,Downtown Dubai,711.975905,21.6,21.1,29.6,25.8,28.2
1,1/1/2019,4507,62,Female,Type 2 Diabetes,Yes,Downtown Dubai,507.680147,13.0,50.9,34.8,26.3,23.2
2,1/1/2019,1860,47,Male,Prediabetes,Yes,Downtown Dubai,317.58277,27.3,20.9,23.9,35.1,14.8
3,1/1/2019,2294,10,Female,Type 2 Diabetes,Yes,Downtown Dubai,484.387822,19.2,30.8,22.9,20.6,21.9
4,1/1/2019,2130,66,Male,Type 2 Diabetes,Yes,Downtown Dubai,118.649105,13.3,29.1,19.5,34.2,36.6


# Feature Engineering

In [36]:
# Extract Year from visit date column 
df['Year'] = df['Visit_Date'].str[-4:].astype(int)
df['Year']

0         2019
1         2019
2         2019
3         2019
4         2019
          ... 
463998    2019
463999    2020
464000    2019
464001    2019
464002    2020
Name: Year, Length: 464003, dtype: int64

In [37]:
df.head()

Unnamed: 0,Visit_Date,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Total_Cost,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration,Year
0,1/1/2019,4174,28,Male,Type 1 Diabetes,Yes,Downtown Dubai,711.975905,21.6,21.1,29.6,25.8,28.2,2019
1,1/1/2019,4507,62,Female,Type 2 Diabetes,Yes,Downtown Dubai,507.680147,13.0,50.9,34.8,26.3,23.2,2019
2,1/1/2019,1860,47,Male,Prediabetes,Yes,Downtown Dubai,317.58277,27.3,20.9,23.9,35.1,14.8,2019
3,1/1/2019,2294,10,Female,Type 2 Diabetes,Yes,Downtown Dubai,484.387822,19.2,30.8,22.9,20.6,21.9,2019
4,1/1/2019,2130,66,Male,Type 2 Diabetes,Yes,Downtown Dubai,118.649105,13.3,29.1,19.5,34.2,36.6,2019


# Data Analysis

### Q1 What is the percentage of Diagnosis ?

In [38]:
#px.pie(data_frame= df , names= 'Diagnosis')

Insight Q1:


Diagnosis Percentage AnalysisWhat This Metric Tells Us
The percentage of each Diagnosis shows how common each medical condition is among
all patient visits.It helps identify:
The most frequent health issues,
Potential areas for preventive care,
Resource allocation priorities.

### Q2 What is the percentage of Gender ?

In [39]:
#px.pie(data_frame= df , names= 'Gender')

insight Q2:


 after comparing the a count   It was found that the number of male visitors was greater than the number of females and others.

### Q3 Does age affect the total medical cost?

In [40]:
corresion = df[['Age','Total_Cost']].corr()
corresion

Unnamed: 0,Age,Total_Cost
Age,1.0,0.001799
Total_Cost,0.001799,1.0


In [41]:
#px.scatter(data_frame= df , x= 'Age' , y='Total_Cost')

Insight Q3:

There is virtually no correlation between age and total medical cost in this dataset. 
Age does not significantly impact the total cost.

In [42]:
df.columns

Index(['Visit_Date', 'Patient_ID', 'Age', 'Gender', 'Diagnosis',
       'Has_Insurance', 'Area', 'Total_Cost', 'Registration_Duration',
       'Nursing_Duration', 'Laboratory_Duration', 'Consultation_Duration',
       'Pharmacy_Duration', 'Year'],
      dtype='object')

### Bivariate Analysis

### Q4 What Is The Average Total Cost Per Gender ?

In [43]:
average_total_cost_per_gender = df.groupby('Gender')['Total_Cost'].mean().sort_values(ascending= False).reset_index()

In [44]:
px.bar(data_frame= average_total_cost_per_gender, x= 'Gender', y= 'Total_Cost' , labels= {'Total_Cost': 'avg total cost per gender'}, 
       title= 'average_total_cost_per_gender' , text_auto=True)

Insight Q4:

The average cost is nearly identical across all genders, indicating that gender does not influence total cost in this dataset.

### Q5 What Is The Total Cost Per Year ?

In [45]:
total_cost_per_year = df.groupby('Year')['Total_Cost'].sum().sort_values(ascending= False).reset_index().round(2)
total_cost_per_year

Unnamed: 0,Year,Total_Cost
0,2019,34177657.25
1,2020,34110473.83
2,2024,34042986.85
3,2022,34016850.02
4,2023,33917439.72
5,2021,33873235.29


In [46]:
px.bar(data_frame= total_cost_per_year, x= 'Year', y= 'Total_Cost' , labels= {'Total_Cost': 'total cost per year'}, 
       title= 'total cost per year' , text_auto=True)

Insight Q5:

Total medical costs are consistent year-over-year, hovering around 34 million AED.

### Q6 What Is The Average Consultation Duration Per Diagnosis?

In [47]:
avg_Consultation_Duration_per_diagnosis   = df.groupby('Diagnosis')['Consultation_Duration'].mean().sort_values(ascending= False).reset_index().rename(columns={'Consultation_Duration': 'Avg_Consultation_Duration'})
avg_Consultation_Duration_per_diagnosis

Unnamed: 0,Diagnosis,Avg_Consultation_Duration
0,Type 3 Diabetes,348.476372
1,Gestational Diabetes,30.161615
2,Prediabetes,29.983463
3,Type 1 Diabetes,29.918964
4,Type 2 Diabetes,29.826186
5,Unknown,29.633048


In [48]:
px.bar(data_frame= avg_Consultation_Duration_per_diagnosis ,x='Diagnosis',y='Avg_Consultation_Duration' ,
       labels= {'Consultation_Duration' : 'Average Consultation Duration'}, title= 'Average Consultation Duration per Diagnosis',
       text_auto= True) 

Insight Q6:

Type 3 Diabetes patients tend to have the longest consultations on average.

# Multivariate Analysis

### Q7 What Is The Avg Total Cost Per Area Per Diagnosis ?

In [49]:
avg_total_cost_per_area = df.groupby(['Gender', 'Diagnosis'])['Total_Cost'].mean().reset_index()
avg_total_cost_per_area

Unnamed: 0,Gender,Diagnosis,Total_Cost
0,Female,Gestational Diabetes,435.303589
1,Female,Prediabetes,440.424119
2,Female,Type 1 Diabetes,437.395685
3,Female,Type 2 Diabetes,441.029905
4,Female,Type 3 Diabetes,433.557805
5,Female,Unknown,438.990634
6,Male,Gestational Diabetes,436.697918
7,Male,Prediabetes,439.460084
8,Male,Type 1 Diabetes,441.09706
9,Male,Type 2 Diabetes,439.882007


In [50]:
px.bar(data_frame= avg_total_cost_per_area ,x='Diagnosis' , y='Total_Cost',
         labels= {'Total_Cost' : 'Average No of Total_Cost'}, title= 'Avg Total_Cost per Diagnosis per Gender',
       text_auto= True, color= 'Gender', barmode= 'group')  

Insight Q7:

Costs vary slightly by area and diagnosis, with Downtown Dubai showing consistently higher costs compared to others.

 # Deployment (stremlit)

In [51]:
df.to_csv('cleand_data.csv')

In [52]:
import streamlit as st

In [60]:
%%writefile diabetes_patient.py
import pandas as pd
import streamlit as st
import plotly.express as px
import seaborn as sns
st.set_page_config(layout= 'wide', page_title= 'diabetes_patient')
st.markdown("<h1 style='text-align: center; color: white;'>diabetes dataset with Analysis</h1>", unsafe_allow_html=True)
st.image('https://img.freepik.com/free-vector/diabetes-flat-composition-medical-with-patient-symptoms-complications-blood-sugar-meter-treatments-medication_1284-28998.jpg')
df = pd.read_csv('cleand_data.csv',index_col=0)
page = st.sidebar.radio('Pages', ['Introduction', 'Analysis Questions', 'Reporting'])
if page == 'Introduction':

    st.dataframe(df.head())

    st.header('diabetes_patient Description')

    st.write('''The dataset contains 232002 entries and 13 columns. Here's a summary of the columns:

    Visit_Date: The date on which the patient visited the medical facility (past 6 years).
    Patient_ID:  A unique identifier assigned to each patient.
    Age: Age of the patient at the time of the visit (0–100 years).
    Gender: The gender of the patient ('Male', 'Female', or possibly other identifiers).
    Diagnosis: Diabetes-related diagnosis (Type 1, Type 2, Prediabetes, Gestational, or missing).
    Has_Insurance: Indicates whether the patient has health insurance coverage (e.g., Yes/No or 1/0).
    Total_Cost: The total cost incurred for the patient's visit, including all services.
    Area: Geographical area or region where the patient resides or where the facility is located
      within the emirate (e.g., Al Ain, Palm Jumeirah).
    Registration duration: Time spent during registration (in minutes).
    Nursing duration: Time spent with nursing staff (in minutes).
    Laboratory duration: Time spent in the laboratory (in minutes).
    Consultation duration: Time spent in consultation (in minutes).
    Pharmacy duration: Time spent at the pharmacy (in minutes)
    Year :Year in which the patient visit took place (extracted from Visit_Date ).''')

elif page == 'Analysis Questions':

    st.header(' Q1 What is the percentage of Diagnosis ?')
    st.plotly_chart(px.pie(data_frame= df , names= 'Diagnosis'))
    st.write(''' Insight Q1:

Diagnosis Percentage AnalysisWhat This Metric Tells Us
The percentage of each Diagnosis shows how common each medical condition is among
all patient visits.It helps identify:
The most frequent health issues,
Potential areas for preventive care,
Resource allocation priorities.''')
    st.header('Q2 What is the percentage of Gender ?')
    st.plotly_chart(px.pie(data_frame= df , names= 'Gender'))
    st.write('''insight Q2:

 after comparing the a count   It was found that the number of male visitors was greater than the number of females and others.''')
    st.header('Q3 Does age affect the total medical cost?')
    st.plotly_chart(px.scatter(data_frame= df , x= 'Age' , y='Total_Cost'))
    st.write(''' Insight Q3:

    There is virtually no correlation between age and total medical cost in this dataset. 
             Age does not significantly impact the total cost. ''')
    st.header(' Q4 What Is The Average Cost Per Gender ?')
    average_total_cost_per_gender = df.groupby('Gender')['Total_Cost'].mean().sort_values(ascending= False).reset_index()
    st.plotly_chart(px.bar(average_total_cost_per_gender , x= 'Gender', y= 'Total_Cost' , labels= {'Total_Cost': 'avg total cost per gender'}, 
       title= 'average_total_cost_per_gender' , text_auto=True))
    st.write('''Insight Q4:

The average cost is nearly identical across all genders,
 indicating that gender does not influence total cost in this dataset.
     ''')
    st.header('Q5 What Is The Total Cost Per Year ?')
    total_cost_per_year = df.groupby('Year')['Total_Cost'].sum().sort_values(ascending= False).reset_index().round(2)
    st.plotly_chart(px.bar(total_cost_per_year, x= 'Year', y= 'Total_Cost' , labels= {'Total_Cost': 'total cost per year'}, 
       title= 'total cost per year' , text_auto=True))
    st.write('''Insight Q5:

Total medical costs are consistent year-over-year, hovering around 34 million AED.
     ''')
    st.header('Q6 What Is The Average Consultation Duration Per Diagnosis?')
    avg_Consultation_Duration_per_diagnosis = df.groupby('Diagnosis')['Consultation_Duration'].mean().sort_values(ascending= False).reset_index().rename(columns={'Consultation_Duration': 'Avg_Consultation_Duration'})
    st.plotly_chart(px.bar(avg_Consultation_Duration_per_diagnosis, x='Diagnosis',y='Avg_Consultation_Duration' ,
       labels= {'Consultation_Duration' : 'Average Consultation Duration'}, title= 'Average Consultation Duration per Diagnosis',
       text_auto= True) )
    st.write('''Insight Q6:

Type 3 Diabetes patients tend to have the longest consultations on average.
     ''')
    st.header('Q7 What Is The Avg Total Cost Per Gender Per Diagnosis ?')
    avg_total_cost_per_area = df.groupby(['Gender', 'Diagnosis'])['Total_Cost'].mean().reset_index()
    st.plotly_chart(px.bar(avg_total_cost_per_area ,x='Diagnosis' , y='Total_Cost',
         labels= {'Total_Cost' : 'Avg of Total_Cost'}, title= 'Avg Total_Cost per Diagnosis per Gender',
       text_auto= True, color= 'Gender', barmode= 'group'))

    st.write('''Insight Q7:

Costs vary slightly by area and diagnosis, 
with Downtown Dubai showing consistently higher costs compared to others.
     ''')
    st.header('Q8 What Is The Average Cost Per Area?')
    avg_total_cost_per_area = df.groupby('Area')['Total_Cost'].mean().sort_values(ascending=False).reset_index()
    st.plotly_chart(px.bar(avg_total_cost_per_area, x='Area', y='Total_Cost',
        labels={'Total_Cost': 'Avg Total Cost'},
        title='Avg Total Cost per Area', text_auto=True))
    st.write('''Insight Q8:

Some areas, such as Downtown Dubai or Al Ain, show higher average medical costs, 
which may reflect differences in healthcare facilities, services, or demographics.''')
elif page == 'Reporting':
    Gender = st.sidebar.selectbox('Gender' ,df['Gender'].unique())
    Year = st.sidebar.selectbox('Year', df['Year'].unique())
    Area = st.sidebar.selectbox('Area',df['Area'].unique())
    Diagnosis = st.sidebar.selectbox('Diagnosis', df['Diagnosis'].unique())
    df2 = df[(df['Gender']== Gender) & (df['Year'] == Year) &(df['Area']== Area) & (df['Diagnosis']==Diagnosis) ]
    st.subheader("Distribution of Total Cost for Selected Filters")
    if not df2.empty:
        st.plotly_chart(px.histogram(df2, x='Total_Cost', nbins=20, title='Total Cost Distribution'))
    else:
        st.warning("No data available for the selected filters.")
    st.dataframe(df2.head(50))    

    st.subheader("Average Consultation Duration by Gender")
    if not df2.empty:
        avg_duration = df2.groupby('Gender')['Consultation_Duration'].mean().reset_index()
        st.plotly_chart(px.bar(avg_duration, x='Gender', y='Consultation_Duration',
            title='Average Consultation Duration by Gender',
            labels={'Consultation_Duration': 'Avg Duration (minutes)'}))
    else:
        st.warning("No data available for the selected filters.") 


Overwriting diabetes_patient.py


In [59]:
!streamlit run diabetes_patient.py

^C


In [None]:
import pipreqs

In [None]:
! pipreqs

INFO: Not scanning for jupyter notebooks.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
INFO: Successfully saved requirements file in e:\data science & python\mid project mahmoud sayed\requirements.txt


## Data Preprocessing

In [None]:
df.head()

Unnamed: 0,Visit_Date,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Total_Cost,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration,Year
0,1/1/2019,4174,28,Male,Type 1 Diabetes,Yes,Downtown Dubai,711.975905,21.6,21.1,29.6,25.8,28.2,2019
1,1/1/2019,4507,62,Female,Type 2 Diabetes,Yes,Downtown Dubai,507.680147,13.0,50.9,34.8,26.3,23.2,2019
2,1/1/2019,1860,47,Male,Prediabetes,Yes,Downtown Dubai,317.58277,27.3,20.9,23.9,35.1,14.8,2019
3,1/1/2019,2294,10,Female,Type 2 Diabetes,Yes,Downtown Dubai,484.387822,19.2,30.8,22.9,20.6,21.9,2019
4,1/1/2019,2130,66,Male,Type 2 Diabetes,Yes,Downtown Dubai,118.649105,13.3,29.1,19.5,34.2,36.6,2019


In [None]:
df.drop(['Visit_Date'], axis=1 , inplace= True)

In [None]:
df.duplicated().sum()

np.int64(0)

### Split Data into Input features and Target Feature

In [None]:
x= df.drop('Total_Cost', axis= 1)
y = df['Total_Cost']

### Split Data into Train and Test

In [None]:
from sklearn.model_selection import train_test_split 

x_train , x_test , y_train , y_test = train_test_split(x,y, test_size= 0.2 , random_state= 42)

In [None]:
x_train.reset_index(drop=True , inplace= True)
x_test.reset_index(drop=True , inplace= True)
y_train.reset_index(drop=True , inplace= True)
y_test.reset_index(drop=True , inplace= True)

In [None]:
x_train

Unnamed: 0,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration,Year
0,2284,27,Female,Type 2 Diabetes,Yes,Downtown Dubai,22.9,32.9,19.2,25.6,33.1,2024
1,3576,60,Female,Type 2 Diabetes,Yes,Downtown Dubai,24.1,13.9,39.0,42.8,26.1,2022
2,4340,39,Male,Type 1 Diabetes,Yes,Jumeirah,26.9,17.1,32.8,29.9,13.8,2024
3,2037,60,Male,,,Jumeirah,28.9,36.6,15.3,49.6,56.9,2022
4,1010,75,Female,Type 2 Diabetes,No,Dubai Silicon Oasis,2.4,53.7,16.8,45.3,36.0,2024
...,...,...,...,...,...,...,...,...,...,...,...,...
371197,2670,13,Male,Type 1 Diabetes,Yes,Downtown Dubai,20.9,33.8,24.8,0.8,31.2,2022
371198,1965,56,Male,Type 2 Diabetes,Yes,Downtown Dubai,33.2,2.6,24.4,23.9,16.3,2023
371199,2620,4,Male,Type 2 Diabetes,No,Al Barsha,45.1,45.2,36.8,36.2,32.1,2020
371200,1488,22,Female,Type 1 Diabetes,Yes,Downtown Dubai,14.6,21.3,20.2,17.9,27.7,2020


In [None]:
x_test

Unnamed: 0,Patient_ID,Age,Gender,Diagnosis,Has_Insurance,Area,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration,Year
0,4151,20,Female,Prediabetes,No,Dubai Silicon Oasis,18.9,23.2,48.8,33.5,73.8,2020
1,4304,29,Male,Type 1 Diabetes,Yes,Downtown Dubai,14.9,17.8,19.8,16.3,29.5,2024
2,1503,50,Male,Type 2 Diabetes,Yes,Downtown Dubai,13.6,28.4,10.1,9.7,21.3,2024
3,3662,32,Male,Type 2 Diabetes,Yes,Downtown Dubai,21.0,25.7,7.2,27.3,20.5,2021
4,4086,18,Female,Type 2 Diabetes,Yes,Downtown Dubai,21.5,24.8,0.4,22.7,27.6,2023
...,...,...,...,...,...,...,...,...,...,...,...,...
92796,2916,60,Male,Type 2 Diabetes,Yes,Downtown Dubai,12.3,27.9,42.2,21.5,29.7,2024
92797,1247,52,Female,Type 2 Diabetes,No,International City,22.4,28.7,32.3,34.0,29.1,2021
92798,2608,8,Male,,,Downtown Dubai,15.0,34.6,33.7,30.8,24.7,2021
92799,1922,39,Male,Unknown,Yes,Downtown Dubai,23.9,22.2,21.8,32.5,39.2,2019


In [None]:
y_train

0          184.557073
1          362.122415
2          505.909587
3            6.814337
4          441.684509
             ...     
371197     614.893158
371198     222.823501
371199     569.280185
371200     618.054066
371201    1315.350825
Name: Total_Cost, Length: 371202, dtype: float64

In [None]:
y_test

0        193.777766
1        359.039146
2        420.333965
3        255.521423
4        163.647980
            ...    
92796    463.048650
92797     34.652727
92798    586.137299
92799    300.795880
92800    567.253219
Name: Total_Cost, Length: 92801, dtype: float64

##  handling Categorical Data 

In [None]:
cat_col =x_train.select_dtypes(include='O').columns
cat_col

Index(['Gender', 'Diagnosis', 'Has_Insurance', 'Area'], dtype='object')

In [None]:
# check missing value 
x_train.isna().sum()

Patient_ID                   0
Age                          0
Gender                       0
Diagnosis                31750
Has_Insurance            36359
Area                         0
Registration_Duration        0
Nursing_Duration             0
Laboratory_Duration          0
Consultation_Duration        0
Pharmacy_Duration            0
Year                         0
dtype: int64

In [None]:
# check missing value percentage 
round(x_train.isna().mean()*100,2)

Patient_ID               0.00
Age                      0.00
Gender                   0.00
Diagnosis                8.55
Has_Insurance            9.79
Area                     0.00
Registration_Duration    0.00
Nursing_Duration         0.00
Laboratory_Duration      0.00
Consultation_Duration    0.00
Pharmacy_Duration        0.00
Year                     0.00
dtype: float64

In [None]:
# Impute missing value in Diagnosis used SimpleImputer

from sklearn.impute import KNNImputer, SimpleImputer

sim_impu = SimpleImputer(strategy= 'constant' , fill_value= 'Unknown' )

x_train[['Diagnosis']] = sim_impu.fit_transform(x_train[['Diagnosis']])
x_test[['Diagnosis']] = sim_impu.transform(x_test[['Diagnosis']])

In [None]:
# Impute missing value in Has_Insurance used SimpleImputer

from sklearn.impute import KNNImputer, SimpleImputer

sim_impu2 = SimpleImputer(strategy= 'most_frequent'  )

x_train[['Has_Insurance']] = sim_impu2.fit_transform(x_train[['Has_Insurance']])
x_test[['Has_Insurance']] = sim_impu2.transform(x_test[['Has_Insurance']])

In [None]:
# check after impute missing values

x_train.isna().sum()

Patient_ID               0
Age                      0
Gender                   0
Diagnosis                0
Has_Insurance            0
Area                     0
Registration_Duration    0
Nursing_Duration         0
Laboratory_Duration      0
Consultation_Duration    0
Pharmacy_Duration        0
Year                     0
dtype: int64

In [None]:
# check after impute missing values
x_test.isna().sum()

Patient_ID               0
Age                      0
Gender                   0
Diagnosis                0
Has_Insurance            0
Area                     0
Registration_Duration    0
Nursing_Duration         0
Laboratory_Duration      0
Consultation_Duration    0
Pharmacy_Duration        0
Year                     0
dtype: int64

## encoding categorical columns

In [None]:
cat_col =x_train.select_dtypes(include='O').columns
cat_col

Index(['Gender', 'Diagnosis', 'Has_Insurance', 'Area'], dtype='object')

In [None]:
for col in cat_col:

    print(col)
    print(x_train[col].nunique())
    print(x_train[col].unique())
    print(x_train[col].value_counts())
    print("*" *100)

Gender
3
['Female' 'Male' 'Other']
Gender
Male      227666
Female    132198
Other      11338
Name: count, dtype: int64
****************************************************************************************************
Diagnosis
6
['Type 2 Diabetes' 'Type 1 Diabetes' 'Unknown' 'Prediabetes'
 'Gestational Diabetes' 'Type 3 Diabetes']
Diagnosis
Type 2 Diabetes         244689
Unknown                  35238
Type 1 Diabetes          35062
Prediabetes              34937
Gestational Diabetes     17467
Type 3 Diabetes           3809
Name: count, dtype: int64
****************************************************************************************************
Has_Insurance
2
['Yes' 'No']
Has_Insurance
Yes    318617
No      52585
Name: count, dtype: int64
****************************************************************************************************
Area
9
['Downtown Dubai' 'Jumeirah' 'Dubai Silicon Oasis' 'Bur Dubai' 'Deira'
 'Al Barsha' 'Al Qusais' 'Dubai Marina' 'International City']
Are

In [None]:
# handeling categorical columns used OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder(drop= 'first', sparse_output= False)

ohe_x_train= ohe.fit_transform(x_train[['Gender','Diagnosis','Has_Insurance','Area']])
ohe_x_test= ohe.transform(x_test[['Gender','Diagnosis','Has_Insurance','Area']])

In [None]:
ohe_x_train = pd.DataFrame(ohe_x_train, columns= ohe.get_feature_names_out())
ohe_x_train

Unnamed: 0,Gender_Male,Gender_Other,Diagnosis_Prediabetes,Diagnosis_Type 1 Diabetes,Diagnosis_Type 2 Diabetes,Diagnosis_Type 3 Diabetes,Diagnosis_Unknown,Has_Insurance_Yes,Area_Al Qusais,Area_Bur Dubai,Area_Deira,Area_Downtown Dubai,Area_Dubai Marina,Area_Dubai Silicon Oasis,Area_International City,Area_Jumeirah
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371197,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
371198,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
371199,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
371200,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
ohe_x_test = pd.DataFrame(ohe_x_test, columns= ohe.get_feature_names_out())
ohe_x_test

Unnamed: 0,Gender_Male,Gender_Other,Diagnosis_Prediabetes,Diagnosis_Type 1 Diabetes,Diagnosis_Type 2 Diabetes,Diagnosis_Type 3 Diabetes,Diagnosis_Unknown,Has_Insurance_Yes,Area_Al Qusais,Area_Bur Dubai,Area_Deira,Area_Downtown Dubai,Area_Dubai Marina,Area_Dubai Silicon Oasis,Area_International City,Area_Jumeirah
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92796,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
92797,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
92798,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
92799,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
x_train= pd.concat([x_train,ohe_x_train] , axis= 1 ).drop(['Gender', 'Diagnosis', 'Has_Insurance', 'Area'], axis= 1)
x_train

Unnamed: 0,Patient_ID,Age,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration,Year,Gender_Male,Gender_Other,Diagnosis_Prediabetes,Diagnosis_Type 1 Diabetes,Diagnosis_Type 2 Diabetes,Diagnosis_Type 3 Diabetes,Diagnosis_Unknown,Has_Insurance_Yes,Area_Al Qusais,Area_Bur Dubai,Area_Deira,Area_Downtown Dubai,Area_Dubai Marina,Area_Dubai Silicon Oasis,Area_International City,Area_Jumeirah
0,2284,27,22.9,32.9,19.2,25.6,33.1,2024,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,3576,60,24.1,13.9,39.0,42.8,26.1,2022,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,4340,39,26.9,17.1,32.8,29.9,13.8,2024,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,2037,60,28.9,36.6,15.3,49.6,56.9,2022,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1010,75,2.4,53.7,16.8,45.3,36.0,2024,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371197,2670,13,20.9,33.8,24.8,0.8,31.2,2022,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
371198,1965,56,33.2,2.6,24.4,23.9,16.3,2023,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
371199,2620,4,45.1,45.2,36.8,36.2,32.1,2020,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
371200,1488,22,14.6,21.3,20.2,17.9,27.7,2020,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
x_test = pd.concat([x_test,ohe_x_test] , axis= 1).drop(['Gender', 'Diagnosis', 'Has_Insurance', 'Area'], axis= 1 )
x_test

Unnamed: 0,Patient_ID,Age,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration,Year,Gender_Male,Gender_Other,Diagnosis_Prediabetes,Diagnosis_Type 1 Diabetes,Diagnosis_Type 2 Diabetes,Diagnosis_Type 3 Diabetes,Diagnosis_Unknown,Has_Insurance_Yes,Area_Al Qusais,Area_Bur Dubai,Area_Deira,Area_Downtown Dubai,Area_Dubai Marina,Area_Dubai Silicon Oasis,Area_International City,Area_Jumeirah
0,4151,20,18.9,23.2,48.8,33.5,73.8,2020,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,4304,29,14.9,17.8,19.8,16.3,29.5,2024,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,1503,50,13.6,28.4,10.1,9.7,21.3,2024,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,3662,32,21.0,25.7,7.2,27.3,20.5,2021,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,4086,18,21.5,24.8,0.4,22.7,27.6,2023,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
92796,2916,60,12.3,27.9,42.2,21.5,29.7,2024,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
92797,1247,52,22.4,28.7,32.3,34.0,29.1,2021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
92798,2608,8,15.0,34.6,33.7,30.8,24.7,2021,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
92799,1922,39,23.9,22.2,21.8,32.5,39.2,2019,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [None]:
x_train.columns

Index(['Patient_ID', 'Age', 'Registration_Duration', 'Nursing_Duration',
       'Laboratory_Duration', 'Consultation_Duration', 'Pharmacy_Duration',
       'Year', 'Gender_Male', 'Gender_Other', 'Diagnosis_Prediabetes',
       'Diagnosis_Type 1 Diabetes', 'Diagnosis_Type 2 Diabetes',
       'Diagnosis_Type 3 Diabetes', 'Diagnosis_Unknown', 'Has_Insurance_Yes',
       'Area_Al Qusais', 'Area_Bur Dubai', 'Area_Deira', 'Area_Downtown Dubai',
       'Area_Dubai Marina', 'Area_Dubai Silicon Oasis',
       'Area_International City', 'Area_Jumeirah'],
      dtype='object')

In [None]:
x_test.columns

Index(['Patient_ID', 'Age', 'Registration_Duration', 'Nursing_Duration',
       'Laboratory_Duration', 'Consultation_Duration', 'Pharmacy_Duration',
       'Year', 'Gender_Male', 'Gender_Other', 'Diagnosis_Prediabetes',
       'Diagnosis_Type 1 Diabetes', 'Diagnosis_Type 2 Diabetes',
       'Diagnosis_Type 3 Diabetes', 'Diagnosis_Unknown', 'Has_Insurance_Yes',
       'Area_Al Qusais', 'Area_Bur Dubai', 'Area_Deira', 'Area_Downtown Dubai',
       'Area_Dubai Marina', 'Area_Dubai Silicon Oasis',
       'Area_International City', 'Area_Jumeirah'],
      dtype='object')

In [None]:
scaled_cols = ['Patient_ID', 'Age', 'Registration_Duration', 'Nursing_Duration',
       'Laboratory_Duration', 'Consultation_Duration', 'Pharmacy_Duration',
       'Year', 'Gender_Male', 'Gender_Other', 'Diagnosis_Prediabetes',
       'Diagnosis_Type 1 Diabetes', 'Diagnosis_Type 2 Diabetes',
       'Diagnosis_Type 3 Diabetes', 'Diagnosis_Unknown', 'Has_Insurance_Yes',
       'Area_Al Qusais', 'Area_Bur Dubai', 'Area_Deira', 'Area_Downtown Dubai',
       'Area_Dubai Marina', 'Area_Dubai Silicon Oasis',
       'Area_International City', 'Area_Jumeirah']

In [None]:
from sklearn.preprocessing import RobustScaler

rc = RobustScaler()

x_train[scaled_cols] = rc.fit_transform(x_train[scaled_cols])

x_test[scaled_cols] = rc.transform(x_test[scaled_cols])

In [None]:
x_train

Unnamed: 0,Patient_ID,Age,Registration_Duration,Nursing_Duration,Laboratory_Duration,Consultation_Duration,Pharmacy_Duration,Year,Gender_Male,Gender_Other,Diagnosis_Prediabetes,Diagnosis_Type 1 Diabetes,Diagnosis_Type 2 Diabetes,Diagnosis_Type 3 Diabetes,Diagnosis_Unknown,Has_Insurance_Yes,Area_Al Qusais,Area_Bur Dubai,Area_Deira,Area_Downtown Dubai,Area_Dubai Marina,Area_Dubai Silicon Oasis,Area_International City,Area_Jumeirah
0,-0.3560,-0.346154,0.077922,0.406452,-0.483871,-0.071429,0.419355,1.000000,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.2900,0.923077,0.155844,-0.819355,0.793548,1.045455,-0.032258,0.333333,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.6720,0.115385,0.337662,-0.612903,0.393548,0.207792,-0.825806,1.000000,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.4795,0.923077,0.467532,0.645161,-0.735484,1.487013,1.954839,0.333333,0.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.9930,1.500000,-1.253247,1.748387,-0.638710,1.207792,0.606452,1.000000,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
371197,-0.1630,-0.884615,-0.051948,0.464516,-0.122581,-1.681818,0.296774,0.333333,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
371198,-0.5155,0.769231,0.746753,-1.548387,-0.148387,-0.181818,-0.664516,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
371199,-0.1880,-1.230769,1.519481,1.200000,0.651613,0.616883,0.354839,-0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
371200,-0.7540,-0.538462,-0.461039,-0.341935,-0.419355,-0.571429,0.070968,-0.333333,-1.0,0.0,0.0,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
