<a href="https://colab.research.google.com/github/mpatel5/FoundationsOfDataScience/blob/main/PythonProject2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [254]:
# Install required libraries
!pip install plotly cufflinks pandas numpy




In [255]:
# Import necessary libraries
import pandas as pd
import numpy as np
import plotly.graph_objs as go
import plotly.io as pio
import plotly.express as px
%matplotlib inline
pio.renderers.default = "colab" # Attempt to set Colab renderer

In [256]:
#Load dataset
df = pd.read_csv('https://raw.githubusercontent.com/niteen11/DataAnalyticsAcademy/master/Python/dataset_diabetes/diabetic_data.csv')


In [257]:
# Preview the dataset before processing
print("\nDataset Sample Before Cleaning:")
df.head()


Dataset Sample Before Cleaning:


Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [258]:
#Data Cleaning - Handle missing values
df.replace('?', np.nan, inplace=True)  # Convert '?' to NaN
original_row_count = len(df)


In [259]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [260]:
# Drop rows where race or A1Cresult is missing
df.dropna(subset=['race', 'A1Cresult'], inplace=True)
dropped_row_count = original_row_count - len(df)
percent_dropped = (dropped_row_count/original_row_count) * 100
print(f"\nPercentage of rows dropped due to missing race or A1Cresult: {percent_dropped:.2f}%")



Percentage of rows dropped due to missing race or A1Cresult: 83.69%


In [261]:
# Calculate counts for each A1C result category
a1c_counts = df['A1Cresult'].value_counts()

In [262]:
# Print the total counts
print("\nTotal A1C Counts:")
print(a1c_counts)


Total A1C Counts:
A1Cresult
>8      7961
Norm    4905
>7      3730
Name: count, dtype: int64


In [263]:
# Calculate percentages for '>7' and '>8'
percent_greater_7 = (a1c_counts['>7'] / a1c_counts.sum()) * 100
percent_greater_8 = (a1c_counts['>8'] / a1c_counts.sum()) * 100


In [264]:
# Create pie chart
fig = px.pie(
    values=a1c_counts.values,
    names=a1c_counts.index,
    title='Total A1C Counts'
)
fig.show()

In [265]:
# Conclusion for total A1C counts
print("\nConclusion:")
print(f"A significant portion of patients have elevated A1C levels, with {percent_greater_7:.2f}% having A1C >7 and {percent_greater_8:.2f}% having A1C >8. This highlights the prevalence of uncontrolled diabetes and the need for targeted interventions to improve their glycemic management.")


Conclusion:
A significant portion of patients have elevated A1C levels, with 22.48% having A1C >7 and 47.97% having A1C >8. This highlights the prevalence of uncontrolled diabetes and the need for targeted interventions to improve their glycemic management.


In [266]:
# Calculate A1C counts by race
a1c_counts_by_race = pd.crosstab(df['race'], df['A1Cresult'])


In [267]:
# Display the table
print("\nA1C Counts by Race:")
print(a1c_counts_by_race)


A1C Counts by Race:
A1Cresult          >7    >8  Norm
race                             
AfricanAmerican   559  1845  1106
Asian              36    62    37
Caucasian        2967  5639  3549
Hispanic           98   265   128
Other              70   150    85


In [268]:
# Calculate percentages for each race
a1c_percentages = a1c_counts_by_race.div(a1c_counts_by_race.sum(axis=1), axis=0) * 100

In [269]:
# Display the percentages
print("\nA1C Percentages by Race:")
print(a1c_percentages)


A1C Percentages by Race:
A1Cresult               >7         >8       Norm
race                                            
AfricanAmerican  15.925926  52.564103  31.509972
Asian            26.666667  45.925926  27.407407
Caucasian        24.409708  46.392431  29.197861
Hispanic         19.959267  53.971487  26.069246
Other            22.950820  49.180328  27.868852


In [270]:
#Detailed Statistics
print("\nDetailed Statistics:")

for race in a1c_percentages.index:
    print(f"\nRace: {race}")
    total_race_patients = a1c_counts_by_race.loc[race].sum()
    print(f"  Total Patients: {total_race_patients}")
    for a1c_result in a1c_percentages.columns:
        count = a1c_counts_by_race.loc[race, a1c_result]
        percentage = a1c_percentages.loc[race, a1c_result]
        print(f"    {a1c_result}: Count = {count}, Percentage = {percentage:.2f}%")




Detailed Statistics:

Race: AfricanAmerican
  Total Patients: 3510
    >7: Count = 559, Percentage = 15.93%
    >8: Count = 1845, Percentage = 52.56%
    Norm: Count = 1106, Percentage = 31.51%

Race: Asian
  Total Patients: 135
    >7: Count = 36, Percentage = 26.67%
    >8: Count = 62, Percentage = 45.93%
    Norm: Count = 37, Percentage = 27.41%

Race: Caucasian
  Total Patients: 12155
    >7: Count = 2967, Percentage = 24.41%
    >8: Count = 5639, Percentage = 46.39%
    Norm: Count = 3549, Percentage = 29.20%

Race: Hispanic
  Total Patients: 491
    >7: Count = 98, Percentage = 19.96%
    >8: Count = 265, Percentage = 53.97%
    Norm: Count = 128, Percentage = 26.07%

Race: Other
  Total Patients: 305
    >7: Count = 70, Percentage = 22.95%
    >8: Count = 150, Percentage = 49.18%
    Norm: Count = 85, Percentage = 27.87%


In [271]:
#Data Visualization
traces = []
colors = ['blue', 'orange', 'green', 'red']
a1c_results = a1c_percentages.columns

for i, a1c_result in enumerate(a1c_results):
    traces.append(go.Bar(
        x=a1c_percentages.index,
        y=a1c_percentages[a1c_result],
        name=a1c_result,
        marker=dict(color=colors[i])
    ))

layout = go.Layout(
    barmode='stack',
    title='Percentage of A1C Levels by Race',
    xaxis=dict(title='Race'),
    yaxis=dict(title='Percentage of Patients')
)

fig = go.Figure(data=traces, layout=layout)
fig.show()



In [272]:
# Conclusion
print("\nConclusion: A1C results vary across racial groups. By observing the percentage based stacked bar chart, and the percentage table, it is easy to see the proportional difference in A1C results between the different racial groups.")


Conclusion: A1C results vary across racial groups. By observing the percentage based stacked bar chart, and the percentage table, it is easy to see the proportional difference in A1C results between the different racial groups.


In [273]:
# Pie chart for each race
for race in a1c_percentages.index:
    fig = go.Figure(data=[go.Pie(
        labels=a1c_percentages.columns,  # A1C result categories (Norm, >7, >8)
        values=a1c_percentages.loc[race],  # Percentages for the current race
        textinfo='label+percent',  # Show label and percentage on slices
        insidetextorientation='radial'  # Format percentage text
    )])
    fig.update_layout(title_text=f'A1C Distribution for {race}')
    fig.show()

In [274]:
# Descriptive statistics for num_medications by A1C category
print("\nDescriptive Statistics for Number of Medications by A1C Category:")
print(df.groupby('A1Cresult')['num_medications'].describe())


Descriptive Statistics for Number of Medications by A1C Category:
            count       mean       std  min   25%   50%   75%   max
A1Cresult                                                          
>7         3730.0  16.893566  8.549836  1.0  11.0  16.0  21.0  66.0
>8         7961.0  16.071725  8.548943  1.0  10.0  15.0  20.0  75.0
Norm       4905.0  16.542712  8.460629  1.0  11.0  15.0  21.0  69.0


In [275]:
# Group by 'A1Cresult' and calculate the total medications for each group
medications_by_a1c = df.groupby('A1Cresult')['num_medications'].sum().reset_index()

In [276]:
# Create a bar chart
fig = px.bar(medications_by_a1c, x='A1Cresult', y='num_medications',
             title='Total Number of Medications by A1C Category',
             labels={'A1Cresult': 'A1C Result', 'num_medications': 'Total Medications'})
fig.show()

In [277]:
# Print conclusion
print("\nConclusion:")
# Get the A1C category with the highest medication count
highest_a1c_category = medications_by_a1c.loc[medications_by_a1c['num_medications'].idxmax(), 'A1Cresult']
# Get the A1C category with the lowest medication count
lowest_a1c_category = medications_by_a1c.loc[medications_by_a1c['num_medications'].idxmin(), 'A1Cresult']

print(f"The A1C category '{highest_a1c_category}' has the highest total medication count indicating that patients with higher A1C levels tend to be prescribed more medications. This suggests a correlation between A1C levels and the number of medications prescribed.")


Conclusion:
The A1C category '>8' has the highest total medication count indicating that patients with higher A1C levels tend to be prescribed more medications. This suggests a correlation between A1C levels and the number of medications prescribed.


In [278]:
# Sort races by '>7' A1C percentage in descending order
ranked_races_7 = a1c_percentages.sort_values(by=['>7'], ascending=False)


In [279]:
# Sort races by '>8' A1C percentage in descending order
ranked_races_8 = a1c_percentages.sort_values(by=['>8'], ascending=False)


In [280]:
# Print the ranked races for '>7'
print("\nRaces Ranked by '>7' A1C:")
print(ranked_races_7[['>7']])  # Print only the '>7' column



Races Ranked by '>7' A1C:
A1Cresult               >7
race                      
Asian            26.666667
Caucasian        24.409708
Other            22.950820
Hispanic         19.959267
AfricanAmerican  15.925926


In [281]:
# Print the ranked races for '>8'
print("\nRaces Ranked by '>8' A1C:")
print(ranked_races_8[['>8']])  # Print only the '>8' column


Races Ranked by '>8' A1C:
A1Cresult               >8
race                      
Hispanic         53.971487
AfricanAmerican  52.564103
Other            49.180328
Caucasian        46.392431
Asian            45.925926


In [282]:
# Pie chart for '>7' A1C
fig_7 = px.pie(ranked_races_7, values='>7', names=ranked_races_7.index,
              title="Races Ranked by '>7' A1C Percentage")
fig_7.show()

In [283]:
# Pie chart for '>8' A1C
fig_8 = px.pie(ranked_races_8, values='>8', names=ranked_races_8.index,
              title="Races Ranked by '>8' A1C Percentage")
fig_8.show()

In [284]:
# Conclusion
print("\nConclusion:")
print(f"Based on the analysis of '>7' A1C percentages, the race with the highest percentage is: {highest_a1c_race_7}")
print(f"Based on the analysis of '>8' A1C percentages, the race with the highest percentage is: {highest_a1c_race_8}")


Conclusion:
Based on the analysis of '>7' A1C percentages, the race with the highest percentage is: Asian
Based on the analysis of '>8' A1C percentages, the race with the highest percentage is: Hispanic
