In [None]:
#Importing requisite libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib
%matplotlib inline 

In [None]:
# Load the appointment records from the CSV file into a DataFrame.
df_brazil = pd.read_csv(filepath_or_buffer='brazil_data.csv')

In [None]:
# Preview the first five rows to check that the data loaded as expected.
df_brazil.head(n=5)

In [None]:
# Preview the last five rows of the dataset.
df_brazil.tail(n=5)

In [None]:
# Count the missing values in each column.
df_brazil.isna().sum()

In [None]:
# Dimensions of the dataset as a (rows, columns) tuple.
df_brazil.shape
# Recorded result: the dataframe has 110527 rows and 14 columns.

In [None]:
# Count fully duplicated rows (the first occurrence is not counted).
df_brazil.duplicated(keep='first').sum()

In [None]:
# Number of distinct values in each column.
df_brazil.nunique(axis=0)

In [None]:
# Data type of each column.
df_brazil.dtypes

In [None]:
# Descriptive summary statistics of the dataset's columns.
df_brazil.describe()

In [None]:
# Concise overview: column names, non-null counts, dtypes and memory usage.
df_brazil.info()

In [None]:
# Count the rows whose PatientId already appeared in an earlier row.
df_brazil.duplicated(subset=['PatientId']).sum()
# Repeats are attributed to patients booking multiple appointments.

In [None]:
# Count the rows whose AppointmentID already appeared in an earlier row.
df_brazil.duplicated(subset=['AppointmentID']).sum()
# None were repeated: AppointmentID is a unique identifier.

### Data Cleaning

In [None]:
# A number of gaps were seen in the data. In this step we will clean the data and make it ready for analysis.


In [None]:
# Correct the two misspelt column names in place, then show one row to
# confirm the new headers.
df_brazil.rename(
    columns={
        'Hipertension': 'Hypertension',
        'Handcap': 'Handicap',
    },
    inplace=True,
)
df_brazil.head(n=1)

In [None]:
# Flip the sign of any negative ages so that every age is non-negative.
# Series.abs() is the vectorised equivalent of applying
# `x*-1 if x < 0 else x` to each value.
# NOTE(review): this assumes a negative age is a sign typo rather than an
# invalid record that should be dropped - confirm against the data source.
df_brazil['Age'] = df_brazil['Age'].abs()
# Sanity check: no negative ages may remain after the fix. (The bare filter
# expressions previously used here were never displayed, because only the
# last expression of a cell is rendered.)
assert not (df_brazil['Age'] < 0).any()
df_brazil.head(1)

In [None]:
# Parse the two date columns from text into pandas datetimes.
df_brazil['ScheduledDay'] = pd.to_datetime(df_brazil['ScheduledDay'])
df_brazil['AppointmentDay'] = pd.to_datetime(df_brazil['AppointmentDay'])
# Elapsed time between scheduling and the appointment, one value per row.
defr = df_brazil['AppointmentDay'].sub(df_brazil['ScheduledDay'])

In [None]:
# Normalise every column name in place: trim surrounding whitespace,
# lowercase it, and replace '-' with '_'.
df_brazil.rename(
    columns=lambda name: name.strip().lower().replace("-", "_"),
    inplace=True,
)

In [None]:
# Insert underscores into the multi-word column names (snake_case), in place,
# then show one row to confirm the new headers.
df_brazil.rename(
    columns={
        'patientid': 'patient_id',
        'appointmentid': 'appointment_id',
        'scheduledday': 'scheduled_day',
        'appointmentday': 'appointment_day',
    },
    inplace=True,
)
df_brazil.head(n=1)

In [None]:
#Data is now consistent and ready for analysis. we proceed to the next stage which is Exploratory data analysis.

## Exploratory Data Analysis
Now that the data is clean and ready for use for analysis we will proceed to answer questions we had been asked earlier on regarding the data.
We will compute some of those statistics and use visualizations to represent our data

1. What is the proportion of patients who missed their appointments?

In [None]:

# Number of records for each no_show value, kept as a one-column DataFrame
# indexed by the no_show value.
appointments = (
    df_brazil
    .groupby(['no_show'])['no_show']
    .count()
    .to_frame()
)
appointments

In [None]:
# Pie chart of the share of each no_show value, coloured from the
# 'tab20c' colormap.
cmap = plt.get_cmap('tab20c')
color = cmap(np.array([1, 2, 5, 6, 9, 10]))
plt.pie(
    appointments['no_show'],
    colors=color,
    labels=appointments.index,
    startangle=90,
    autopct="%1.0f%%",
    explode=None,
    shadow=True,
)
plt.title("Attendance of Patients To Medical Appointments")
plt.show()

From the above chart we can see that 20%, which translates to 22,319 patients, missed their appointments. 80% of the patients attended their appointments, which translates to 88,208 patients (the two counts together account for all 110,527 rows in the dataset).
This understanding is premised on the fact that in the no_show column, 'Yes' means the patient did not show up and 'No' means the patient showed up for the appointment.

 2. Did the scholarships affect attendance of medical appointments?
##### The scholarship, called Bolsa Família (meaning "Family Allowance"), is a social protection program of the Government of Brazil and part of a network of federal assistance programs. Bolsa Família provided financial aid to poor Brazilian families. In order to be eligible, families had to ensure that their children attended school and were vaccinated. A great initiative, it was a major factor contributing to the reduction of poverty in Brazil, which fell 27.7% during the first term of the Lula administration.

In [None]:
# Histogram of the no_show column to see how the two outcomes are distributed.
df_brazil['no_show'].hist()
plt.title('No Show Appointment')
plt.ylabel('Frequency')
plt.xlabel('Status')

In [None]:
# Number of records for each no_show value.
df_brazil['no_show'].value_counts()

In [None]:
# Histogram of the scholarship column.
df_brazil['scholarship'].plot.hist(title='Beneficiaries of scholarship')
plt.ylabel('Frequency')
plt.xlabel('Status')

In [None]:
# Number of records for each scholarship value.
df_brazil['scholarship'].value_counts()

In [None]:
# Break the scholarship counts down by no_show value to see how the two relate.
df_brazil.groupby('no_show')['scholarship'].value_counts()

From the above we can see the total number of scholarship beneficiaries is 10,861.
This translates to about 9.83% of the 110,527 records in the dataset (10,861 / 110,527).

In [None]:
# Bar chart of the scholarship counts within each no_show group.
df_brazil.groupby('no_show')['scholarship'].value_counts().plot.bar(title='Beneficiary Status')
plt.ylabel('Total number')
plt.xlabel('Status of patient')

We made an earlier assumption about the no_show column: 'No' means the patient showed up and 'Yes' means the patient did not show up. For the scholarship column, "0" means the patient is not a beneficiary and "1" means the patient is a beneficiary of the scholarship.

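We can see that a total of 8,283 patients out of the 10,861 beneficiaries showed up for their appointment. That is about 76% of the beneficiaries, so more than half of them showed up.
On its own, however, this does not show that the Bolsa Família social protection program improved attendance of medical appointments: the overall show-up rate reported above is 80%, so beneficiaries attended slightly less often than patients in general. Judging the program's effect requires comparing the show-up rates of beneficiaries and non-beneficiaries.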

### 3. What is the proportion of women and men who have benefitted from the scholarship?

In [None]:
# Gender counts within each scholarship group.
df_brazil.groupby('scholarship')['gender'].value_counts()

In [None]:
# Bar chart of the gender counts within each scholarship group.
df_brazil.groupby('scholarship')['gender'].value_counts().plot.bar(title='Beneficiary Status by Gender')
plt.ylabel('Total number')
plt.xlabel('Gender and Beneficiary Status')

From this we can see that 62,987 women did not receive the government social protection support, compared with 36,679 men, while 8,853 (8.01%) women, compared with 2,008 (1.817%) men, received the government social protection support. We can thus deduce that more women than men benefitted from the government social protection program. The total number of people enrolled in the program was still low and might need to be improved.

4. How did diagnosis/medical condition and medical appointment attendance compare?

In [None]:
# Hypertension counts within each no_show group.
df_brazil.groupby('no_show')['hypertension'].value_counts()

In [None]:
# Hypertension: bar chart of the hypertension counts within each no_show group.
df_brazil.groupby('no_show')['hypertension'].value_counts().plot.bar(title='Appointment Attendance by Condition-Hypertension')
plt.ylabel('Total number')
plt.xlabel('Condition and Appointment Attendance-Hypertension')

In [None]:
# Diabetes: counts of the diabetes flag within each no_show group.
df_brazil.groupby('no_show')['diabetes'].value_counts()

In [None]:
# Diabetes: bar chart of the diabetes counts within each no_show group.
# The counts must come from the 'diabetes' column (not 'hypertension') so the
# bars match the chart title and the table computed in the previous cell.
df_brazil.groupby('no_show')['diabetes'].value_counts().plot(kind='bar',title='Appointment Attendance by Condition-Diabetes')
plt.ylabel('Total number')
plt.xlabel('Condition and Appointment Attendance-Diabetes')

In [None]:
# Handicap: counts of the handicap value within each no_show group.
df_brazil.groupby('no_show')['handicap'].value_counts()

In [None]:
# Handicap: bar chart of the handicap counts within each no_show group.
# The counts must come from the 'handicap' column (not 'hypertension') so the
# bars match the chart title and the table computed in the previous cell.
df_brazil.groupby('no_show')['handicap'].value_counts().plot(kind='bar',title='Appointment Attendance by Condition-Handicap')
plt.ylabel('Total number')
plt.xlabel('Condition and Appointment Attendance-Handicap')

In [None]:
# Alcoholism: counts of the alcoholism flag within each no_show group.
df_brazil.groupby('no_show')['alcoholism'].value_counts()

In [None]:
# Alcoholism: bar chart of the alcoholism counts within each no_show group.
# The counts must come from the 'alcoholism' column (not 'hypertension') so
# the bars match the chart title and the table computed in the previous cell.
df_brazil.groupby('no_show')['alcoholism'].value_counts().plot(kind='bar',title='Appointment Attendance by Condition-Alcoholism')
plt.ylabel('Total number')
plt.xlabel('Condition and Appointment Attendance-Alcoholism')

**TODO:** explain the results above.