In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the patient table; the relative path is resolved against the
# notebook's current working directory.
patients_csv = '../data/patients.csv'
df = pd.read_csv(patients_csv)

In [None]:
# Understanding the data
# A cell only renders its final expression, so the intermediate results are
# printed explicitly instead of being computed and thrown away.
print(df.shape)
print(df.columns.tolist())

# Remove the identifier and name columns before any further analysis.
# errors='ignore' keeps this cell re-runnable: without it, a second execution
# raises KeyError because the columns have already been dropped.
df = df.drop(
    columns=['SSN', 'DRIVERS', 'PASSPORT', 'PREFIX',
             'FIRST', 'LAST', 'SUFFIX', 'MAIDEN'],
    errors='ignore',
)

# Column dtypes after the drop (rendered as the cell output).
df.dtypes

In [4]:
# Convert both date columns to pandas datetimes, one column at a time.
for date_col in ('BIRTHDATE', 'DEATHDATE'):
    df[date_col] = pd.to_datetime(df[date_col])

In [6]:
# Missing values per column. Printed explicitly: as a bare expression in the
# middle of the cell the result would be computed and then discarded.
print(df.isna().sum())

# (rows, columns) of the frame; rendered as the cell output.
df.shape

(1163, 17)

In [None]:
# Exploratory Data Analysis
# Frequency table for each categorical column. Every table is printed
# explicitly; as bare expressions only the last one would be rendered and
# the others would be silently discarded.
for column in ('RACE', 'GENDER', 'ETHNICITY', 'CITY'):
    print(df[column].value_counts(), end='\n\n')

In [None]:
# demographic info
def _save_count_bar(column, title, xlabel, out_path):
    """Draw, save and show a bar chart of the value counts of ``df[column]``.

    A new figure is opened, the counts are plotted as bars with the given
    title and x-axis label (the y-axis is labelled 'Count'), the figure is
    written to ``out_path`` and then shown.

    Returns the Axes object returned by the pandas plot call.
    """
    plt.figure()
    axes = df[column].value_counts().plot(kind='bar', title=title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel('Count')
    plt.savefig(out_path)
    plt.show()
    return axes


ax = _save_count_bar('RACE', 'Patients by Race', 'Race',
                     '../plots/patients/race.png')
ax1 = _save_count_bar('GENDER', 'Patients by Gender', 'Gender',
                      '../plots/patients/gender.png')

In [None]:
# Inclusive amount range shown in the histogram (same values as the former
# literals 0.01e7 and 0.3e7). Rows outside the range are left out of the plot.
AMOUNT_MIN, AMOUNT_MAX = 100_000, 3_000_000

# Report how many patients fall above the plotted range. This is printed
# explicitly because a bare expression in the middle of a cell is evaluated
# and then discarded without being shown.
n_above_range = int((df['HEALTHCARE_EXPENSES'] > AMOUNT_MAX).sum())
print(f'Patients with expenses above {AMOUNT_MAX:,}: {n_above_range}')

# Series.between is inclusive on both ends, i.e. (>= AMOUNT_MIN) & (<= AMOUNT_MAX).
expenses_in_range = df.loc[
    df['HEALTHCARE_EXPENSES'].between(AMOUNT_MIN, AMOUNT_MAX),
    'HEALTHCARE_EXPENSES',
]
coverage_in_range = df.loc[
    df['HEALTHCARE_COVERAGE'].between(AMOUNT_MIN, AMOUNT_MAX),
    'HEALTHCARE_COVERAGE',
]

# Both histograms share the same 30 bins. With bins=30 on each call, every
# series would get its own bin edges derived from its own min/max, so the
# overlaid bars would have different widths and positions and their heights
# could not be compared.
bin_edges = np.linspace(AMOUNT_MIN, AMOUNT_MAX, 31)

plt.figure()
ax = expenses_in_range.plot(
    kind='hist', bins=bin_edges, alpha=0.5, label='Expenses')
ax1 = coverage_in_range.plot(
    kind='hist', bins=bin_edges, alpha=0.7, label='Coverage')

plt.legend()
plt.title('Healthcare Expenses vs Coverage for Patients')
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.savefig('../plots/patients/expenses.png')
plt.show()

In [None]:
# Age is measured at the recorded death date when there is one, otherwise as
# of today. The reference date lives in a local Series so that DEATHDATE
# stays missing where no death is recorded; writing today's date into the
# column would make those patients indistinguishable from patients who died
# today.
birth_date = df['BIRTHDATE']
reference_date = df['DEATHDATE'].fillna(pd.Timestamp.today())

# Completed years: a plain difference of calendar years is one too high
# whenever the birthday has not yet been reached in the reference year.
birthday_pending = (
    (reference_date.dt.month < birth_date.dt.month)
    | ((reference_date.dt.month == birth_date.dt.month)
       & (reference_date.dt.day < birth_date.dt.day))
)
df['AGE'] = (reference_date.dt.year - birth_date.dt.year
             - birthday_pending.astype(int))

# Expenses restricted to the inclusive range 100,000 - 3,000,000
# (the same bounds as 0.01e7 and 0.3e7).
expenses = df.loc[
    df['HEALTHCARE_EXPENSES'].between(100_000, 3_000_000),
    'HEALTHCARE_EXPENSES',
]

# Plot from a frame that holds exactly the in-range rows and refer to columns
# by name, rather than combining data=df with a shorter x vector and relying
# on index alignment to discard the out-of-range rows.
plt.figure()
sns.scatterplot(data=df.loc[expenses.index], x='HEALTHCARE_EXPENSES', y='AGE',
                hue='GENDER', palette='pastel')
plt.title('Healthcare Expenses by Age')
plt.savefig('../plots/patients/expenses_age.png')
plt.show()

# Age distribution of all patients, split by gender.
plt.figure()
sns.histplot(data=df, x='AGE', hue='GENDER', palette='bright',
             element='bars', linewidth=0.5)
plt.title('Patient Count by Age and Gender')
plt.savefig('../plots/patients/age_gender.png')
plt.show()