In [4]:
import json
from collections import Counter
from datetime import datetime

In [5]:
# Location of the consultation export that every later cell reads from.
file_path = '/content/DataEngineeringQ2.json'
# Pin the codec: without an explicit encoding, open() falls back to the
# platform's locale default, which can mis-decode non-ASCII text in the JSON.
with open(file_path, 'r', encoding='utf-8') as file:
    data = json.load(file)

def calculate_percentage(part, whole):
    """Return `part` as a percentage of `whole`, rounded to two decimals.

    Args:
        part: Size of the subset being measured.
        whole: Size of the full set.

    Returns:
        float: ``part / whole * 100`` rounded to 2 places. When ``whole`` is
        zero the result is ``0.0`` rather than a ``ZeroDivisionError``, so an
        empty dataset does not abort the notebook.
    """
    if whole == 0:
        return 0.0
    return round((part / whole) * 100, 2)

# Pull the two sub-structures used by the analysis out of every record.
# Both lists keep the order of `data`, so index i in one matches index i
# in the other.
patient_details = [
    record['patientDetails']
    for record in data
]
medicines_data = [
    record['consultationData']['medicines']
    for record in data
]

In [12]:
# 1. Percentage of patients with no firstName, lastName or birthDate
def _count_missing(field_name):
    """Count patient records whose `field_name` is absent, None or ''."""
    return len([
        record for record in patient_details
        if record.get(field_name) in ("", None)
    ])


total_patients = len(patient_details)

missing_first_name = _count_missing('firstName')
missing_last_name = _count_missing('lastName')
missing_dob = _count_missing('birthDate')

# Convert each raw count into a share of all patients.
percent_missing_first_name, percent_missing_last_name, percent_missing_dob = (
    calculate_percentage(missing_count, total_patients)
    for missing_count in (missing_first_name, missing_last_name, missing_dob)
)



In [7]:
# 2. Percentage of female patients once missing genders are filled in
# Read every gender once; entries that are None or '' count as missing.
raw_genders = [record.get('gender') for record in patient_details]
genders = [value for value in raw_genders if value not in ("", None)]

# The most common recorded gender is the fill value for the gaps.
mode_gender = Counter(genders).most_common(1)[0][0]

# Replace each missing entry with the mode; keep recorded values untouched.
imputed_genders = [
    mode_gender if value in ("", None) else value
    for value in raw_genders
]
female_count = imputed_genders.count('F')
percent_female_after_imputation = calculate_percentage(female_count, total_patients)

In [8]:
# 3. Derive an ageGroup label per patient and count the number of Adults
# age_groups stays aligned with patient_details; None marks an unknown age.
age_groups = []
current_date = datetime.now()

for patient in patient_details:
    birth_date_str = patient.get('birthDate')
    if birth_date_str in ["", None]:
        age_groups.append(None)
        continue

    birth_date = datetime.strptime(birth_date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    # Age in completed calendar years. Dividing elapsed days by 365 ignores
    # leap days and rolls the age over a few days before the real birthday
    # (e.g. born 2000-01-01 reads as 20 on 2019-12-28 instead of 19).
    birthday_passed = (current_date.month, current_date.day) >= (birth_date.month, birth_date.day)
    age = current_date.year - birth_date.year - (0 if birthday_passed else 1)

    if age < 0:
        # A birth date in the future is bad data, not a 'Senior'.
        age_groups.append(None)
    elif age <= 12:
        age_groups.append('Child')
    elif age <= 19:
        age_groups.append('Teen')
    elif age <= 59:
        age_groups.append('Adult')
    else:
        age_groups.append('Senior')

adult_count = age_groups.count('Adult')

In [9]:

# 4. Average number of medicines prescribed per patient record
# One count per consultation, then summed and averaged over all patients.
medicines_per_consultation = [len(prescription) for prescription in medicines_data]
total_medicines_prescribed = sum(medicines_per_consultation)
average_medicines_prescribed = round(
    total_medicines_prescribed / total_patients,
    2,
)

In [10]:

# 5. Determine the 3rd most frequently prescribed medicineName
# Flatten every prescription into a single list of medicine names.
all_medicine_names = [medicine['medicineName'] for medicines in medicines_data for medicine in medicines]
medicine_name_counts = Counter(all_medicine_names)

# most_common(3) returns fewer than three entries when fewer than three
# distinct names exist. Indexing [-1] would then report the 1st or 2nd name
# as "third", or raise IndexError on no data, so report None instead.
top_three_medicines = medicine_name_counts.most_common(3)
third_most_frequent_medicine = top_three_medicines[2][0] if len(top_three_medicines) == 3 else None

In [11]:
# 6. Percentage split between active and inactive medicines
total_medicines = len(all_medicine_names)

# One truthiness flag per prescribed medicine; summing the flags counts the active ones.
active_flags = [
    bool(item['isActive'])
    for prescription in medicines_data
    for item in prescription
]
active_medicines = sum(active_flags)
# Everything that is not active is counted as inactive.
inactive_medicines = total_medicines - active_medicines

percent_active_medicines = calculate_percentage(active_medicines, total_medicines)
percent_inactive_medicines = calculate_percentage(inactive_medicines, total_medicines)

# Summary of every metric computed above, shown as the cell's output.
dict(
    percent_missing_first_name=percent_missing_first_name,
    percent_missing_last_name=percent_missing_last_name,
    percent_missing_dob=percent_missing_dob,
    percent_female_after_imputation=percent_female_after_imputation,
    adult_count=adult_count,
    average_medicines_prescribed=average_medicines_prescribed,
    third_most_frequent_medicine=third_most_frequent_medicine,
    percent_active_medicines=percent_active_medicines,
    percent_inactive_medicines=percent_inactive_medicines,
)

{'percent_missing_first_name': 0.0,
 'percent_missing_last_name': 70.97,
 'percent_missing_dob': 32.26,
 'percent_female_after_imputation': 32.26,
 'adult_count': 21,
 'average_medicines_prescribed': 2.13,
 'third_most_frequent_medicine': 'C',
 'percent_active_medicines': 69.7,
 'percent_inactive_medicines': 30.3}

In [16]:
def is_valid_indian_phone_number(phone_number):
    """Check whether `phone_number` is a valid Indian mobile number.

    Accepted forms are a bare 10-digit number, or that number prefixed with
    the country code as ``+91`` or ``91``. The 10-digit part must lie between
    6000000000 and 9999999999 inclusive.

    Args:
        phone_number: The value to validate. Anything that is not a ``str``
            (for example ``None`` for a missing field) is treated as invalid.

    Returns:
        bool: True if the number is valid, False otherwise.
    """
    if not isinstance(phone_number, str):
        return False

    if phone_number.startswith('+91'):
        phone_number = phone_number[3:]
    elif phone_number.startswith('91') and len(phone_number) == 12:
        # Only treat a leading "91" as the country code when 10 digits follow;
        # otherwise a genuine 10-digit number such as 9123456789 would lose
        # its first two digits and be rejected.
        phone_number = phone_number[2:]

    # isascii() keeps out characters like '²' that satisfy isdigit() but
    # make int() raise ValueError.
    if len(phone_number) == 10 and phone_number.isascii() and phone_number.isdigit():
        return 6000000000 <= int(phone_number) <= 9999999999
    return False

# Flag every patient record with the validation result and keep a running
# total of the valid numbers. Note: this writes 'isValidMobile' into the
# patient dicts in place.
valid_phone_numbers_count = 0
for record in patient_details:
    number_ok = bool(is_valid_indian_phone_number(record.get('phoneNumber', '')))
    record['isValidMobile'] = number_ok
    if number_ok:
        valid_phone_numbers_count += 1

valid_phone_numbers_count
from scipy.stats import pearsonr

def calculate_age(birth_date_str, reference_date=None):
    """Return the age in completed years for a birth-date string.

    Args:
        birth_date_str: Birth date formatted as ``%Y-%m-%dT%H:%M:%S.%fZ``,
            or ``""`` / ``None`` when unknown.
        reference_date: ``datetime`` the age is measured at. Defaults to the
            notebook-level ``current_date``, so existing one-argument calls
            behave as before.

    Returns:
        int | None: Completed calendar years between the birth date and the
        reference date, or ``None`` when the birth date is missing.

    Raises:
        ValueError: If ``birth_date_str`` does not match the expected format.
    """
    if birth_date_str in ["", None]:
        return None
    if reference_date is None:
        reference_date = current_date
    birth_date = datetime.strptime(birth_date_str, "%Y-%m-%dT%H:%M:%S.%fZ")
    # Count calendar years and subtract one if this year's birthday has not
    # happened yet. Dividing elapsed days by 365 ignores leap days and
    # over-counts by a year shortly before a birthday.
    birthday_passed = (reference_date.month, reference_date.day) >= (birth_date.month, birth_date.day)
    age = reference_date.year - birth_date.year - (0 if birthday_passed else 1)
    return age

# Pair each patient's age with the size of their prescription, keeping only
# patients whose age is known so both series stay the same length.
age_medicine_pairs = [
    (calculate_age(record.get('birthDate')), len(prescription))
    for record, prescription in zip(patient_details, medicines_data)
]
known_pairs = [pair for pair in age_medicine_pairs if pair[0] is not None]

ages = [known_age for known_age, _count in known_pairs]
num_medicines_prescribed = [count for _known_age, count in known_pairs]

# pearsonr returns (coefficient, p-value); only the coefficient is reported.
correlation = round(pearsonr(num_medicines_prescribed, ages)[0], 2)
correlation

-0.22