In [220]:
import pandas as pd # for data manipulation
import re # for regular expression operations

In [221]:
# Workbook to analyse (alternative input used previously: 'root.xlsx').
file_name = 'test.xlsx'

# Zero-based positions of the two columns this notebook needs:
#   10 -> treatment method (phương pháp điều trị)
#   11 -> examining doctor (tên bác sĩ)
require_cols = [10, 11]

# Load only those columns; the header is taken from row index 6 of the sheet.
df = pd.read_excel(io=file_name, header=6, usecols=require_cols)
print('Excel file read successfully!\n')

# Keep only rows where both the treatment and the doctor are present.
df = df.dropna(how='any')

print(df.head(5))

Excel file read successfully!

                                Phương pháp điều trị     Y, BS khám bệnh
1  SPORAL  100mg SL: 1 SN: 7; LINEZIN  4mg/ml SL:...  TRỊNH THỊ MINH HẢI
2  SPORAL  100mg SL: 14 SN: 7; LINEZIN  4mg/ml SL...  TRỊNH THỊ MINH HẢI
3  POVIDINE   10%, 20ml SL: 1 SN: ; COTRIMOXAZOLE...         HOÀNG LƯƠNG
4  CIPROBAY 500mg SL: 14 SN: 7; ACEMUC  200mg SL:...         HOÀNG LƯƠNG
5  METRONIDAZOL  250mg SL: 14 SN: 7; AUGMENTIN  1...         HOÀNG LƯƠNG


In [222]:
# The doctor name lives in the right-most column of the loaded frame.
doctor_names = df.iloc[:, -1]

# Keep the first occurrence of every name, preserving order of appearance.
unique_doctor_names = doctor_names[~doctor_names.duplicated(keep='first')]

# Plain Python list of the distinct names, used later as result columns.
doctor_names_list = unique_doctor_names.tolist()

print('Doctor names extracted successfully!\n')

Doctor names extracted successfully!



In [None]:
# Accumulator filled further down in this cell:
# medicine name -> {doctor name -> total prescribed quantity}
medicine_dict = dict()

# Function to parse medicine name and quantity
def parse_medicine(medicine_str):
    """Split one prescription entry into a medicine name and its quantity.

    The entry is expected to start with the medicine name, followed by an
    ``SL:`` marker and an integer quantity, e.g.
    ``'SPORAL  100mg SL: 1 SN: 7'`` -> ``('SPORAL  100mg', '1')``.
    Anything after the quantity is ignored.

    Parameters
    ----------
    medicine_str : str
        A single entry of the treatment text (one ';'-separated piece).

    Returns
    -------
    tuple
        ``(name, quantity)``, both strings. The name has no leading or
        trailing whitespace; internal spacing is preserved. Returns
        ``(None, None)`` when the input is not a string, when no
        ``SL: <digits>`` marker is found, or when the name is blank.
    """
    # Non-string cells (e.g. NaN or numbers) cannot be parsed; report "no match"
    # instead of letting re.match raise a TypeError.
    if not isinstance(medicine_str, str):
        return None, None

    # Accept any amount of whitespace around the 'SL:' marker. Matching exactly
    # one space left a trailing space in names such as 'DECOLGEN ND  SL: 14',
    # so the same medicine could end up under two different keys.
    match = re.match(r'(.+?)\s+SL:\s*(\d+)', medicine_str)
    if match is None:
        return None, None

    name, quantity = match.groups()
    name = name.strip()

    # A whitespace-only name carries no information; treat it as unparseable.
    if not name:
        return None, None

    return name, quantity

# Walk the frame row by row, pairing the treatment text (first column)
# with the doctor who prescribed it (last column).
for treatment_text, doctor_name in zip(df.iloc[:, 0], df.iloc[:, -1]):
    # A treatment cell holds several medicines separated by ';'.
    for entry in treatment_text.split(';'):
        med_name, quantity = parse_medicine(entry.strip())

        # Skip entries the parser could not make sense of.
        if not (med_name and quantity):
            continue

        # First time this medicine shows up: start every doctor at zero so
        # each medicine carries a full set of doctor counters.
        per_doctor = medicine_dict.get(med_name)
        if per_doctor is None:
            per_doctor = dict.fromkeys(doctor_names_list, 0)
            medicine_dict[med_name] = per_doctor

        # Credit the prescribed amount to the doctor of the current row.
        per_doctor[doctor_name] += int(quantity)

print('Medicine names and quantities processed successfully!\n')

# Show a short preview: the first five medicines (in insertion order)
# together with the per-doctor totals collected for each of them.
print('Here are the first 5 medicine names and its quantities prescribed by each doctor:')
for med_name in list(medicine_dict)[:5]:
    print(f"Medicine: {med_name}")

    quantities = medicine_dict[med_name]
    for doctor in quantities:
        quantity = quantities[doctor]
        print(f"  - {doctor}: {quantity}")
print('\n')

Processing doctor: TRỊNH THỊ MINH HẢI
['SPORAL  100mg SL: 1 SN: 7', ' LINEZIN  4mg/ml SL: 14 SN: 7', ' ENTEROGERMINA  4 tỷ/5ml SL: 14 SN: 7', ' SMC AG+ 250ml SL: 1 SN: ', ' BACTAMOX  1g SL: 14 SN: 7']
Processing doctor: TRỊNH THỊ MINH HẢI
['SPORAL  100mg SL: 14 SN: 7', ' LINEZIN  4mg/ml SL: 14 SN: 7', ' ENTEROGERMINA  4 tỷ/5ml SL: 14 SN: 7', ' SMC AG+ 250ml SL: 1 SN: ', ' BACTAMOX  1g SL: 14 SN: 7']
Processing doctor: HOÀNG LƯƠNG
['POVIDINE   10%, 20ml SL: 1 SN: ', ' COTRIMOXAZOLE  800/160mg SL: 14 SN: 7', ' DECOLGEN ND  SL: 14 SN: 7', ' XISAT Người lớn 75ml SL: 1 SN: ']
Processing doctor: HOÀNG LƯƠNG
['CIPROBAY 500mg SL: 14 SN: 7', ' ACEMUC  200mg SL: 14 SN: 7', ' TELFAST HD  180mg SL: 7 SN: 7', ' DECOLGEN ND  SL: 14 SN: 7', ' OTRIVIN  0,1% 10ml SL: 1 SN: ']
Processing doctor: HOÀNG LƯƠNG
['METRONIDAZOL  250mg SL: 14 SN: 7', ' AUGMENTIN  1g SL: 14 SN: 7', ' ATUSSIN TABLETS  SL: 21 SN: 7', ' PHOSPHALUGEL 12,38g/ gói 20g SL: 14 SN: 7', ' MEDROL  16mg SL: 7 SN: 7', ' XISAT Người lớn 75ml

In [224]:
# Turn the nested dictionary into a table:
# one row per medicine, one column per doctor.
result_df = pd.DataFrame.from_dict(
    medicine_dict,
    orient='index',
    columns=doctor_names_list,
)

print('Writing results...\n')

# Append to the input workbook itself; an existing 'Result' sheet is replaced
# so the cell can be re-run without piling up sheets.
writer_options = dict(mode='a', engine='openpyxl', if_sheet_exists='replace')
with pd.ExcelWriter(file_name, **writer_options) as writer:
    result_df.to_excel(writer, sheet_name='Result', startrow=0)

print('>>> ALL DONE! <<<')
print('Medicine names and quantities of each doctor have been written to the "Result" sheet.')

Writing results...

>>> ALL DONE! <<<
Medicine names and quantities of each doctor have been written to the "Result" sheet.
