In [1]:
import pandas as pd
from collections import defaultdict
import re

# Convert to DataFrame for easier processing
df = pd.read_csv('C:\\Users\\17675\\Downloads\\medicine_details.csv')

# Step 1: Define a regex pattern for splitting side effects
def split_side_effects(side_effects):
    # Regular expression to match words starting with uppercase letters (and followed by lowercase letters)
    return re.findall(r'[A-Z][a-z]+(?: [A-Z][a-z]*)*', side_effects)

# Step 2: Prepare transactions (compositions and side effects)
transactions = defaultdict(set)  # To hold compositions
side_effects_dict = defaultdict(list)  # To hold side effects

for _, row in df.iterrows():
    # Each medicine is a transaction of components
    composition = row["Composition"].split(' + ')  # Split multiple ingredients
    transactions[row["Medicine Name"]] = composition
    # Store side effects associated with each medicine (using the regex split function)
    side_effects = split_side_effects(row["Side_effects"])
    side_effects_dict[row["Medicine Name"]] = side_effects

# Show a sample transaction (medicine -> components)
for medicine_name, components in transactions.items():
    print(f"{medicine_name}: {components}")
    print(f"Side Effects: {side_effects_dict[medicine_name]}")

Avastin 400mg Injection: ['Bevacizumab (400mg)']
Side Effects: ['Rectal', 'Taste', 'Headache Nosebleeds Back', 'Dry', 'High', 'Protein', 'Inflammation']
Augmentin 625 Duo Tablet: ['Amoxycillin  (500mg)', ' Clavulanic Acid (125mg)']
Side Effects: ['Vomiting Nausea Diarrhea Mucocutaneous']
Azithral 500 Tablet: ['Azithromycin (500mg)']
Side Effects: ['Nausea Abdominal', 'Diarrhea']
Ascoril LS Syrup: ['Ambroxol (30mg/5ml)', 'Levosalbutamol (1mg/5ml)', 'Guaifenesin (50mg/5ml)']
Side Effects: ['Nausea Vomiting Diarrhea Upset', 'Stomach', 'Allergic', 'Dizziness Headache Rash Hives Tremors Palpitations Muscle', 'Increased']
Aciloc 150 Tablet: ['Ranitidine (150mg)']
Side Effects: ['Headache Diarrhea Gastrointestinal']
Allegra 120mg Tablet: ['Fexofenadine (120mg)']
Side Effects: ['Headache Drowsiness Dizziness Nausea']
Avil 25 Tablet: ['Pheniramine (25mg)']
Side Effects: ['Sedation']
Aricep 5 Tablet: ['Donepezil (5mg)']
Side Effects: ['Common', 'Urinary', 'Rash Nausea Diarrhea Insomnia', 'Weight

In [2]:
from itertools import combinations
# Apply A-Priori algorithm (same as in previous example)
def a_priori(transactions, min_support=0.01):
    item_counts = defaultdict(int)
    for transaction in transactions.values():
        for item in transaction:
            item_counts[item] += 1

    num_transactions = len(transactions)
    frequent_itemsets = {item: count / num_transactions for item, count in item_counts.items() if count / num_transactions >= min_support}

    current_itemsets = list(frequent_itemsets.keys())
    all_frequent_itemsets = dict(frequent_itemsets)

    k = 2
    while current_itemsets:
        next_itemsets = defaultdict(int)
        for transaction in transactions.values():
            for itemset in combinations(transaction, k):
                if all(item in transaction for item in itemset):
                    next_itemsets[itemset] += 1
        
        frequent_itemsets_k = {itemset: count / num_transactions for itemset, count in next_itemsets.items() if count / num_transactions >= min_support}
        
        if not frequent_itemsets_k:
            break
        
        all_frequent_itemsets.update(frequent_itemsets_k)
        current_itemsets = list(frequent_itemsets_k.keys())
        k += 1

    return all_frequent_itemsets

# Apply the A-Priori algorithm on the composition data
min_support = 0.001  # 1% support threshold
frequent_itemsets = a_priori(transactions, min_support)

# Show some of the frequent itemsets
for itemset, support in list(frequent_itemsets.items())[:5]:
    print(f"Itemset: {itemset} -> Support: {support:.4f}")

Itemset: Amoxycillin  (500mg) -> Support: 0.0034
Itemset:  Clavulanic Acid (125mg) -> Support: 0.0047
Itemset: Azithromycin (500mg) -> Support: 0.0025
Itemset: Ambroxol (30mg/5ml) -> Support: 0.0038
Itemset: Levosalbutamol (1mg/5ml) -> Support: 0.0027


In [3]:
# Step 3: Map frequent itemsets to side effects
itemset_side_effects = defaultdict(list)

# Loop through the medicines and their side effects
for medicine_name, components in transactions.items():
    side_effects = side_effects_dict[medicine_name]
    for itemset in frequent_itemsets:
        if all(item in components for item in itemset):
            itemset_side_effects[itemset].extend(side_effects)

# 存储去重后的规则
unique_itemsets = defaultdict(set)

# 去重：将成分组合按字母排序
for itemset, side_effects in itemset_side_effects.items():
    # 将 itemset 中的成分进行排序
    sorted_itemset = tuple(sorted(itemset))
    unique_itemsets[sorted_itemset] = side_effects

# Show frequent itemsets and their associated side effects
for itemset, side_effects in itemset_side_effects.items():
    print(f"Itemset: {itemset} -> Associated Side Effects: {set(side_effects)}")

Itemset: ('Amoxycillin  (500mg)', ' Clavulanic Acid (125mg)') -> Associated Side Effects: {'Vomiting Nausea Diarrhea Mucocutaneous'}
Itemset: ('Ambroxol (30mg/5ml)', 'Levosalbutamol (1mg/5ml)') -> Associated Side Effects: {'Bronchitis', 'Increased', 'Allergic', 'Dizziness Vomiting Inflammation', 'Pain Asthma Pharyngitis', 'Nausea Vomiting Diarrhea Upset', 'Stomach', 'Dizziness Headache Rash Hives Tremors Palpitations Muscle'}
Itemset: ('Ambroxol (30mg/5ml)', 'Guaifenesin (50mg/5ml)') -> Associated Side Effects: {'Dizziness Headache Rash Hives Allergic', 'Bronchitis', 'Increased', 'Tremors Increased', 'Nausea Diarrhea Vomiting Stomach', 'Sleepiness', 'Allergic', 'Dizziness Vomiting Inflammation', 'Pain Asthma Pharyngitis', 'Rash Hives Dizziness Headache Allergic', 'Nausea Vomiting Diarrhea Upset', 'Stomach', 'Dizziness Headache Rash Hives Tremors Palpitations Muscle', 'Palpitations'}
Itemset: ('Levosalbutamol (1mg/5ml)', 'Guaifenesin (50mg/5ml)') -> Associated Side Effects: {'Bronchitis