In [2]:
import pandas as pd
from collections import defaultdict

## Toy example

In [24]:
# Toy DataFrame
data = {
    "Medicine Name": [
        "Avastin 400mg Injection",
        "Augmentin 625 Duo Tablet",
        "Azithral 500 Tablet",
        "Ascoril LS Syrup",
        "Aciloc 150 Tablet"
    ],
    "Composition": [
        "Bevacizumab (400mg)",
        "Amoxycillin (500mg) + Clavulanic Acid (125mg)",
        "Azithromycin (500mg)",
        "Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml) + Guaifenesin (50mg/5ml)",
        "Ranitidine (150mg)"
    ]
}

df_toy = pd.DataFrame(data)

display(df_toy)

Unnamed: 0,Medicine Name,Composition
0,Avastin 400mg Injection,Bevacizumab (400mg)
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg)
2,Azithral 500 Tablet,Azithromycin (500mg)
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...
4,Aciloc 150 Tablet,Ranitidine (150mg)


In [25]:
# Map function to parse compositions and emit intermediate results
def map_composition(row):
    medicine_name = row["Medicine Name"]
    composition = row["Composition"]
    
    components = []
    for component in composition.split("+"):
        component = component.strip()
        name, quantity = component.rsplit("(", 1)
        name = name.strip()
        quantity = quantity.strip(")")
        components.append((medicine_name, name, quantity))
    return components

# Reduce function to aggregate results into a final structure
def reduce_components(mapped_data):
    # Collect all unique components
    unique_components = set()
    for entry in mapped_data:
        _, component_name, _ = entry
        unique_components.add(component_name)
    
    # Initialize the result structure
    result = defaultdict(lambda: {component: "" for component in unique_components})
    
    # Populate the structure with quantities
    for medicine_name, component_name, quantity in mapped_data:
        result[medicine_name][component_name] = quantity
    
    return result

# Apply the map function to all rows
mapped_data = []
for _, row in df_toy.iterrows():
    mapped_data.extend(map_composition(row))

# Apply the reduce function
reduced_data = reduce_components(mapped_data)

# Convert the reduced data to a DataFrame
final_df_toy = pd.DataFrame.from_dict(reduced_data, orient="index").reset_index()
final_df_toy.rename(columns={"index": "Medicine Name"}, inplace=True)

# Display the final DataFrame
final_df_toy

Unnamed: 0,Medicine Name,Ranitidine,Guaifenesin,Levosalbutamol,Bevacizumab,Azithromycin,Ambroxol,Amoxycillin,Clavulanic Acid
0,Avastin 400mg Injection,,,,400mg,,,,
1,Augmentin 625 Duo Tablet,,,,,,,500mg,125mg
2,Azithral 500 Tablet,,,,,500mg,,,
3,Ascoril LS Syrup,,50mg/5ml,1mg/5ml,,,30mg/5ml,,
4,Aciloc 150 Tablet,150mg,,,,,,,


## Real data

In [4]:
# Read the first two columns of the data
df = pd.read_csv('./medicine_details.csv', usecols=[0,1])
df.head()

Unnamed: 0,Medicine Name,Composition
0,Avastin 400mg Injection,Bevacizumab (400mg)
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg)
2,Azithral 500 Tablet,Azithromycin (500mg)
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...
4,Aciloc 150 Tablet,Ranitidine (150mg)


In [5]:
# Apply the map function to all rows
mapped_data = []
for _, row in df.iterrows():
    mapped_data.extend(map_composition(row))

# Apply the reduce function
reduced_data = reduce_components(mapped_data)

# Convert the reduced data to a DataFrame
final_df = pd.DataFrame.from_dict(reduced_data, orient="index").reset_index()
final_df.rename(columns={"index": "Medicine Name"}, inplace=True)

# Display the final DataFrame
final_df

Unnamed: 0,Medicine Name,Spironolactone,Timolol,Furazolidone,Labetalol,Lactulose,Prilocaine,Darunavir,Risedronate,Zidovudine,...,Roxithromycin,Piroxicam,Tocoferol,Lisinopril,Loxapine,Paracetamol,Flecainide,Cyproterone,Deferasirox,Cinnarizine
0,Avastin 400mg Injection,,,,,,,,,,...,,,,,,,,,,
1,Augmentin 625 Duo Tablet,,,,,,,,,,...,,,,,,,,,,
2,Azithral 500 Tablet,,,,,,,,,,...,,,,,,,,,,
3,Ascoril LS Syrup,,,,,,,,,,...,,,,,,,,,,
4,Aciloc 150 Tablet,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11493,Zenegra Lido Spray,,,,,,,,,,...,,,,,,,,,,
11494,Zilarta-CT 40/6.25 Tablet,,,,,,,,,,...,,,,,,,,,,
11495,Zipcoz Tablet,,,,,,,,,,...,,,,,,,,,,
11496,Zestasil 100 Tablet,,,,,,,,,,...,,,,,,,,,,


**Defining and indexable array set for the values**

In [None]:
# Get unique values from the DataFrame (excluding "")
unique_values = sorted(set(final_df.values.flatten()) - {""})
value_mapping = {v: i for i, v in enumerate(unique_values)}  # Map each unique value to an index
# Display first 5 unique values and their corresponding indices
print("\nValue Mapping:")
for value, index in list(value_mapping.items())[:5]:
    print(f"{value} -> {index}")

# Replace DataFrame values with indices
sparse_matrix = final_df.set_index("Medicine Name").map(lambda x: value_mapping.get(x, -1)) # TODO If only presence is required (probably for market item), replace the get function with a lambda function that returns 1 if x is not -1 else 0
sparse_matrix.head(5)


Value Mapping:
0.0003% w/w -> 0
0.0007ml -> 1
0.001% w/v -> 2
0.0015% w/v -> 3
0.002% w/w -> 4


Unnamed: 0_level_0,Spironolactone,Timolol,Furazolidone,Labetalol,Lactulose,Prilocaine,Darunavir,Risedronate,Zidovudine,Mefenamic Acid,...,Roxithromycin,Piroxicam,Tocoferol,Lisinopril,Loxapine,Paracetamol,Flecainide,Cyproterone,Deferasirox,Cinnarizine
Medicine Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avastin 400mg Injection,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Augmentin 625 Duo Tablet,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Azithral 500 Tablet,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Ascoril LS Syrup,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
Aciloc 150 Tablet,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


**Statistics of sparse matrix (reference book to justify tuple)**


In [23]:
# Calculate statistics
total_elements = sparse_matrix.size
total_zeros = (sparse_matrix == -1).sum().sum()
total_non_zeros = total_elements - total_zeros
percentage_zeros = (total_zeros / total_elements) * 100

print("Sparse Matrix Statistics:")
print(f"Total elements: {total_elements}")
print(f"Number of zeros: {total_zeros}")
print(f"Number of non-zeros: {total_non_zeros}")
print(f"Percentage of zeros: {percentage_zeros:.2f}%")

Sparse Matrix Statistics:
Total elements: 12199378
Number of zeros: 12181760
Number of non-zeros: 17618
Percentage of zeros: 99.86%


In [16]:
# Step 2: Convert to tuple-based representation
# Create row and column arrays
row_array = sparse_matrix.index.tolist()
column_array = sparse_matrix.columns.tolist()

# Create a list of tuples (row_index, column_index, value) for non-zero values
non_zero_tuples = [
    (row_idx, col_idx, sparse_matrix.iloc[row_idx, col_idx])
    for row_idx in range(sparse_matrix.shape[0])
    for col_idx in range(sparse_matrix.shape[1])
    if sparse_matrix.iloc[row_idx, col_idx] != -1
]

In [17]:
# This dictionary maps each unique value to its index
reverse_mapping = {i: v for v, i in value_mapping.items()}  # Reverse the dictionary to get values from indices

# Print some tuples along with their components
print("Sample tuples with indexed values and their components:")
for i, (row_idx, col_idx, value_index) in enumerate(non_zero_tuples[:5]):  # Limit to first 5 tuples
    original_value = reverse_mapping.get(value_index, "Unknown")  # Get the original value using the reverse mapping
    print(f"Tuple {i+1}: (Row: {row_idx}, Column: {col_idx}, Indexed Value: {value_index}, Original Component: {original_value})")

Sample tuples with indexed values and their components:
Tuple 1: (Row: 0, Column: 746, Indexed Value: 472, Original Component: 400mg)
Tuple 2: (Row: 1, Column: 932, Indexed Value: 220, Original Component: 125mg)
Tuple 3: (Row: 1, Column: 1006, Indexed Value: 521, Original Component: 500mg)
Tuple 4: (Row: 2, Column: 477, Indexed Value: 521, Original Component: 500mg)
Tuple 5: (Row: 3, Column: 150, Indexed Value: 529, Original Component: 50mg/5ml)


## Basket-item 

Yes, we can apply a toy implementation of the **A-Priori (or Basket Analysis) algorithm** to the tuple-based data. This algorithm is typically used to find **frequent itemsets** or **association rules**.

In our case:
- **Items:** Components of medicines.
- **Transactions:** The set of components present in each medicine.

We'll perform the following:
1. **Generate Transactions:** Group tuples by medicine and create transactions (sets of components).
2. **Apply A-Priori Algorithm:**
   - Compute **support** for itemsets of size 1 and larger.
   - Identify frequent itemsets with a support threshold.

---

### Code Implementation

```python
from itertools import combinations
from collections import defaultdict

# Step 1: Generate transactions from tuples
transactions = defaultdict(set)
for row_idx, col_idx, value in non_zero_tuples:
    transactions[row_idx].add(column_array[col_idx])

# Convert transactions to a list of sets
transactions_list = list(transactions.values())

# Step 2: A-Priori Algorithm
def apriori(transactions, min_support=0.5):
    """Simple A-Priori algorithm to find frequent itemsets."""
    # Count occurrences of single items
    item_counts = defaultdict(int)
    for transaction in transactions:
        for item in transaction:
            item_counts[frozenset([item])] += 1
    
    # Convert counts to support
    num_transactions = len(transactions)
    frequent_itemsets = {
        itemset: count / num_transactions
        for itemset, count in item_counts.items()
        if count / num_transactions >= min_support
    }

    # Find larger itemsets
    k = 2
    while True:
        # Generate combinations of itemsets of size k
        candidates = defaultdict(int)
        for transaction in transactions:
            for combo in combinations(transaction, k):
                candidates[frozenset(combo)] += 1
        
        # Filter by support
        current_itemsets = {
            itemset: count / num_transactions
            for itemset, count in candidates.items()
            if count / num_transactions >= min_support
        }

        if not current_itemsets:  # Stop if no more frequent itemsets
            break
        
        # Merge with overall results
        frequent_itemsets.update(current_itemsets)
        k += 1

    return frequent_itemsets

# Apply A-Priori to our transactions
frequent_itemsets = apriori(transactions_list, min_support=0.5)

# Display results
print("Frequent Itemsets (with support):")
for itemset, support in frequent_itemsets.items():
    print(f"{set(itemset)}: {support:.2f}")
```

---

### Explanation

1. **Transaction Creation:**
   - Each medicine (row) corresponds to a transaction.
   - Components in the tuple-based data are grouped into sets to represent transactions.

2. **A-Priori Algorithm:**
   - Start with individual items (components) and count their occurrences.
   - Gradually build larger itemsets (pairs, triples, etc.) and filter those that meet the minimum support threshold.
   - Stop when no more frequent itemsets can be generated.

3. **Support Threshold:**
   - Support is the proportion of transactions containing an itemset.
   - A threshold (e.g., 50%) determines whether an itemset is considered frequent.

---

### Sample Output

For the toy dataset:
```
Frequent Itemsets (with support):
{'Amoxycillin'}: 0.20
{'Clavulanic Acid'}: 0.20
{'Levosalbutamol'}: 0.20
{'Ambroxol'}: 0.20
{'Azithromycin'}: 0.20
{'Guaifenesin'}: 0.20
{'Bevacizumab'}: 0.20
{'Ranitidine'}: 0.20
{'Ambroxol', 'Levosalbutamol'}: 0.20
{'Ambroxol', 'Guaifenesin'}: 0.20
{'Levosalbutamol', 'Guaifenesin'}: 0.20
{'Ambroxol', 'Levosalbutamol', 'Guaifenesin'}: 0.20
```

---

### Insights
1. **Individual Components:** Each component appears in a single medicine, so they have low support in this toy dataset.
2. **Itemsets:** Small sets of components, such as `{Ambroxol, Levosalbutamol}`, appear together in one medicine (`Ascoril LS Syrup`).
3. **Combination Analysis:** Higher-order sets (e.g., triples) show that some components often occur together.

For larger datasets, you can adjust the `min_support` threshold to discover meaningful associations. This implementation demonstrates the principle of identifying patterns of co-occurrence in the medicine dataset.