In [1]:
import pandas as pd
import itertools
import joblib
import numpy as np

### 1. Generate Mechanism Database

In [2]:
# Presence flag used for every species column: 0 = absent, 1 = present.
binary_values = [0, 1]

# 'R_left' has its own value list so its range can be changed independently of the other columns.
values_for_R_left = [0, 1]

# Two reactions: species columns that may appear in step 1 and in step 2.
# 'R_left' must remain the last entry of each list (checked in _enumerate_step_rows).
two_rxn_1 = ['SM_left', 'C_left', 'B_left', 'P_right', 'IMP1_right', 'INT1_right', 'C_right', 'B_right', 'R_left']
two_rxn_2 = ['SM_left', 'C_left', 'B_left', 'P_left', 'INT1_left', 'P_right', 'IMP2_right', 'C_right', 'B_right', 'R_left']

# One-element tuples holding each allowed 'R_left' value, ready to be appended to a row tuple.
combinations_R_left = list(itertools.product(values_for_R_left, repeat=1))


def _enumerate_step_rows(species_columns):
    """Return every value assignment for one step as a list of tuples.

    All columns except the trailing 'R_left' range over ``binary_values``;
    'R_left' ranges over ``values_for_R_left`` and varies fastest.

    Raises:
        ValueError: if 'R_left' is not the last entry of ``species_columns``,
            because the tuples are built with the 'R_left' value appended last.
    """
    if species_columns[-1] != 'R_left':
        raise ValueError("'R_left' must be the last species column")
    leading_values = itertools.product(binary_values, repeat=len(species_columns) - 1)
    return [comb + comb_R_left for comb in leading_values for comb_R_left in combinations_R_left]


combinations2_1 = _enumerate_step_rows(two_rxn_1)
combinations2_2 = _enumerate_step_rows(two_rxn_2)

# One frame per step, tagged with the mechanism size and the step number.
df2_1 = pd.DataFrame(combinations2_1, columns=two_rxn_1).assign(**{'Number of Reactions': 2, 'Step': 1})
df2_2 = pd.DataFrame(combinations2_2, columns=two_rxn_2).assign(**{'Number of Reactions': 2, 'Step': 2})

# Combine all DataFrames
df_combined = pd.concat([df2_1, df2_2], ignore_index=True)

# Columns that exist in only one step come out of the concat as NaN; treat them as 0 and go back to integers.
df_combined = df_combined.fillna(0).astype(int)

columns_to_move = ["Number of Reactions", "Step"]

# Remaining columns, in the order the concat produced them
columns_in_original_order = [col for col in df_combined.columns if col not in columns_to_move]

# Put the two bookkeeping columns first
new_order = columns_to_move + columns_in_original_order

df_combined = df_combined[new_order]
df_combined


Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right
0,2,1,0,0,0,0,0,0,0,0,0,0,0,0
1,2,1,0,0,0,0,0,0,0,0,1,0,0,0
2,2,1,0,0,0,0,0,0,0,1,0,0,0,0
3,2,1,0,0,0,0,0,0,0,1,1,0,0,0
4,2,1,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,2,2,1,1,1,1,0,0,0,1,1,1,1,1
1532,2,2,1,1,1,1,0,0,1,0,0,1,1,1
1533,2,2,1,1,1,1,0,0,1,0,1,1,1,1
1534,2,2,1,1,1,1,0,0,1,1,0,1,1,1


In [3]:
# Convert one table row into a "left -> right" reaction string
def row_to_reaction(row):
    """Render a row of species flags as a reaction string.

    Args:
        row: mapping-like object (e.g. a DataFrame row) indexed by the
            species column names listed below.

    Returns:
        str: the left-hand labels joined by ' + ', then ' -> ', then the
        right-hand labels joined by ' + '. A side with no species present
        is rendered as an empty string.
    """
    # (column, side, label) in the order the columns are inspected; the labels
    # of each side appear in the output in this same relative order.
    species_table = (
        ('SM_left', 'left', 'SM'),
        ('P_left', 'left', 'P'),
        ('C_left', 'left', 'C'),
        ('B_left', 'left', 'B'),
        ('P_right', 'right', 'P'),
        ('IMP1_right', 'right', 'IMP1'),
        ('IMP2_right', 'right', 'IMP2'),
        ('C_right', 'right', 'C'),
        ('B_right', 'right', 'B'),
        ('R_left', 'left', 'R'),
        ('INT1_left', 'left', 'INT1'),
        ('INT1_right', 'right', 'INT1'),
    )

    labels_by_side = {'left': [], 'right': []}
    for column, side, label in species_table:
        value = row[column]
        # The reagent counts only when its flag is exactly 1; every other
        # species counts for any positive value.
        is_present = (value == 1) if column == 'R_left' else (value > 0)
        if is_present:
            labels_by_side[side].append(label)

    lhs = ' + '.join(labels_by_side['left'])
    rhs = ' + '.join(labels_by_side['right'])
    return lhs + ' -> ' + rhs

# Render every row with row_to_reaction and store the result in the 'Reaction Format' column
reaction_strings = df_combined.apply(row_to_reaction, axis=1)
df_combined['Reaction Format'] = reaction_strings

df_combined

Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format
0,2,1,0,0,0,0,0,0,0,0,0,0,0,0,->
1,2,1,0,0,0,0,0,0,0,0,1,0,0,0,R ->
2,2,1,0,0,0,0,0,0,0,1,0,0,0,0,-> B
3,2,1,0,0,0,0,0,0,0,1,1,0,0,0,R -> B
4,2,1,0,0,0,0,0,0,1,0,0,0,0,0,-> C
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,2,2,1,1,1,1,0,0,0,1,1,1,1,1,SM + P + C + B + R + INT1 -> P + IMP2 + B
1532,2,2,1,1,1,1,0,0,1,0,0,1,1,1,SM + P + C + B + INT1 -> P + IMP2 + C
1533,2,2,1,1,1,1,0,0,1,0,1,1,1,1,SM + P + C + B + R + INT1 -> P + IMP2 + C
1534,2,2,1,1,1,1,0,0,1,1,0,1,1,1,SM + P + C + B + INT1 -> P + IMP2 + C + B


In [4]:
# Report how many candidate rows exist for each step
count_rows = int(df_combined['Step'].eq(1).sum())
print(f'Number of rows where Step is equal to \'1\': {count_rows}')

count_rows2 = int(df_combined['Step'].eq(2).sum())
print(f'Number of rows where Step is equal to \'2\': {count_rows2}')

Number of rows where Step is equal to '1': 512
Number of rows where Step is equal to '2': 1024


In [5]:
# Criteria 1: Each step must have at least one species on the left side and at least one on the right side of the reaction.

# Select columns whose name contains "_left" / "_right"
left_columns = df_combined.filter(like="_left")
right_columns = df_combined.filter(like="_right")

# Number of species present on each side of every row
row_sums_left = left_columns.sum(axis=1)
row_sums_right = right_columns.sum(axis=1)

# Keep rows with a non-empty left side AND a non-empty right side.
# Both masks are computed on the unfiltered frame, so they are combined first and applied
# in a single indexing step; filtering twice in a row would index the already-filtered
# frame with a mask of the original length, which makes pandas reindex the mask and warn.
has_both_sides = (row_sums_left > 0) & (row_sums_right > 0)
df_combined = df_combined[has_both_sides]

df_combined

  df_combined = df_combined[row_sums_right > 0]


Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format
3,2,1,0,0,0,0,0,0,0,1,1,0,0,0,R -> B
5,2,1,0,0,0,0,0,0,1,0,1,0,0,0,R -> C
7,2,1,0,0,0,0,0,0,1,1,1,0,0,0,R -> C + B
9,2,1,0,0,0,0,0,1,0,0,1,0,0,0,R -> INT1
11,2,1,0,0,0,0,0,1,0,1,1,0,0,0,R -> B + INT1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,2,2,1,1,1,1,0,0,0,1,1,1,1,1,SM + P + C + B + R + INT1 -> P + IMP2 + B
1532,2,2,1,1,1,1,0,0,1,0,0,1,1,1,SM + P + C + B + INT1 -> P + IMP2 + C
1533,2,2,1,1,1,1,0,0,1,0,1,1,1,1,SM + P + C + B + R + INT1 -> P + IMP2 + C
1534,2,2,1,1,1,1,0,0,1,1,0,1,1,1,SM + P + C + B + INT1 -> P + IMP2 + C + B


In [6]:
# Criteria 2: Each step should have at least one species other than the base and catalyst.

left_columns = df_combined.filter(like="_left")
row_sums_left = left_columns.sum(axis=1)

right_columns = df_combined.filter(like="_right")
row_sums_right = right_columns.sum(axis=1)

# Number of base (B) and catalyst (C) entries on each side.
# The flag columns are already integers (the whole table is cast with astype(int) when it is
# built), so no per-column conversion is done here; assigning columns on a frame that came out
# of a boolean filter is also the pattern that can raise SettingWithCopyWarning.
base_catalyst_left = df_combined['B_left'] + df_combined['C_left']
base_catalyst_right = df_combined['B_right'] + df_combined['C_right']

# A side breaks the rule when it is non-empty and consists of B and/or C only
# (B alone, C alone, or B together with C). This relies on the B/C flags being 0 or 1.
only_base_catalyst_left = (row_sums_left > 0) & (row_sums_left == base_catalyst_left)
only_base_catalyst_right = (row_sums_right > 0) & (row_sums_right == base_catalyst_right)

# Rows to drop: either side is made up of base/catalyst only
final_condition = only_base_catalyst_left | only_base_catalyst_right

# Apply the final condition to filter the DataFrame
df_combined = df_combined[~final_condition]
df_combined

Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format
9,2,1,0,0,0,0,0,1,0,0,1,0,0,0,R -> INT1
11,2,1,0,0,0,0,0,1,0,1,1,0,0,0,R -> B + INT1
13,2,1,0,0,0,0,0,1,1,0,1,0,0,0,R -> C + INT1
15,2,1,0,0,0,0,0,1,1,1,1,0,0,0,R -> C + B + INT1
17,2,1,0,0,0,0,1,0,0,0,1,0,0,0,R -> IMP1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1531,2,2,1,1,1,1,0,0,0,1,1,1,1,1,SM + P + C + B + R + INT1 -> P + IMP2 + B
1532,2,2,1,1,1,1,0,0,1,0,0,1,1,1,SM + P + C + B + INT1 -> P + IMP2 + C
1533,2,2,1,1,1,1,0,0,1,0,1,1,1,1,SM + P + C + B + R + INT1 -> P + IMP2 + C
1534,2,2,1,1,1,1,0,0,1,1,0,1,1,1,SM + P + C + B + INT1 -> P + IMP2 + C + B


In [7]:
def violates(row):
    """Return True when a single step row breaks Criteria 3, 4 or 5, else False.

    Args:
        row: mapping-like object providing 'Step', 'SM_left', 'P_left',
            'P_right', 'B_left', 'B_right', 'C_left' and 'C_right'.
    """
    # Criteria 3: The product should not function as both a reactant and product simultaneously in each step.
    if row['P_left'] == 1 and row['P_right'] == 1:
        return True

    # The remaining criteria only constrain the first step.
    if row['Step'] == 1:
        # Criteria 4: In the first step of the reaction, the starting material must be present on the left side of the reaction.
        if row['SM_left'] == 0:
            return True
        # Criteria 5: In the first step of the reaction, if there is a base or catalyst on the right side of the reaction, they must also be on the left side of the reaction simultaneously.
        if row['B_left'] == 0 and row['B_right'] == 1:
            return True
        if row['C_left'] == 0 and row['C_right'] == 1:
            return True

    return False

# Drop every row flagged by violates (Criteria 3-5)
violation_mask = df_combined.apply(violates, axis=1)
df_combined = df_combined[~violation_mask]

df_combined

Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format
264,2,1,1,0,0,0,0,1,0,0,0,0,0,0,SM -> INT1
265,2,1,1,0,0,0,0,1,0,0,1,0,0,0,SM + R -> INT1
272,2,1,1,0,0,0,1,0,0,0,0,0,0,0,SM -> IMP1
273,2,1,1,0,0,0,1,0,0,0,1,0,0,0,SM + R -> IMP1
280,2,1,1,0,0,0,1,1,0,0,0,0,0,0,SM -> IMP1 + INT1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1515,2,2,1,1,1,0,0,0,0,1,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + B
1516,2,2,1,1,1,0,0,0,1,0,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C
1517,2,2,1,1,1,0,0,0,1,0,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + C
1518,2,2,1,1,1,0,0,0,1,1,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C + B


In [8]:
# Check duplication (not necessary)

# Compare rows on everything from the third column onward, i.e. without the two leading
# bookkeeping columns ('Number of Reactions' and 'Step').
columns_to_check = df_combined.columns[2:]

# Flag rows that have a later identical occurrence (keep='last' leaves the last one unflagged)
duplicate_mask = df_combined.duplicated(subset=columns_to_check, keep='last')

# De-duplicated view, for inspection only: it is not used by the rest of this cell and
# df_combined is NOT reduced to it. Because 'Step' is excluded from the comparison, the same
# species pattern occurring in both steps counts as a duplicate here.
df_unique = df_combined[~duplicate_mask]

# Renumber the rows 0..n-1 without keeping the previous index as a column.
# Reassignment is used instead of inplace=True so the cell yields a fresh, cleanly owned frame.
df_combined = df_combined.reset_index(drop=True)
df_combined



Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format
0,2,1,1,0,0,0,0,1,0,0,0,0,0,0,SM -> INT1
1,2,1,1,0,0,0,0,1,0,0,1,0,0,0,SM + R -> INT1
2,2,1,1,0,0,0,1,0,0,0,0,0,0,0,SM -> IMP1
3,2,1,1,0,0,0,1,0,0,0,1,0,0,0,SM + R -> IMP1
4,2,1,1,0,0,0,1,1,0,0,0,0,0,0,SM -> IMP1 + INT1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,2,2,1,1,1,0,0,0,0,1,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + B
586,2,2,1,1,1,0,0,0,1,0,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C
587,2,2,1,1,1,0,0,0,1,0,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + C
588,2,2,1,1,1,0,0,0,1,1,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C + B


#### Add initial species in the dataframe

In [9]:
# Work on an independent (deep) copy so later column additions do not touch df_combined
df = df_combined.copy(deep=True)

In [10]:
# Columns that contribute to the 'Initial Compounds' label, in output order
columns_to_check = ['SM_left', 'B_left', 'C_left', 'R_left', 'IMP1_right']

# Short name = the part of the column name before the first underscore
short_names = [column_name.split('_')[0] for column_name in columns_to_check]

# One boolean row per table row: True where the checked column is greater than 0
presence_flags = (df[columns_to_check] > 0).to_numpy()

# Comma-join the short names of the flagged columns (empty string when none is flagged)
df['Initial Compounds'] = [
    ','.join(name for name, is_present in zip(short_names, flags) if is_present)
    for flags in presence_flags
]

df

Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format,Initial Compounds
0,2,1,1,0,0,0,0,1,0,0,0,0,0,0,SM -> INT1,SM
1,2,1,1,0,0,0,0,1,0,0,1,0,0,0,SM + R -> INT1,"SM,R"
2,2,1,1,0,0,0,1,0,0,0,0,0,0,0,SM -> IMP1,"SM,IMP1"
3,2,1,1,0,0,0,1,0,0,0,1,0,0,0,SM + R -> IMP1,"SM,R,IMP1"
4,2,1,1,0,0,0,1,1,0,0,0,0,0,0,SM -> IMP1 + INT1,"SM,IMP1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,2,2,1,1,1,0,0,0,0,1,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + B,"SM,B,C,R"
586,2,2,1,1,1,0,0,0,1,0,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C,"SM,B,C"
587,2,2,1,1,1,0,0,0,1,0,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + C,"SM,B,C,R"
588,2,2,1,1,1,0,0,0,1,1,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C + B,"SM,B,C"


In [11]:
# Persist the step table twice: as a spreadsheet (row labels included) and as a compressed joblib file
df.to_excel(excel_writer='database_two_reactions.xlsx', index=True)
joblib.dump(value=df, filename="database_two_reactions.joblib", compress=9)

['database_two_reactions.joblib']

### 2. Generate All Reasonable Combinations for the Two-Reaction Mechanism

In [11]:
# Continue from an independent (deep) copy of the step table built above
df_combined = df.copy(deep=True)
df_combined

Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format,Initial Compounds
0,2,1,1,0,0,0,0,1,0,0,0,0,0,0,SM -> INT1,SM
1,2,1,1,0,0,0,0,1,0,0,1,0,0,0,SM + R -> INT1,"SM,R"
2,2,1,1,0,0,0,1,0,0,0,0,0,0,0,SM -> IMP1,"SM,IMP1"
3,2,1,1,0,0,0,1,0,0,0,1,0,0,0,SM + R -> IMP1,"SM,R,IMP1"
4,2,1,1,0,0,0,1,1,0,0,0,0,0,0,SM -> IMP1 + INT1,"SM,IMP1"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
585,2,2,1,1,1,0,0,0,0,1,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + B,"SM,B,C,R"
586,2,2,1,1,1,0,0,0,1,0,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C,"SM,B,C"
587,2,2,1,1,1,0,0,0,1,0,1,1,1,1,SM + P + C + B + R + INT1 -> IMP2 + C,"SM,B,C,R"
588,2,2,1,1,1,0,0,0,1,1,0,1,1,1,SM + P + C + B + INT1 -> IMP2 + C + B,"SM,B,C"


In [12]:
# Collect the row labels of the step-1 and step-2 candidates of the two-reaction mechanism
is_two_reaction_row = df_combined["Number of Reactions"] == 2

# Rows with "Number of Reactions" == 2 and "Step" == 1
condition_1 = is_two_reaction_row & (df_combined["Step"] == 1)
rows_with_2_rxns_1_step = df_combined.loc[condition_1]

# Rows with "Number of Reactions" == 2 and "Step" == 2
condition_2 = is_two_reaction_row & (df_combined["Step"] == 2)
rows_with_2_rxns_2_step = df_combined.loc[condition_2]

# Row labels of each group as plain Python lists
indexes_with_2_rxns_1_step = rows_with_2_rxns_1_step.index.tolist()
indexes_with_2_rxns_2_step = rows_with_2_rxns_2_step.index.tolist()

# Size of each group
for step_labels in (indexes_with_2_rxns_1_step, indexes_with_2_rxns_2_step):
    print(len(step_labels))

126
464


In [13]:
# Pair every step-1 label with every step-2 label (the step-1 label varies slowest)
combinations_two_rxns = [
    (label_step1, label_step2)
    for label_step1 in indexes_with_2_rxns_1_step
    for label_step2 in indexes_with_2_rxns_2_step
]

# Show the first few pairs as a sanity check
print("First few combinations of indexes:")
print(combinations_two_rxns[:10])

# Number of pairs before any pair-level rule is applied
print("Total number of combinations:", len(combinations_two_rxns))

First few combinations of indexes:
[(0, 126), (0, 127), (0, 128), (0, 129), (0, 130), (0, 131), (0, 132), (0, 133), (0, 134), (0, 135)]
Total number of combinations: 58464


In [14]:
# Criteria 6: At least one step of the reactions must involve product formation.

# Column looked up by row label for both members of each pair
p_right = df_combined["P_right"]

# Keep the pairs in which "P_right" is 1 in the first or in the second step
valid_combinations_rule6_two_reactions = [
    combination
    for combination in combinations_two_rxns
    if p_right.loc[combination[0]] == 1 or p_right.loc[combination[1]] == 1
]

print("Valid combinations that satisfy Rule 6:")
print(valid_combinations_rule6_two_reactions[:10])

print("Total number of valid combinations:", len(valid_combinations_rule6_two_reactions))

Valid combinations that satisfy Rule 6:
[(0, 130), (0, 131), (0, 132), (0, 133), (0, 134), (0, 135), (0, 136), (0, 137), (0, 146), (0, 147)]
Total number of valid combinations: 45504


In [15]:
# Criteria 7: If the product or intermediate are reactants in the second step, they must be formed in the first step.

# Columns looked up by row label inside the loop
p_left = df_combined["P_left"]
p_right = df_combined["P_right"]
int1_left = df_combined["INT1_left"]
int1_right = df_combined["INT1_right"]

valid_combinations_rule7_two_reactions = []

for combination in valid_combinations_rule6_two_reactions:
    idx_row1, idx_row2 = combination

    # Product: either step 1 forms P, or step 1 does not form P and step 2 forms P without consuming it
    product_ok = (p_right.loc[idx_row1] == 1) or (
        p_right.loc[idx_row1] == 0
        and p_left.loc[idx_row2] == 0
        and p_right.loc[idx_row2] == 1
    )

    # Intermediate: INT1 is a reactant of step 2 exactly when it is formed in step 1
    intermediate_ok = int1_left.loc[idx_row2] == int1_right.loc[idx_row1]

    if product_ok and intermediate_ok:
        valid_combinations_rule7_two_reactions.append(combination)

print("Valid combinations that satisfy Rule 7:")
print(valid_combinations_rule7_two_reactions[:10])

print("Total number of valid combinations after applying Rule 7:", len(valid_combinations_rule7_two_reactions))

Valid combinations that satisfy Rule 7:
[(0, 146), (0, 147), (0, 148), (0, 149), (0, 150), (0, 151), (0, 152), (0, 153), (0, 154), (0, 155)]
Total number of valid combinations after applying Rule 7: 23040


In [18]:
# Criteria 8: The overall reaction should have the same amount of base, catalyst, and intermediate on both sides of the reaction.

def is_valid_overall_reaction(combination):
    """Check that C, B and INT1 balance over the two steps of a pair.

    Args:
        combination: (step-1 row label, step-2 row label), both looked up
            in the module-level ``df_combined`` with ``.loc``.

    Returns:
        bool: True when, for each of C, B and INT1, the left-side count summed
        over both steps equals the right-side count summed over both steps;
        False otherwise.
    """
    idx_row1, idx_row2 = combination

    def total(column):
        # Count of one column summed over both steps of the pair
        return df_combined.loc[idx_row1, column] + df_combined.loc[idx_row2, column]

    # (left column, right column) pairs that must balance overall
    balanced_pairs = (
        ('C_left', 'C_right'),
        ('B_left', 'B_right'),
        ('INT1_left', 'INT1_right'),
    )

    for left_column, right_column in balanced_pairs:
        if total(left_column) != total(right_column):
            return False

    return True

# Keep the Rule-7 survivors for which is_valid_overall_reaction returns True
valid_combinations_rule8_two_reactions = list(
    filter(is_valid_overall_reaction, valid_combinations_rule7_two_reactions)
)

print("Valid combinations that satisfy Rule 8:")
print(valid_combinations_rule8_two_reactions[:10])

print("Total number of valid combinations after applying Rule 8:", len(valid_combinations_rule8_two_reactions))

Valid combinations that satisfy Rule 8:
[(0, 146), (0, 147), (0, 154), (0, 155), (0, 200), (0, 201), (0, 208), (0, 209), (0, 254), (0, 255)]
Total number of valid combinations after applying Rule 8: 4000


In [19]:
def is_valid_reaction(combination):
    """Apply Criteria 9-17 to one (step-1, step-2) pair.

    Args:
        combination: (step-1 row label, step-2 row label), both looked up
            in the module-level ``df_combined`` with ``.loc``.

    Returns:
        bool: False as soon as one criterion is violated, True when none is.
    """
    idx_row1, idx_row2 = combination

    def step1(column):
        # Value of a column in the first-step row
        return df_combined.loc[idx_row1, column]

    def step2(column):
        # Value of a column in the second-step row
        return df_combined.loc[idx_row2, column]

    # Criteria 9. If the starting material and reagent are reactants in the first step, they should not exist in the second step simultaneously.
    if (step1("SM_left") == 1 and step1("R_left") > 0
            and step2("SM_left") == 1 and step2("R_left") > 0):
        return False

    # Criteria 10: The first step cannot solely form impurity without simultaneously forming product or intermediate.
    if step1("IMP1_right") == 1 and step1("P_right") == 0 and step1("INT1_right") == 0:
        return False

    # Criteria 11: Only one of the two steps can form a product.
    if step1("P_right") == 1 and step2("P_right") == 1:
        return False

    # Criteria 12: If an intermediate is formed in the first step, the starting material should not be present in the second step. The second step should initiate from the intermediate instead.
    if step1("SM_left") == 1 and step1("INT1_right") == 1 and step2("SM_left") == 1:
        return False

    # Criteria 13: The base has to be present on both sides of reactions if there is any.
    if step1("B_left") != step1("B_right") or step2("B_left") != step2("B_right"):
        return False

    # Criteria 14: If the base is used in the second step, it must also be used in the first step.
    # For example, if the first step forms the product and is reacted in the second step in the presence of the base, the base must also be used in the first step for the main reaction.
    if step2("B_left") == 1 and step1("B_right") == 0:
        return False

    # Criteria 15: If there is no intermediate formed in the first step,
    # the reagent and product should be in the same step because the product must be formed from the reagent and starting material.
    if step1("INT1_right") != 1 and (
        (step1("P_right") == 1 and step1("R_left") == 0)
        or (step2("P_right") == 1 and step2("R_left") == 0)
    ):
        return False

    # Criteria 16. If the catalyst is not utilized to catalyze the starting material (as not shown on the left side of the first step), it should not be employed in the second step.
    # This implies that the catalyst cannot catalyze all other species without catalyzing the starting material and reagent.
    if step1("C_left") == 0 and (step1("C_right") == 1 or step2("C_left") == 1):
        return False

    # Criteria 17. If the product is formed in the first step and the reagent is not a reactant in the first step, the reagent should not be in the second reaction either.
    # The reagent must at least react with the starting material.
    if step1("P_right") == 1 and (step1("R_left") == 0 and step2("R_left") == 1):
        return False

    return True

# Keep the Rule-8 survivors for which is_valid_reaction returns True (Criteria 9-17)
valid_combinations_rule_9_17_two_reactions = list(
    filter(is_valid_reaction, valid_combinations_rule8_two_reactions)
)

print("Valid combinations that satisfy Rule 9-17:")
print(valid_combinations_rule_9_17_two_reactions[:10])

print("Total number of valid combinations after applying Rule 9-17:", len(valid_combinations_rule_9_17_two_reactions))

Valid combinations that satisfy Rule 9-17:
[(0, 146), (0, 147), (0, 154), (0, 155), (1, 146), (1, 147), (1, 154), (1, 155), (4, 146), (4, 147)]
Total number of valid combinations after applying Rule 9-17: 456


In [20]:
# Build the final table: for every surviving pair, its step-1 row followed by its step-2 row.
# The pairs hold index LABELS (they were collected from .index and every rule check reads them
# with .loc), so the rows are selected by label with .loc here too rather than by position.
# One selection over the flattened label list replaces rebuilding the frame with concat/transpose
# on every loop iteration, and it keeps the original column dtypes instead of casting to object.
row_labels = list(itertools.chain.from_iterable(valid_combinations_rule_9_17_two_reactions))
df_two_reactions = df_combined.loc[row_labels]
# print(df_two_reactions)
df_two_reactions

Unnamed: 0,Number of Reactions,Step,SM_left,C_left,B_left,P_right,IMP1_right,INT1_right,C_right,B_right,R_left,P_left,INT1_left,IMP2_right,Reaction Format,Initial Compounds
0,2,1,1,0,0,0,0,1,0,0,0,0,0,0,SM -> INT1,SM
146,2,2,0,0,0,1,0,0,0,0,0,0,1,0,INT1 -> P,
0,2,1,1,0,0,0,0,1,0,0,0,0,0,0,SM -> INT1,SM
147,2,2,0,0,0,1,0,0,0,0,1,0,1,0,R + INT1 -> P,R
0,2,1,1,0,0,0,0,1,0,0,0,0,0,0,SM -> INT1,SM
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,2,2,0,1,1,0,0,0,1,1,1,0,1,1,C + B + R + INT1 -> IMP2 + C + B,"B,C,R"
125,2,1,1,1,1,1,1,1,1,1,1,0,0,0,SM + C + B + R -> P + IMP1 + C + B + INT1,"SM,B,C,R,IMP1"
332,2,2,0,1,1,0,0,0,1,1,0,1,1,1,P + C + B + INT1 -> IMP2 + C + B,"B,C"
125,2,1,1,1,1,1,1,1,1,1,1,0,0,0,SM + C + B + R -> P + IMP1 + C + B + INT1,"SM,B,C,R,IMP1"


In [39]:
# Write the paired mechanism table to a spreadsheet, keeping the row labels as the first column
excel_file_path = 'mechanism_two_reactions_paper.xlsx'
df_two_reactions.to_excel(excel_writer=excel_file_path, index=True)

In [72]:
# Store the surviving (step-1 label, step-2 label) pairs as a compressed joblib file
joblib.dump(value=valid_combinations_rule_9_17_two_reactions, filename="valid_combinations_two_reactions.joblib", compress=9)

['valid_combinations_two_reactions.joblib']