In [2]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [0, 0, 0, 1, 1, 0, 1, 0, 1, 0] * 5
}
df = pd.DataFrame(data)

# Initialize variables
rules = []
max_rules = 20  # Generate more rules

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Apply the combination of features
        subset_df = df[feature_combination + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Generate rules for the current feature combination
        rule = {
            'Features': ', '.join(feature_combination),
            'Num Anomalies Captured': num_anomalies
        }

        # Add the rule to the list
        rules.append(rule)

# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rules = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].min()} AND {feature} <= {df[feature].max()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rules.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rules)
    print(f"Rule Condition: {combined_rule}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000



In [3]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [0, 1, 0, 1, 1, 0, 1, 0, 1, 1] * 5
}
df = pd.DataFrame(data)

# Initialize variables
top_rules = []
top_rules_anomalies = {}
max_rules = 10

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Apply the combination of features
        subset_df = df[feature_combination + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Store the feature combination and its anomalies count
        top_rules.append(feature_combination)
        top_rules_anomalies[tuple(feature_combination)] = num_anomalies

# Sort the top rules by the number of anomalies detected in descending order
top_rules.sort(key=lambda x: top_rules_anomalies[tuple(x)], reverse=True)

# Generate and print the top 10 rules and their anomalies count
for i, rule in enumerate(top_rules[:max_rules]):
    num_anomalies = top_rules_anomalies[tuple(rule)]
    print(f"Rule {i+1}:")
    print(f"Features: {', '.join(rule)}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    rules = []
    for feature in rule:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].min()} AND {feature} <= {df[feature].max()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rules.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rules)
    print(f"Rule Condition: {combined_rule}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000



In [4]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
rules = []
max_rules = 20  # Generate more rules

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Apply the combination of features
        subset_df = df[feature_combination + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Generate rules for the current feature combination
        rule = {
            'Features': ', '.join(feature_combination),
            'Num Anomalies Captured': num_anomalies
        }

        # Add the rule to the list
        rules.append(rule)

# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rules = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].min()} AND {feature} <= {df[feature].max()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rules.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rules)
    print(f"Rule Condition: {combined_rule}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000



In [9]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    cutoff = satisfying_subset[feature].max()  # Use max value for 'Income'
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    categories = df[feature].unique()
                    category_conditions = [f"{feature} == '{category}'" for category in categories]
                    rule_condition = " OR ".join(category_conditions)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age > 0 AND Income > 90000

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 90000

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 90000



In [10]:
df.shape

(50, 5)

In [11]:
df['Income'].value_counts()

50000    5
60000    5
45000    5
70000    5
55000    5
80000    5
90000    5
52000    5
65000    5
59000    5
Name: Income, dtype: int64

In [12]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    cutoff = satisfying_subset[feature].min()  # Use min value where anomalies are present
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    categories = df[feature].unique()
                    category_conditions = [f"{feature} == '{category}'" for category in categories]
                    rule_condition = " OR ".join(category_conditions)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age > 0 AND Income > 55000

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 55000

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 55000



In [13]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    categories = df[feature].unique()
                    category_conditions = [f"{feature} == '{category}'" for category in categories]
                    rule_condition = " OR ".join(category_conditions)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 66500.0



In [14]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    # For categorical features, use "OR" combination for the unique categories
                    categories = df[feature].unique()
                    category_conditions = [f"{feature} == '{category}'" for category in categories]
                    rule_condition = " OR ".join(category_conditions)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age > 0 AND Income > 66500.0



In [15]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    # For categorical features, use "AND" and "OR" combinations
                    categories = df[feature].unique()
                    category_conditions = [f"{feature} == '{category}'" for category in categories]
                    rule_condition = " OR ".join(category_conditions)
                    if len(category_conditions) > 1:
                        rule_condition = f"({rule_condition})"
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Education == 'High' OR Education == 'Low' OR Education == 'Medium') AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Gender == 'Male' OR Gender == 'Female') AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Education == 'High' OR Education == 'Low' OR Education == 'Medium') AND (Gender == 'Male' OR Gender == 'Female') AND Age > 0 AND Income > 66500.0



In [18]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    # For categorical features, use both "OR" and "AND" combinations
                    categories = df[feature].unique()
                    print("categories", categories)
                    category_conditions = [f"({feature} == '{category}')" for category in categories]
                    rule_condition = " OR ".join(category_conditions)  # Combine categories with "OR"
                    if len(category_conditions) > 1:
                        rule_condition = f"({' AND '.join(category_conditions)})"  # Combine with "AND"
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


categories ['High' 'Low' 'Medium']
categories ['Male' 'Female']
categories ['High' 'Low' 'Medium']
categories ['Male' 'Female']
Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: ((Education == 'High') AND (Education == 'Low') AND (Education == 'Medium')) AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: ((Gender == 'Male') AND (Gender == 'Female')) AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: ((Education == 'High') AND (Education == 'Low') AND (Education == 'Medium')) AND ((Gender == 'Male') AND (Gender == 'Female')) AND Age > 0 AND Income > 66500.0



In [19]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Helper function to generate all combinations of AND and OR conditions for categorical variables
def generate_categorical_combinations(categories):
    combinations = []
    for r in range(1, len(categories) + 1):
        for combo in itertools.combinations(categories, r):
            combinations.append(' OR '.join(combo))
    return combinations

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    # For categorical features, generate all possible combinations of AND and OR conditions
                    categories = df[feature].unique()
                    category_combinations = generate_categorical_combinations(categories)
                    rule_condition = ' AND '.join(category_combinations)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: High AND Low AND Medium AND High OR Low AND High OR Medium AND Low OR Medium AND High OR Low OR Medium AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Male AND Female AND Male OR Female AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: High AND Low AND Medium AND High OR Low AND High OR Medium AND Low OR Medium AND High OR Low OR Medium AND Male AND Female AND Male OR Female AND Age > 0 AND Income > 66500.0



In [22]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Helper function to generate all combinations of AND and OR conditions for categorical variables
def generate_categorical_combinations(categories):
    combinations = []
    for r in range(1, len(categories) + 1):
        for combo in itertools.combinations(categories, r):
            # Generate all possible OR conditions within the combination
            or_conditions = ' OR '.join(combo)
            
            # Generate all possible AND conditions by adding ' AND ' between OR conditions
            and_condition = ' AND '.join(f"({condition})" for condition in or_conditions.split(' OR '))
            
            combinations.append(and_condition)
    return combinations

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    # For categorical features, generate all possible combinations of AND and OR conditions
                    categories = df[feature].unique()
                    category_combinations = generate_categorical_combinations(categories)
                    print("category_combinations", category_combinations)
                    rule_condition = ' AND '.join(category_combinations)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


category_combinations ['(High)', '(Low)', '(Medium)', '(High) AND (Low)', '(High) AND (Medium)', '(Low) AND (Medium)', '(High) AND (Low) AND (Medium)']
category_combinations ['(Male)', '(Female)', '(Male) AND (Female)']
category_combinations ['(High)', '(Low)', '(Medium)', '(High) AND (Low)', '(High) AND (Medium)', '(Low) AND (Medium)', '(High) AND (Low) AND (Medium)']
category_combinations ['(Male)', '(Female)', '(Male) AND (Female)']
Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: (High) AND (Low) AND (Medium) AND (High) AND (Low) AND (High) AND (Medium) AND (Low) AND (Medium) AND (High) AND (Low) AND (Medium) AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Male) AND (Female) AND (Male) AND (Female) AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (High) AND (Low) AND (Medium) AND (High) AND (Low) AND (Hig

In [23]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Helper function to generate all combinations of AND and OR conditions for categorical variables
def generate_categorical_combinations(categories):
    combinations = []
    for r in range(1, len(categories) + 1):
        for combo in itertools.combinations(categories, r):
            # Generate all possible OR conditions within the combination
            or_conditions = ' OR '.join(combo)
            
            # Generate all possible AND conditions by adding ' AND ' between OR conditions
            and_condition = ' AND '.join(f"({condition})" for condition in or_conditions.split(' OR '))
            
            combinations.append(and_condition)
    return combinations

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    # For categorical features, generate all possible combinations of AND and OR conditions
                    categories = df[feature].unique()
                    category_combinations = generate_categorical_combinations(categories)
                    rule_condition = ' OR '.join(category_combinations)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: (High) OR (Low) OR (Medium) OR (High) AND (Low) OR (High) AND (Medium) OR (Low) AND (Medium) OR (High) AND (Low) AND (Medium) AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Male) OR (Female) OR (Male) AND (Female) AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (High) OR (Low) OR (Medium) OR (High) AND (Low) OR (High) AND (Medium) OR (Low) AND (Medium) OR (High) AND (Low) AND (Medium) AND (Male) OR (Female) OR (Male) AND (Female) AND Age > 0 AND Income > 66500.0



In [24]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Helper function to generate all combinations of AND and OR conditions for categorical variables
def generate_categorical_combinations(categories, variable_name):
    combinations = []
    for r in range(1, len(categories) + 1):
        for combo in itertools.combinations(categories, r):
            # Generate all possible OR conditions within the combination
            or_conditions = ' OR '.join(f"{variable_name} == '{category}'" for category in combo)
            
            # Generate all possible AND conditions by adding ' AND ' between OR conditions
            and_condition = ' AND '.join(f"({condition})" for condition in or_conditions.split(' OR '))
            
            combinations.append(and_condition)
    return combinations

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"{feature} > {cutoff}"
                else:
                    variable_name = feature
                    categories = df[feature].unique()
                    category_combinations = generate_categorical_combinations(categories, variable_name)
                    rule_condition = ' OR '.join(category_combinations)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule
            }
            all_rules.append(rule)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Education == 'High') OR (Education == 'Low') OR (Education == 'Medium') OR (Education == 'High') AND (Education == 'Low') OR (Education == 'High') AND (Education == 'Medium') OR (Education == 'Low') AND (Education == 'Medium') OR (Education == 'High') AND (Education == 'Low') AND (Education == 'Medium') AND Age > 0 AND Income > 66500.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Gender == 'Male') OR (Gender == 'Female') OR (Gender == 'Male') AND (Gender == 'Female') AND Age > 0 AND Income > 66500.0

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: (Education == 'High') OR (Education == 'Low') OR (Education == 'Medium') OR (Education == 'High') AND (Education == 'Low') OR (Education == 'High') AND (Education == 'Medium') OR (Education == 'Low') AND (Education == 'Medium') OR (Education == 'High') AND (Education == 'Low') A