In [2]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [0, 0, 0, 1, 1, 0, 1, 0, 1, 0] * 5
}
df = pd.DataFrame(data)

# Initialize variables
rules = []
max_rules = 20  # Generate more rules

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Apply the combination of features
        subset_df = df[feature_combination + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Generate rules for the current feature combination
        rule = {
            'Features': ', '.join(feature_combination),
            'Num Anomalies Captured': num_anomalies
        }

        # Add the rule to the list
        rules.append(rule)

# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rules = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].min()} AND {feature} <= {df[feature].max()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rules.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rules)
    print(f"Rule Condition: {combined_rule}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000



In [3]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [0, 1, 0, 1, 1, 0, 1, 0, 1, 1] * 5
}
df = pd.DataFrame(data)

# Initialize variables
top_rules = []
top_rules_anomalies = {}
max_rules = 10

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

for num_features in range(1, len(numerical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Apply the combination of features
        subset_df = df[feature_combination + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Store the feature combination and its anomalies count
        top_rules.append(feature_combination)
        top_rules_anomalies[tuple(feature_combination)] = num_anomalies

# Sort the top rules by the number of anomalies detected in descending order
top_rules.sort(key=lambda x: top_rules_anomalies[tuple(x)], reverse=True)

# Generate and print the top 10 rules and their anomalies count
for i, rule in enumerate(top_rules[:max_rules]):
    num_anomalies = top_rules_anomalies[tuple(rule)]
    print(f"Rule {i+1}:")
    print(f"Features: {', '.join(rule)}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    rules = []
    for feature in rule:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].min()} AND {feature} <= {df[feature].max()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rules.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rules)
    print(f"Rule Condition: {combined_rule}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Gender == 'Male' OR Gender == 'Female' AND Age >= 22 AND Age <= 45 AND Income >= 45000 AND Income <= 90000

