In [None]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [0, 0, 0, 1, 1, 0, 1, 0, 1, 0] * 5
}
df = pd.DataFrame(data)

# Initialize variables
rules = []
max_rules = 20  # Generate more rules

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Include individual categorical values
for cat_feature in categorical_features:
    feature_combination = [cat_feature] + numerical_features

    # Apply the combination of features
    subset_df = df[feature_combination + ['Anomaly']]

    # Calculate the number of anomalies in this subset
    num_anomalies = subset_df['Anomaly'].sum()

    # Generate a rule for the current feature combination
    rule = {
        'Features': ', '.join(feature_combination),
        'Num Anomalies Captured': num_anomalies
    }

    # Add the rule to the list
    rules.append(rule)

# Generate combinations of categorical variables
for num_features in range(1, len(categorical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        feature_combination = list(cat_features) + numerical_features

        # Apply the combination of features
        subset_df = df[feature_combination + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Generate a rule for the current feature combination
        rule = {
            'Features': ', '.join(feature_combination),
            'Num Anomalies Captured': num_anomalies
        }

        # Add the rule to the list
        rules.append(rule)


# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rule_conditions = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].mean()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rule_conditions.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rule_conditions)
    print(f"Rule Condition: {combined_rule}\n")


In [10]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [0, 0, 0, 1, 1, 0, 1, 0, 1, 0] * 5
}
df = pd.DataFrame(data)

# Initialize variables
rules = []
max_rules = 20  # Generate more rules

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Include individual categorical values and combinations as single distinct values
for num_features in range(1, len(categorical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        # Create a new categorical feature by combining the selected categorical values
        combined_categorical = '_'.join(cat_features)
        df[combined_categorical] = df[list(cat_features)].apply('_'.join, axis=1)

        # Apply the combination of features
        subset_df = df[[combined_categorical] + numerical_features + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Generate a rule for the current feature combination
        rule = {
            'Features': ', '.join([combined_categorical] + numerical_features),
            'Num Anomalies Captured': num_anomalies
        }

        # Add the rule to the list
        rules.append(rule)
        
        
# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rule_conditions = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].mean()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rule_conditions.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rule_conditions)
    print(f"Rule Condition: {combined_rule}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 31.4 AND Income >= 62600.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 31.4 AND Income >= 62600.0

Rule 3:
Features: Education_Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education_Gender == 'High_Male' OR Education_Gender == 'Low_Female' OR Education_Gender == 'Medium_Male' OR Education_Gender == 'Medium_Female' OR Education_Gender == 'High_Female' OR Education_Gender == 'Low_Male' AND Age >= 31.4 AND Income >= 62600.0



In [11]:


# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Calculate dynamic thresholds (mean) for numerical features
numerical_thresholds = {feature: df[feature].mean() for feature in numerical_features}

# Include individual categorical values and combinations as single distinct values
for num_features in range(1, len(categorical_features) + 1):
    for cat_features in itertools.combinations(categorical_features, num_features):
        # Create a new categorical feature by combining the selected categorical values
        combined_categorical = '_'.join(cat_features)
        df[combined_categorical] = df[list(cat_features)].apply('_'.join, axis=1)

        # Apply the combination of features
        subset_df = df[[combined_categorical] + numerical_features + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Generate a rule for the current feature combination
        rule = {
            'Features': ', '.join([combined_categorical] + numerical_features),
            'Num Anomalies Captured': num_anomalies
        }

        # Check if numerical features are greater than or equal to their respective dynamic thresholds
        if (subset_df[numerical_features] >= pd.Series(numerical_thresholds)).all().all():
            rules.append(rule)
            
            
# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rule_conditions = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].mean()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rule_conditions.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rule_conditions)
    print(f"Rule Condition: {combined_rule}\n")

Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 31.4 AND Income >= 62600.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 31.4 AND Income >= 62600.0

Rule 3:
Features: Education_Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education_Gender == 'High_Male' OR Education_Gender == 'Low_Female' OR Education_Gender == 'Medium_Male' OR Education_Gender == 'Medium_Female' OR Education_Gender == 'High_Female' OR Education_Gender == 'Low_Male' AND Age >= 31.4 AND Income >= 62600.0



In [12]:
categorical_features = ['Education', 'Gender']
numerical_features = ['Age', 'Income']

# Iterate through categorical features
for cat_feature in categorical_features:
    # Create a new dataframe with the current categorical feature and numerical features
    subset_df = df[[cat_feature] + numerical_features + ['Anomaly']]

    # Calculate the number of anomalies in this subset
    num_anomalies = subset_df['Anomaly'].sum()

    # Generate a rule for the current feature combination
    rule = {
        'Features': ', '.join([cat_feature] + numerical_features),
        'Num Anomalies Captured': num_anomalies
    }

    # Add the rule to the list
    rules.append(rule)
    
    
# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rule_conditions = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].mean()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rule_conditions.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rule_conditions)
    print(f"Rule Condition: {combined_rule}\n")

Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 31.4 AND Income >= 62600.0

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 31.4 AND Income >= 62600.0

Rule 3:
Features: Education_Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education_Gender == 'High_Male' OR Education_Gender == 'Low_Female' OR Education_Gender == 'Medium_Male' OR Education_Gender == 'Medium_Female' OR Education_Gender == 'High_Female' OR Education_Gender == 'Low_Male' AND Age >= 31.4 AND Income >= 62600.0

Rule 4:
Features: Education, Age, Income
Num Anomalies Captured: 20
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 31.4 AND Income >= 62600.0

Rule 5:
Features: Gender, Age, Income
Num Anomalies Captured: 20
Rule Condition: Gender == 'Male' OR Gender == 'Fema

In [15]:
import pandas as pd

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [0, 0, 0, 1, 1, 0, 1, 0, 1, 0] * 5
}
df = pd.DataFrame(data)

# Initialize variables
rules = []
max_rules = 20  # Generate more rules

# Generate combinations of features, including both categorical and numerical variables
categorical_features = ['Education', 'Gender']
numerical_features = ['Age', 'Income']

# Iterate through categorical features and generate rules for each category separately
for cat_feature in categorical_features:
    # Iterate through unique categories in the current categorical feature
    for category in df[cat_feature].unique():
        # Create a new dataframe with the current category, numerical features, and other categorical features
        subset_df = df[(df[cat_feature] == category)][[cat_feature] + numerical_features + ['Anomaly']]

        # Calculate the number of anomalies in this subset
        num_anomalies = subset_df['Anomaly'].sum()

        # Generate a rule for the current category
        rule = {
            'Features': ', '.join([cat_feature] + numerical_features),
            'Num Anomalies Captured': num_anomalies
        }

        # Add the rule to the list
        rules.append(rule)

# Sort the rules by the number of anomalies detected in descending order
rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts in descending order
for i, rule in enumerate(rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")

    # Generate rules for the current feature combination
    features = rule['Features'].split(', ')
    rule_conditions = []
    for feature in features:
        if feature in numerical_features:
            rule_condition = f"{feature} >= {df[feature].mean()}"
        else:
            categories = df[feature].unique()
            category_conditions = [f"{feature} == '{category}'" for category in categories]
            rule_condition = " OR ".join(category_conditions)
        rule_conditions.append(rule_condition)

    # Combine the rules into a single rule for the current feature combination
    combined_rule = " AND ".join(rule_conditions)
    print(f"Rule Condition: {combined_rule}\n")

Rule 1:
Features: Gender, Age, Income
Num Anomalies Captured: 15
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 31.4 AND Income >= 62600.0

Rule 2:
Features: Education, Age, Income
Num Anomalies Captured: 10
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 31.4 AND Income >= 62600.0

Rule 3:
Features: Education, Age, Income
Num Anomalies Captured: 5
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 31.4 AND Income >= 62600.0

Rule 4:
Features: Education, Age, Income
Num Anomalies Captured: 5
Rule Condition: Education == 'High' OR Education == 'Low' OR Education == 'Medium' AND Age >= 31.4 AND Income >= 62600.0

Rule 5:
Features: Gender, Age, Income
Num Anomalies Captured: 5
Rule Condition: Gender == 'Male' OR Gender == 'Female' AND Age >= 31.4 AND Income >= 62600.0



In [17]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Get the unique values for each column
unique_values = {column: anomaly_df[column].unique() for column in anomaly_df.columns}

# Initialize an empty list to store the combinations
rule_combinations = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Generate all possible combinations of values for the selected columns
        value_combinations = itertools.product(*(unique_values[column] for column in combination))
        
        # Create a rule for each value combination
        for values in value_combinations:
            rule = " & ".join([f"{column} == '{value}'" for column, value in zip(combination, values)])
            rule_combinations.append(rule)

# Print the list of rule combinations
for rule in rule_combinations:
    print(rule)


Age == '30'
Age == '35'
Age == '28'
Age == '45'
Age == '33'
Age == '29'
Income == '60000'
Income == '70000'
Income == '55000'
Income == '90000'
Income == '65000'
Income == '59000'
Education == 'Low'
Education == 'Medium'
Education == 'High'
Gender == 'Female'
Gender == 'Male'
Age == '30' & Income == '60000'
Age == '30' & Income == '70000'
Age == '30' & Income == '55000'
Age == '30' & Income == '90000'
Age == '30' & Income == '65000'
Age == '30' & Income == '59000'
Age == '35' & Income == '60000'
Age == '35' & Income == '70000'
Age == '35' & Income == '55000'
Age == '35' & Income == '90000'
Age == '35' & Income == '65000'
Age == '35' & Income == '59000'
Age == '28' & Income == '60000'
Age == '28' & Income == '70000'
Age == '28' & Income == '55000'
Age == '28' & Income == '90000'
Age == '28' & Income == '65000'
Age == '28' & Income == '59000'
Age == '45' & Income == '60000'
Age == '45' & Income == '70000'
Age == '45' & Income == '55000'
Age == '45' & Income == '90000'
Age == '45' & Incom

In [19]:
import pandas as pd

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize a dictionary to store the optimal cutoff values
optimal_cutoffs = {}

# Iterate through each numeric variable
numeric_variables = ['Age', 'Income']
for var in numeric_variables:
    unique_values = sorted(df[var].unique())  # Get unique values and sort them
    
    # Initialize variables to keep track of the optimal cutoff and the highest anomaly rate
    optimal_cutoff = None
    highest_anomaly_rate = 0.0
    
    # Iterate through unique values and find the cutoff with the highest anomaly rate
    for value in unique_values:
        subset = df[df[var] <= value]
        anomaly_rate = subset['Anomaly'].mean()
        
        if anomaly_rate > highest_anomaly_rate:
            highest_anomaly_rate = anomaly_rate
            optimal_cutoff = value
    
    optimal_cutoffs[var] = optimal_cutoff

# Print the optimal cutoff values
for var, cutoff in optimal_cutoffs.items():
    print(f"Optimal {var} Cutoff: {cutoff}")

    
    
# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Get the unique values for each column
unique_values = {column: anomaly_df[column].unique() for column in anomaly_df.columns}

# Initialize an empty list to store the combinations
rule_combinations = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Generate all possible combinations of values for the selected columns
        value_combinations = itertools.product(*(unique_values[column] for column in combination))
        
        # Create a rule for each value combination
        for values in value_combinations:
            rule = " & ".join([f"{column} == '{value}'" for column, value in zip(combination, values)])
            rule_combinations.append(rule)

# Print the list of rule combinations
for rule in rule_combinations:
    print(rule)

Optimal Age Cutoff: 35
Optimal Income Cutoff: 70000
Age == '30'
Age == '35'
Age == '28'
Age == '45'
Age == '33'
Age == '29'
Income == '60000'
Income == '70000'
Income == '55000'
Income == '90000'
Income == '65000'
Income == '59000'
Education == 'Low'
Education == 'Medium'
Education == 'High'
Gender == 'Female'
Gender == 'Male'
Age == '30' & Income == '60000'
Age == '30' & Income == '70000'
Age == '30' & Income == '55000'
Age == '30' & Income == '90000'
Age == '30' & Income == '65000'
Age == '30' & Income == '59000'
Age == '35' & Income == '60000'
Age == '35' & Income == '70000'
Age == '35' & Income == '55000'
Age == '35' & Income == '90000'
Age == '35' & Income == '65000'
Age == '35' & Income == '59000'
Age == '28' & Income == '60000'
Age == '28' & Income == '70000'
Age == '28' & Income == '55000'
Age == '28' & Income == '90000'
Age == '28' & Income == '65000'
Age == '28' & Income == '59000'
Age == '45' & Income == '60000'
Age == '45' & Income == '70000'
Age == '45' & Income == '55000'

In [20]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize a dictionary to store the optimal cutoff values
optimal_cutoffs = {}

# Iterate through each numeric variable
numeric_variables = ['Age', 'Income']
for var in numeric_variables:
    unique_values = sorted(df[var].unique())  # Get unique values and sort them
    
    # Initialize variables to keep track of the optimal cutoff and the highest anomaly rate
    optimal_cutoff = None
    highest_anomaly_rate = 0.0
    
    # Iterate through unique values and find the cutoff with the highest anomaly rate
    for value in unique_values:
        subset = df[df[var] <= value]
        anomaly_rate = subset['Anomaly'].mean()
        
        if anomaly_rate > highest_anomaly_rate:
            highest_anomaly_rate = anomaly_rate
            optimal_cutoff = value
    
    optimal_cutoffs[var] = optimal_cutoff

# Print the optimal cutoff values
for var, cutoff in optimal_cutoffs.items():
    print(f"Optimal {var} Cutoff: {cutoff}")

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Get the unique values for each column
unique_values = {column: anomaly_df[column].unique() for column in anomaly_df.columns}

# Initialize an empty list to store the combinations
rule_combinations = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Generate all possible combinations of values for the selected columns
        value_combinations = itertools.product(*(unique_values[column] for column in combination))
        
        # Create a rule for each value combination
        for values in value_combinations:
            rule = " & ".join([f"{column} == '{value}'" for column, value in zip(combination, values)])
            rule_combinations.append(rule)

# Print the list of rule combinations
for rule in rule_combinations:
    print(rule)


Optimal Age Cutoff: 35
Optimal Income Cutoff: 70000
Age == '30'
Age == '35'
Age == '28'
Age == '45'
Age == '33'
Age == '29'
Income == '60000'
Income == '70000'
Income == '55000'
Income == '90000'
Income == '65000'
Income == '59000'
Education == 'Low'
Education == 'Medium'
Education == 'High'
Gender == 'Female'
Gender == 'Male'
Age == '30' & Income == '60000'
Age == '30' & Income == '70000'
Age == '30' & Income == '55000'
Age == '30' & Income == '90000'
Age == '30' & Income == '65000'
Age == '30' & Income == '59000'
Age == '35' & Income == '60000'
Age == '35' & Income == '70000'
Age == '35' & Income == '55000'
Age == '35' & Income == '90000'
Age == '35' & Income == '65000'
Age == '35' & Income == '59000'
Age == '28' & Income == '60000'
Age == '28' & Income == '70000'
Age == '28' & Income == '55000'
Age == '28' & Income == '90000'
Age == '28' & Income == '65000'
Age == '28' & Income == '59000'
Age == '45' & Income == '60000'
Age == '45' & Income == '70000'
Age == '45' & Income == '55000'

In [21]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Get the unique values for each column
unique_values = {column: anomaly_df[column].unique() for column in anomaly_df.columns}

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Generate all possible combinations of values for the selected columns
        value_combinations = itertools.product(*(unique_values[column] for column in combination))
        
        # Create a rule for each value combination
        rule = " & ".join([f"{column} == '{value}'" for column, value in zip(combination, values)])
        
        # Count the number of anomalies for this rule
        subset = anomaly_df
        for column, value in zip(combination, values):
            subset = subset[subset[column] == value]
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Find the rule with the maximum number of anomalies
max_anomaly_rule = rule_combinations[anomaly_counts.index(max(anomaly_counts))]
max_anomaly_count = max(anomaly_counts)

# Print the rule with the maximum number of anomalies
print(f"Rule with Maximum Anomalies: {max_anomaly_rule}")
print(f"Number of Anomalies: {max_anomaly_count}")


Rule with Maximum Anomalies: Age == '29'
Number of Anomalies: 5


In [27]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Get the unique values for each column
unique_values = {column: anomaly_df[column].unique() for column in anomaly_df.columns}
print("unique_values", unique_values)
# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Generate all possible combinations of values for the selected columns
        value_combinations = itertools.product(*(unique_values[column] for column in combination))
        
        # Create a rule for each value combination
        rule = " & ".join([f"{column} == '{value}'" for column, value in zip(combination, values)])
        
        # Count the number of anomalies for this rule
        subset = anomaly_df
        for column, value in zip(combination, values):
            subset = subset[subset[column] == value]
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)
print("sorted_rule_df", sorted_rule_df)
# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(20)
print(top_20_rules)


unique_values {'Age': array([30, 35, 28, 45, 33, 29], dtype=int64), 'Income': array([60000, 70000, 55000, 90000, 65000, 59000], dtype=int64), 'Education': array(['Low', 'Medium', 'High'], dtype=object), 'Gender': array(['Female', 'Male'], dtype=object), 'Anomaly': array([ True])}
sorted_rule_df                                                  Rule  Anomaly_Count
0                                         Age == '29'              5
4                     Age == '29' & Income == '59000'              5
1                                      Income == '29'              0
2                                   Education == '29'              0
3                                      Gender == '29'              0
5                  Age == '29' & Education == '59000'              0
6                     Age == '29' & Gender == '59000'              0
7               Income == '29' & Education == '59000'              0
8                  Income == '29' & Gender == '59000'              0
9             

In [23]:
top_20_rules.head()

Unnamed: 0,Rule,Anomaly_Count
0,Age == '29',5
4,Age == '29' & Income == '59000',5
1,Income == '29',0
2,Education == '29',0
3,Gender == '29',0


In [28]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Get the mean values for each numeric column
mean_values = {column: anomaly_df[column].mean() for column in anomaly_df.select_dtypes(include='number').columns}

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules using mean values as cutoffs
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.select_dtypes(include='number').columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination using mean values as cutoffs
        rule = " & ".join([f"{column} <= {mean_values[column]}" for column in combination])
        
        # Count the number of anomalies for this rule
        subset = anomaly_df
        for column in combination:
            subset = subset[subset[column] <= mean_values[column]]
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(20)
print(top_20_rules)


                                            Rule  Anomaly_Count
0                      Age <= 33.333333333333336             20
1                              Income <= 66500.0             20
2  Age <= 33.333333333333336 & Income <= 66500.0             20


In [29]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules using mean values for numeric columns
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use mean values as cutoffs
                mean_value = anomaly_df[column].mean()
                rule += f"{column} <= {mean_value} & "
                subset = subset[subset[column] <= mean_value]
            else:
                # For categorical columns, include all possible categories
                rule += f"{column} == '{anomaly_df[column].unique()[0]}' & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(20)
print(top_20_rules)


                                                 Rule  Anomaly_Count
2                                  Education == 'Low'             30
3                                  Gender == 'Female'             30
9             Education == 'Low' & Gender == 'Female'             30
0                           Age <= 33.333333333333336             20
1                                   Income <= 66500.0             20
4       Age <= 33.333333333333336 & Income <= 66500.0             20
5      Age <= 33.333333333333336 & Education == 'Low'             20
6      Age <= 33.333333333333336 & Gender == 'Female'             20
7              Income <= 66500.0 & Education == 'Low'             20
8              Income <= 66500.0 & Gender == 'Female'             20
10  Age <= 33.333333333333336 & Income <= 66500.0 ...             20
11  Age <= 33.333333333333336 & Income <= 66500.0 ...             20
12  Age <= 33.333333333333336 & Education == 'Low'...             20
13  Income <= 66500.0 & Education 

In [30]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include the category exactly as is
                unique_value = anomaly_df[column].unique()[0]
                rule += f"{column} == '{unique_value}' & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(20)
print(top_20_rules)


                                                 Rule  Anomaly_Count
2                                  Education == 'Low'             30
3                                  Gender == 'Female'             30
9             Education == 'Low' & Gender == 'Female'             30
0                           Age >= 33.333333333333336             10
1                                   Income >= 66500.0             10
4       Age >= 33.333333333333336 & Income >= 66500.0             10
5      Age >= 33.333333333333336 & Education == 'Low'             10
6      Age >= 33.333333333333336 & Gender == 'Female'             10
7              Income >= 66500.0 & Education == 'Low'             10
8              Income >= 66500.0 & Gender == 'Female'             10
10  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
11  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
12  Age >= 33.333333333333336 & Education == 'Low'...             10
13  Income >= 66500.0 & Education 

In [32]:
top_20_rules.to_csv('rule_v1.csv')

In [33]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include all categories using "OR" conditions
                categories = anomaly_df[column].unique()
                category_conditions = " | ".join([f"{column} == '{cat}'" for cat in categories])
                rule += f"({category_conditions}) & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(20)
print(top_20_rules)


                                                 Rule  Anomaly_Count
2   (Education == 'Low' | Education == 'Medium' | ...             30
3             (Gender == 'Female' | Gender == 'Male')             30
9   (Education == 'Low' | Education == 'Medium' | ...             30
0                           Age >= 33.333333333333336             10
1                                   Income >= 66500.0             10
4       Age >= 33.333333333333336 & Income >= 66500.0             10
5   Age >= 33.333333333333336 & (Education == 'Low...             10
6   Age >= 33.333333333333336 & (Gender == 'Female...             10
7   Income >= 66500.0 & (Education == 'Low' | Educ...             10
8   Income >= 66500.0 & (Gender == 'Female' | Gend...             10
10  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
11  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
12  Age >= 33.333333333333336 & (Education == 'Low...             10
13  Income >= 66500.0 & (Education

In [34]:
top_20_rules.to_csv('rule_v2.csv')

In [37]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include all categories using "OR" conditions
                categories = anomaly_df[column].unique()
                category_conditions = " | ".join([f"{column} == '{cat}'" for cat in categories])
                rule += f"({category_conditions}) & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(50)

# Print the top 20 rules from the previous code
print("Top 20 Rules from Previous Code:")
print(top_20_rules)

# Print the top 20 rules with "OR" conditions for categorical variables
print("\nTop 20 Rules with 'OR' Conditions for Categorical Variables:")
for rule in rule_combinations:
    print(rule)


Top 20 Rules from Previous Code:
                                                 Rule  Anomaly_Count
2   (Education == 'Low' | Education == 'Medium' | ...             30
3             (Gender == 'Female' | Gender == 'Male')             30
9   (Education == 'Low' | Education == 'Medium' | ...             30
0                           Age >= 33.333333333333336             10
1                                   Income >= 66500.0             10
4       Age >= 33.333333333333336 & Income >= 66500.0             10
5   Age >= 33.333333333333336 & (Education == 'Low...             10
6   Age >= 33.333333333333336 & (Gender == 'Female...             10
7   Income >= 66500.0 & (Education == 'Low' | Educ...             10
8   Income >= 66500.0 & (Gender == 'Female' | Gend...             10
10  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
11  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
12  Age >= 33.333333333333336 & (Education == 'Low...             10
1

In [38]:
top_20_rules.to_csv('rule_v33.csv')

In [39]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include the category exactly as is
                unique_value = anomaly_df[column].unique()[0]
                rule += f"{column} == '{unique_value}' & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(20)
print("Top 20 Rules (Original):")
print(top_20_rules)

# Additional: Generate rules with "OR" conditions for categorical variables
additional_rules = []
for column in anomaly_df.select_dtypes(include='object').columns:
    unique_categories = anomaly_df[column].unique()
    for category in unique_categories:
        rule = f"{column} == '{category}'"
        additional_rules.append(rule)

# Print the additional "OR" condition rules for categorical variables
print("\nAdditional 'OR' Condition Rules for Categorical Variables:")
for rule in additional_rules:
    print(rule)


Top 20 Rules (Original):
                                                 Rule  Anomaly_Count
2                                  Education == 'Low'             30
3                                  Gender == 'Female'             30
9             Education == 'Low' & Gender == 'Female'             30
0                           Age >= 33.333333333333336             10
1                                   Income >= 66500.0             10
4       Age >= 33.333333333333336 & Income >= 66500.0             10
5      Age >= 33.333333333333336 & Education == 'Low'             10
6      Age >= 33.333333333333336 & Gender == 'Female'             10
7              Income >= 66500.0 & Education == 'Low'             10
8              Income >= 66500.0 & Gender == 'Female'             10
10  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
11  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
12  Age >= 33.333333333333336 & Education == 'Low'...             10
13  Incom

In [40]:
top_20_rules.to_csv('rule_v34.csv')

In [42]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []
additional_anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include the category exactly as is
                unique_value = anomaly_df[column].unique()[0]
                rule += f"{column} == '{unique_value}' & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Additional: Generate rules with "OR" conditions for categorical variables
additional_rules = []
for column in anomaly_df.select_dtypes(include='object').columns:
    unique_categories = anomaly_df[column].unique()
    for category in unique_categories:
        rule = f"{column} == '{category}'"
        additional_rules.append(rule)
        # Count the number of anomalies for this additional rule
        additional_subset = anomaly_df[anomaly_df[column] == category]
        additional_anomaly_counts.append(len(additional_subset))

# Combine additional "OR" condition rules with the current rules
rule_combinations += additional_rules
anomaly_counts += additional_anomaly_counts

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies, including additional rules
top_20_rules = sorted_rule_df.head(20)
print("Top 20 Rules (Including Additional 'OR' Condition Rules):")
print(top_20_rules)


Top 20 Rules (Including Additional 'OR' Condition Rules):
                                                 Rule  Anomaly_Count
2                                  Education == 'Low'             30
3                                  Gender == 'Female'             30
9             Education == 'Low' & Gender == 'Female'             30
19                                   Gender == 'Male'             15
18                                 Gender == 'Female'             15
16                              Education == 'Medium'             15
11  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
15                                 Education == 'Low'             10
14  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
13  Income >= 66500.0 & Education == 'Low' & Gende...             10
12  Age >= 33.333333333333336 & Education == 'Low'...             10
0                           Age >= 33.333333333333336             10
1                                   Income >=

In [43]:
top_20_rules.to_csv('rule_v36.csv')

In [44]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include the category exactly as is
                unique_value = anomaly_df[column].unique()[0]
                rule += f"{column} == '{unique_value}' & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Additional: Generate rules with "OR" conditions for categorical variables
additional_rules = []
for column in anomaly_df.select_dtypes(include='object').columns:
    unique_categories = anomaly_df[column].unique()
    for category in unique_categories:
        rule = f"{column} == '{category}'"
        additional_rules.append(rule)
        # Count the number of anomalies for this additional rule
        additional_subset = anomaly_df[anomaly_df[column] == category]
        anomaly_count = len(additional_subset)
        
        # Combine the "OR" condition rule with the current rule using "OR"
        combined_rule = f"({rule}) | {rule}"
        rule_combinations.append(combined_rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies, including additional rules
top_20_rules = sorted_rule_df.head(20)
print("Top 20 Rules (Including Additional 'OR' Condition Rules):")
print(top_20_rules)


Top 20 Rules (Including Additional 'OR' Condition Rules):
                                                 Rule  Anomaly_Count
2                                  Education == 'Low'             30
3                                  Gender == 'Female'             30
9             Education == 'Low' & Gender == 'Female'             30
19              (Gender == 'Male') | Gender == 'Male'             15
18          (Gender == 'Female') | Gender == 'Female'             15
16    (Education == 'Medium') | Education == 'Medium'             15
11  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
15          (Education == 'Low') | Education == 'Low'             10
14  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
13  Income >= 66500.0 & Education == 'Low' & Gende...             10
12  Age >= 33.333333333333336 & Education == 'Low'...             10
0                           Age >= 33.333333333333336             10
1                                   Income >=

In [45]:
top_20_rules.to_csv('rule_v37.csv')

In [47]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty set to store unique rule combinations
unique_rule_combinations = set()

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include the category exactly as is
                unique_value = anomaly_df[column].unique()[0]
                rule += f"{column} == '{unique_value}' & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        # Add the rule to the set to ensure uniqueness
        unique_rule_combinations.add(rule)

# Additional: Generate rules with "OR" conditions for categorical variables
additional_rules = []
for column in anomaly_df.select_dtypes(include='object').columns:
    unique_categories = anomaly_df[column].unique()
    for category in unique_categories:
        rule = f"{column} == '{category}'"
        additional_rules.append(rule)
        # Count the number of anomalies for this additional rule
        additional_subset = anomaly_df[anomaly_df[column] == category]
        anomaly_count = len(additional_subset)
        
        # Combine the "OR" condition rule with other rules using "OR"
        combined_rules = set()
        for unique_rule in unique_rule_combinations:
            combined_rule = f"({unique_rule}) | {rule}"
            combined_rules.add(combined_rule)
        
        # Update the set of unique rule combinations
        unique_rule_combinations.update(combined_rules)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': list(unique_rule_combinations)})

# Calculate the anomaly counts for each rule and store them in a list
anomaly_counts = []
for rule in rule_df['Rule']:
    subset = anomaly_df.query(rule)
    anomaly_counts.append(len(subset))

# Add the anomaly counts to the DataFrame
rule_df['Anomaly_Count'] = anomaly_counts

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies, including additional rules
top_20_rules = sorted_rule_df.head(20)
print("Top 20 Rules (Including Additional 'OR' Condition Rules):")
print(top_20_rules)


Top 20 Rules (Including Additional 'OR' Condition Rules):
                                                  Rule  Anomaly_Count
240  ((((Income >= 66500.0 & Education == 'Low' & G...             30
252  (((Income >= 66500.0 & Education == 'Low') | E...             30
267  ((Education == 'Low' & Gender == 'Female') | E...             30
266  (((((Age >= 33.333333333333336 & Income >= 665...             30
263  ((((Age >= 33.333333333333336 & Income >= 6650...             30
261  (((Age >= 33.333333333333336) | Education == '...             30
257  (((Education == 'Low') | Education == 'High') ...             30
253  (((Gender == 'Female') | Education == 'Medium'...             30
250  (((Age >= 33.333333333333336 & Gender == 'Fema...             30
230  ((((Age >= 33.333333333333336 & Income >= 6650...             30
247  (((Age >= 33.333333333333336 & Gender == 'Fema...             30
242  (((Age >= 33.333333333333336 & Education == 'L...             30
237  ((((Age >= 33.3333333333333

In [48]:
top_20_rules.to_csv('rule_v38.csv')