In [4]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Discretize numerical variables (Age, Income) into bins
from sklearn.preprocessing import KBinsDiscretizer
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
anomaly_df[['Age', 'Income']] = discretizer.fit_transform(anomaly_df[['Age', 'Income']])

# Manually convert discretized values to binary using a threshold
threshold = 2  # Adjust this threshold as needed
anomaly_df['Age'] = anomaly_df['Age'] <= threshold
anomaly_df['Income'] = anomaly_df['Income'] <= threshold

# Encode categorical variables as binary
anomaly_df_encoded = pd.get_dummies(anomaly_df.drop(columns=['Anomaly']), drop_first=True)

# Perform Apriori algorithm for frequent itemset generation
frequent_itemsets = apriori(anomaly_df_encoded, min_support=0.3, use_colnames=True)

# Generate association rules from frequent itemsets
association_rules_df = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Filter the association rules based on desired metrics (e.g., lift, confidence)
filtered_rules = association_rules_df[(association_rules_df['lift'] >= 1.0) & (association_rules_df['confidence'] >= 0.7)]

# Print the filtered association rules
print("Filtered Association Rules:")
print(filtered_rules)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomaly_df[['Age', 'Income']] = discretizer.fit_transform(anomaly_df[['Age', 'Income']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomaly_df['Age'] = anomaly_df['Age'] <= threshold
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomaly_df['Income'] = anomaly_df['Income'] <= threshold


Filtered Association Rules:
                   antecedents    consequents  antecedent support  \
0                     (Income)          (Age)            0.833333   
1                        (Age)       (Income)            0.833333   
2              (Education_Low)          (Age)            0.333333   
4              (Education_Low)       (Income)            0.333333   
6      (Education_Low, Income)          (Age)            0.333333   
8         (Education_Low, Age)       (Income)            0.333333   
10             (Education_Low)  (Age, Income)            0.333333   
12  (Education_Medium, Income)          (Age)            0.333333   
13     (Education_Medium, Age)       (Income)            0.333333   
16       (Gender_Male, Income)          (Age)            0.333333   
17          (Gender_Male, Age)       (Income)            0.333333   

    consequent support   support  confidence  lift  leverage  conviction  \
0             0.833333  0.833333         1.0   1.2  0.138889       

In [5]:
# Convert association rules to a human-readable format
def convert_rules_to_readable(rules_df):
    readable_rules = []
    for _, row in rules_df.iterrows():
        antecedents = row['antecedents']
        consequents = row['consequents']
        support = row['support']
        confidence = row['confidence']
        lift = row['lift']
        
        antecedent_str = ', '.join([str(antecedent) for antecedent in antecedents])
        consequent_str = ', '.join([str(consequent) for consequent in consequents])
        
        rule_str = f"If {antecedent_str}, then {consequent_str}"
        metrics_str = f"Support: {support:.4f}, Confidence: {confidence:.4f}, Lift: {lift:.4f}"
        
        readable_rule = f"{rule_str} ({metrics_str})"
        readable_rules.append(readable_rule)
    
    return readable_rules

# Convert the filtered association rules to human-readable format
human_readable_rules = convert_rules_to_readable(filtered_rules)

# Print the human-readable rules
print("Human-Readable Association Rules:")
for rule in human_readable_rules:
    print(rule)


Human-Readable Association Rules:
If Income, then Age (Support: 0.8333, Confidence: 1.0000, Lift: 1.2000)
If Age, then Income (Support: 0.8333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, Income, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, Age, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, then Age, Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Medium, Income, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Medium, Age, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Gender_Male, Income, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Gender_Male, Age, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)


In [14]:
import pandas as pd
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import association_rules

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Discretize numerical variables (Age, Income) into bins
from sklearn.preprocessing import KBinsDiscretizer
discretizer = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
anomaly_df[['Age', 'Income']] = discretizer.fit_transform(anomaly_df[['Age', 'Income']])

# Manually convert discretized values to binary using a threshold
threshold = 2  # Adjust this threshold as needed
anomaly_df['Age'] = anomaly_df['Age'] <= threshold
anomaly_df['Income'] = anomaly_df['Income'] <= threshold

# Encode categorical variables as binary
anomaly_df_encoded = pd.get_dummies(anomaly_df.drop(columns=['Anomaly']), drop_first=True)

# Perform Apriori algorithm for frequent itemset generation
frequent_itemsets = apriori(anomaly_df_encoded, min_support=0.3, use_colnames=True)

# Generate association rules from frequent itemsets
association_rules_df = association_rules(frequent_itemsets, metric="lift", min_threshold=1.0)

# Filter the association rules based on desired metrics (e.g., lift, confidence)
filtered_rules = association_rules_df[(association_rules_df['lift'] >= 1.0) & (association_rules_df['confidence'] >= 0.7)]

# Print the filtered association rules
print("Filtered Association Rules:")
print(filtered_rules)


Filtered Association Rules:
                   antecedents    consequents  antecedent support  \
0                     (Income)          (Age)            0.833333   
1                        (Age)       (Income)            0.833333   
2              (Education_Low)          (Age)            0.333333   
4              (Education_Low)       (Income)            0.333333   
6      (Education_Low, Income)          (Age)            0.333333   
8         (Education_Low, Age)       (Income)            0.333333   
10             (Education_Low)  (Age, Income)            0.333333   
12  (Education_Medium, Income)          (Age)            0.333333   
13     (Education_Medium, Age)       (Income)            0.333333   
16       (Gender_Male, Income)          (Age)            0.333333   
17          (Gender_Male, Age)       (Income)            0.333333   

    consequent support   support  confidence  lift  leverage  conviction  \
0             0.833333  0.833333         1.0   1.2  0.138889       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomaly_df[['Age', 'Income']] = discretizer.fit_transform(anomaly_df[['Age', 'Income']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomaly_df['Age'] = anomaly_df['Age'] <= threshold
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomaly_df['Income'] = anomaly_df['Income'] <= threshold


In [12]:
print(df['Anomaly'].unique())


[False  True]


In [18]:
# Convert association rules to a human-readable format
def convert_rules_to_readable(rules_df):
    readable_rules = []
    for _, row in rules_df.iterrows():
        antecedents = row['antecedents']
        consequents = row['consequents']
        support = row['support']
        confidence = row['confidence']
        lift = row['lift']
        
        antecedent_str = ', '.join([str(antecedent) for antecedent in antecedents])
        consequent_str = ', '.join([str(consequent) for consequent in consequents])
        
        rule_str = f"If {antecedent_str}, then {consequent_str}"
        metrics_str = f"Support: {support:.4f}, Confidence: {confidence:.4f}, Lift: {lift:.4f}"
        
        readable_rule = f"{rule_str} ({metrics_str})"
        readable_rules.append(readable_rule)
    
    return readable_rules

# Convert the filtered association rules to human-readable format
human_readable_rules = convert_rules_to_readable(filtered_rules)

# Print the human-readable rules
print("Human-Readable Association Rules:")
for rule in human_readable_rules:
    print(rule)


Human-Readable Association Rules:
If Income, then Age (Support: 0.8333, Confidence: 1.0000, Lift: 1.2000)
If Age, then Income (Support: 0.8333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, Income, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, Age, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Low, then Age, Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Medium, Income, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Education_Medium, Age, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Gender_Male, Income, then Age (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)
If Gender_Male, Age, then Income (Support: 0.3333, Confidence: 1.0000, Lift: 1.2000)


In [20]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Separate your dataset into features and target variable
X = df[['Age', 'Income', 'Education', 'Gender']]
y = df['Anomaly']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the decision tree classifier
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

# Extract rules where Anomaly is True
def extract_rules(tree, feature_names):
    rules = []
    for node in tree.children_left:
        if node != -1:
            feature = feature_names[tree.feature[node]]
            threshold = tree.threshold[node]
            left_rule = f"{feature} <= {threshold}"
            right_rule = f"{feature} > {threshold}"
            rules.append(left_rule)
            rules.append(right_rule)
            rules.extend(extract_rules(tree.children_left[node], feature_names))
            rules.extend(extract_rules(tree.children_right[node], feature_names))
    return rules

rules = extract_rules(clf.tree_, X.columns.tolist())
print(rules)


ValueError: could not convert string to float: 'Medium'

In [24]:
import pyfpgrowth

# Convert numerical columns to strings within the transactions
transactions = df[df['Anomaly']].drop(columns=['Anomaly']).astype(str).values.tolist()

# Perform FP-Growth
patterns = pyfpgrowth.find_frequent_patterns(transactions, 2)  # Adjust the support threshold as needed
rules = pyfpgrowth.generate_association_rules(patterns, 0.7)  # Adjust the confidence threshold as needed

print(rules)


{('30',): (('60000', 'Female', 'Low'), 1.0), ('30', 'Female'): (('60000', 'Low'), 1.0), ('30', 'Low'): (('60000', 'Female'), 1.0), ('Female', 'Low'): (('30', '60000'), 1.0), ('60000',): (('30', 'Female', 'Low'), 1.0), ('30', '60000'): (('Female', 'Low'), 1.0), ('60000', 'Low'): (('30', 'Female'), 1.0), ('60000', 'Female'): (('30', 'Low'), 1.0), ('30', '60000', 'Female'): (('Low',), 1.0), ('30', '60000', 'Low'): (('Female',), 1.0), ('30', 'Female', 'Low'): (('60000',), 1.0), ('60000', 'Female', 'Low'): (('30',), 1.0), ('35',): (('70000', 'Female', 'Medium'), 1.0), ('35', 'Female'): (('70000', 'Medium'), 1.0), ('35', 'Medium'): (('70000', 'Female'), 1.0), ('70000',): (('35', 'Female', 'Medium'), 1.0), ('35', '70000'): (('Female', 'Medium'), 1.0), ('70000', 'Female'): (('35', 'Medium'), 1.0), ('70000', 'Medium'): (('35', 'Female'), 1.0), ('35', '70000', 'Female'): (('Medium',), 1.0), ('35', '70000', 'Medium'): (('Female',), 1.0), ('35', 'Female', 'Medium'): (('70000',), 1.0), ('70000', 'F

In [38]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rule_combinations = []
for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        rule_combinations.append(combination)

# Generate rules based on selected combinations
rules = []
for combination in rule_combinations:
    conditions = []
    for column in combination:
        if df[column].dtype == 'object':
            condition = f"{column} == '{anomaly_df[column].iloc[0]}'"
        else:
            condition = f"{column} >= {anomaly_df[column].iloc[0]}"
        conditions.append(condition)
    rule = " & ".join(conditions)
    rules.append(rule)

# Apply the generated rules to the anomaly_df using the query method
for i, rule in enumerate(rules, 1):
    filtered_data = anomaly_df.query(rule)
    print(f"Rule {i}: {rule}")
    print(f"Filtered Data using Rule {i}:")
    #print(filtered_data)


Rule 1: Age >= 30
Filtered Data using Rule 1:
Rule 2: Income >= 60000
Filtered Data using Rule 2:
Rule 3: Education == 'Low'
Filtered Data using Rule 3:
Rule 4: Gender == 'Female'
Filtered Data using Rule 4:
Rule 5: Age >= 30 & Income >= 60000
Filtered Data using Rule 5:
Rule 6: Age >= 30 & Education == 'Low'
Filtered Data using Rule 6:
Rule 7: Age >= 30 & Gender == 'Female'
Filtered Data using Rule 7:
Rule 8: Income >= 60000 & Education == 'Low'
Filtered Data using Rule 8:
Rule 9: Income >= 60000 & Gender == 'Female'
Filtered Data using Rule 9:
Rule 10: Education == 'Low' & Gender == 'Female'
Filtered Data using Rule 10:
Rule 11: Age >= 30 & Income >= 60000 & Education == 'Low'
Filtered Data using Rule 11:
Rule 12: Age >= 30 & Income >= 60000 & Gender == 'Female'
Filtered Data using Rule 12:
Rule 13: Age >= 30 & Education == 'Low' & Gender == 'Female'
Filtered Data using Rule 13:
Rule 14: Income >= 60000 & Education == 'Low' & Gender == 'Female'
Filtered Data using Rule 14:
Rule 15: A

In [41]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate means for numerical columns in 'anomaly_df'
numerical_means = anomaly_df[['Age', 'Income']].mean()

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rule_combinations = []
for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        rule_combinations.append(combination)

# Generate rules based on selected combinations
rules = []
for combination in rule_combinations:
    conditions = []
    for column in combination:
        if df[column].dtype == 'object':
            condition = f"{column} == '{anomaly_df[column].iloc[0]}'"
        else:
            # Use mean as cutoff for numerical columns
            condition = f"{column} >= {numerical_means[column]}"
        conditions.append(condition)
    rule = " & ".join(conditions)
    rules.append(rule)

# Apply the generated rules to the anomaly_df using the query method
for i, rule in enumerate(rules, 1):
    filtered_data = anomaly_df.query(rule)
    print(f"Rule {i}: {rule}")
    print(f"Filtered Data using Rule {i}:")
    #print(filtered_data)


Rule 1: Age >= 33.333333333333336
Filtered Data using Rule 1:
Rule 2: Income >= 66500.0
Filtered Data using Rule 2:
Rule 3: Education == 'Low'
Filtered Data using Rule 3:
Rule 4: Gender == 'Female'
Filtered Data using Rule 4:
Rule 5: Age >= 33.333333333333336 & Income >= 66500.0
Filtered Data using Rule 5:
Rule 6: Age >= 33.333333333333336 & Education == 'Low'
Filtered Data using Rule 6:
Rule 7: Age >= 33.333333333333336 & Gender == 'Female'
Filtered Data using Rule 7:
Rule 8: Income >= 66500.0 & Education == 'Low'
Filtered Data using Rule 8:
Rule 9: Income >= 66500.0 & Gender == 'Female'
Filtered Data using Rule 9:
Rule 10: Education == 'Low' & Gender == 'Female'
Filtered Data using Rule 10:
Rule 11: Age >= 33.333333333333336 & Income >= 66500.0 & Education == 'Low'
Filtered Data using Rule 11:
Rule 12: Age >= 33.333333333333336 & Income >= 66500.0 & Gender == 'Female'
Filtered Data using Rule 12:
Rule 13: Age >= 33.333333333333336 & Education == 'Low' & Gender == 'Female'
Filtered Da

In [42]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate means for numerical columns in 'anomaly_df'
numerical_means = anomaly_df[['Age', 'Income']].mean()

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rule_combinations = []
for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        rule_combinations.append(combination)

# Generate rules based on selected combinations
rules = []
anomalies_count = []

for combination in rule_combinations:
    conditions = []
    for column in combination:
        if df[column].dtype == 'object':
            condition = f"{column} == '{anomaly_df[column].iloc[0]}'"
        else:
            # Use mean as cutoff for numerical columns
            condition = f"{column} >= {numerical_means[column]}"
        conditions.append(condition)
    rule = " & ".join(conditions)
    rules.append(rule)

    # Count anomalies matching the rule
    count = len(anomaly_df.query(rule))
    anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 33.333333333333336
Anomalies Count using Rule 1: 10

Rule 2: Income >= 66500.0
Anomalies Count using Rule 2: 10

Rule 3: Education == 'Low'
Anomalies Count using Rule 3: 10

Rule 4: Gender == 'Female'
Anomalies Count using Rule 4: 15

Rule 5: Age >= 33.333333333333336 & Income >= 66500.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 33.333333333333336 & Education == 'Low'
Anomalies Count using Rule 6: 0

Rule 7: Age >= 33.333333333333336 & Gender == 'Female'
Anomalies Count using Rule 7: 5

Rule 8: Income >= 66500.0 & Education == 'Low'
Anomalies Count using Rule 8: 0

Rule 9: Income >= 66500.0 & Gender == 'Female'
Anomalies Count using Rule 9: 5

Rule 10: Education == 'Low' & Gender == 'Female'
Anomalies Count using Rule 10: 5

Rule 11: Age >= 33.333333333333336 & Income >= 66500.0 & Education == 'Low'
Anomalies Count using Rule 11: 0

Rule 12: Age >= 33.333333333333336 & Income >= 66500.0 & Gender == 'Female'
Anomalies Count using Rule 12: 5

Rule 13: Age >= 33.3333

In [44]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])
print("percentiles", percentiles)
# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rule_combinations = []
for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        rule_combinations.append(combination)

# Generate rules based on selected combinations
rules = []
anomalies_count = []

for combination in rule_combinations:
    conditions = []
    for column in combination:
        if df[column].dtype == 'object':
            condition = f"{column} == '{anomaly_df[column].iloc[0]}'"
        else:
            # Use 75th percentile as cutoff for Age and Income
            percentile_75 = percentiles.loc[0.75, column]
            condition = f"{column} >= {percentile_75}"
        conditions.append(condition)
    rule = " & ".join(conditions)
    rules.append(rule)

    # Count anomalies matching the rule
    count = len(anomaly_df.query(rule))
    anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


percentiles        Age   Income
0.25  29.0  59000.0
0.50  31.5  62500.0
0.75  35.0  70000.0
Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: Education == 'Low'
Anomalies Count using Rule 3: 10

Rule 4: Gender == 'Female'
Anomalies Count using Rule 4: 15

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & Education == 'Low'
Anomalies Count using Rule 6: 0

Rule 7: Age >= 35.0 & Gender == 'Female'
Anomalies Count using Rule 7: 5

Rule 8: Income >= 70000.0 & Education == 'Low'
Anomalies Count using Rule 8: 0

Rule 9: Income >= 70000.0 & Gender == 'Female'
Anomalies Count using Rule 9: 5

Rule 10: Education == 'Low' & Gender == 'Female'
Anomalies Count using Rule 10: 5

Rule 11: Age >= 35.0 & Income >= 70000.0 & Education == 'Low'
Anomalies Count using Rule 11: 0

Rule 12: Age >= 35.0 & Income >= 70000.0 & Gender == 'Female'
Anomalies Count using Rule 12: 5

Rule 13: Age >=

In [46]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Extract rows where Anomaly is True
anomaly_df = df[df['Anomaly'] == True]

# Initialize an empty list to store the combinations and their anomaly counts
rule_combinations = []
anomaly_counts = []

# Loop through the columns and generate combinations of rules
for num_columns in range(1, len(anomaly_df.columns)):
    for combination in itertools.combinations(anomaly_df.columns, num_columns):
        # Exclude 'Anomaly' from the combination
        if 'Anomaly' in combination:
            continue
        
        # Create a rule for each value combination
        rule = ""
        subset = anomaly_df
        
        for column in combination:
            if column in anomaly_df.select_dtypes(include='number').columns:
                # For numeric columns, use >= as the condition
                mean_value = anomaly_df[column].mean()
                rule += f"{column} >= {mean_value} & "
                subset = subset[subset[column] >= mean_value]
            else:
                # For categorical columns, include the category exactly as is
                unique_value = anomaly_df[column].unique()[0]
                rule += f"{column} == '{unique_value}' & "
        
        # Remove the trailing ' & ' from the rule
        rule = rule[:-3]
        
        # Count the number of anomalies for this rule
        anomaly_count = len(subset)
        
        rule_combinations.append(rule)
        anomaly_counts.append(anomaly_count)

# Create a DataFrame to store the rules and their corresponding anomaly counts
rule_df = pd.DataFrame({'Rule': rule_combinations, 'Anomaly_Count': anomaly_counts})

# Sort the DataFrame by Anomaly_Count in descending order
sorted_rule_df = rule_df.sort_values(by='Anomaly_Count', ascending=False)

# Print the top 20 rules with the highest number of anomalies
top_20_rules = sorted_rule_df.head(50)
print(top_20_rules)


                                                 Rule  Anomaly_Count
2                                  Education == 'Low'             30
3                                  Gender == 'Female'             30
9             Education == 'Low' & Gender == 'Female'             30
0                           Age >= 33.333333333333336             10
1                                   Income >= 66500.0             10
4       Age >= 33.333333333333336 & Income >= 66500.0             10
5      Age >= 33.333333333333336 & Education == 'Low'             10
6      Age >= 33.333333333333336 & Gender == 'Female'             10
7              Income >= 66500.0 & Education == 'Low'             10
8              Income >= 66500.0 & Gender == 'Female'             10
10  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
11  Age >= 33.333333333333336 & Income >= 66500.0 ...             10
12  Age >= 33.333333333333336 & Education == 'Low'...             10
13  Income >= 66500.0 & Education 

In [47]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rule_combinations = []
for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        rule_combinations.append(combination)

# Generate rules based on selected combinations with both "AND" and "OR" conditions for categorical variables
rules = []
anomalies_count = []

for combination in rule_combinations:
    conditions = []
    or_conditions = []
    
    for column in combination:
        if df[column].dtype == 'object':
            # Create an "OR" condition for categorical variables
            unique_values = anomaly_df[column].unique()
            or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
            or_conditions.append(f"({or_condition})")
        else:
            # Use 75th percentile as cutoff for Age and Income
            percentile_75 = percentiles.loc[0.75, column]
            conditions.append(f"{column} >= {percentile_75}")
    
    if or_conditions:
        # Combine "OR" conditions with "AND" conditions
        conditions.extend(or_conditions)
    
    rule = " & ".join(conditions)
    rules.append(rule)

    # Count anomalies matching the rule
    count = len(anomaly_df.query(rule))
    anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [48]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions with "|" and "&" for categorical variables
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create an "OR" condition for categorical variables
                unique_values = anomaly_df[column].unique()
                or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
                conditions.append(f"({or_condition})")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" for this combination
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [49]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions with "|" and "&" for categorical variables
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create a mix of "|" and "&" conditions for categorical variables
                unique_values = anomaly_df[column].unique()
                or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
                and_condition = f"({or_condition})"
                conditions.append(and_condition)
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" for this combination
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [50]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions with "|" and "&" for categorical variables
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create a mix of "|" and "&" conditions for categorical variables in the rule
                unique_values = anomaly_df[column].unique()
                or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
                conditions.append(f"({or_condition})")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [51]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions with "|" between different categorical variables
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create an "OR" condition for categorical variables within the same rule
                unique_values = anomaly_df[column].unique()
                or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
                conditions.append(f"({or_condition})")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [52]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions with "|" between different categorical variables
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create an "OR" condition for categorical variables within the same rule
                unique_values = anomaly_df[column].unique()
                or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
                conditions.append(f"({or_condition})")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [53]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions with "|" between different categorical variables
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Combine different categorical variables in the same "|" condition
                unique_values = anomaly_df[column].unique()
                or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
                conditions.append(f"({or_condition})")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [54]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions with "|" between different categorical variables
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Combine different categorical variables in the same "|" condition
                unique_values = anomaly_df[column].unique()
                or_condition = " | ".join([f"{column} == '{value}'" for value in unique_values])
                conditions.append(f"({or_condition})")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [55]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        categorical_conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create individual categorical conditions for each variable
                unique_values = anomaly_df[column].unique()
                categorical_conditions.append(" | ".join([f"{column} == '{value}'" for value in unique_values]))
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine categorical conditions with "|" within the same rule
        if categorical_conditions:
            conditions.append(" | ".join(categorical_conditions))
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: Education == 'Low' | Education == 'Medium' | Education == 'High'
Anomalies Count using Rule 3: 30

Rule 4: Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & Education == 'Low' | Education == 'Medium' | Education == 'High'
Anomalies Count using Rule 6: 20

Rule 7: Age >= 35.0 & Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 7: 20

Rule 8: Income >= 70000.0 & Education == 'Low' | Education == 'Medium' | Education == 'High'
Anomalies Count using Rule 8: 20

Rule 9: Income >= 70000.0 & Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 9: 20

Rule 10: Education == 'Low' | Education == 'Medium' | Education == 'High' | Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35.0 & Income >= 7

In [56]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        categorical_conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create individual categorical conditions for each variable
                unique_values = anomaly_df[column].unique()
                categorical_conditions.append(" | ".join([f"{column} == '{value}'" for value in unique_values]))
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine categorical conditions with "|" within the same rule
        if categorical_conditions:
            conditions.append(" | ".join(categorical_conditions))
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: Education == 'Low' | Education == 'Medium' | Education == 'High'
Anomalies Count using Rule 3: 30

Rule 4: Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & Education == 'Low' | Education == 'Medium' | Education == 'High'
Anomalies Count using Rule 6: 20

Rule 7: Age >= 35.0 & Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 7: 20

Rule 8: Income >= 70000.0 & Education == 'Low' | Education == 'Medium' | Education == 'High'
Anomalies Count using Rule 8: 20

Rule 9: Income >= 70000.0 & Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 9: 20

Rule 10: Education == 'Low' | Education == 'Medium' | Education == 'High' | Gender == 'Female' | Gender == 'Male'
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35.0 & Income >= 7

In [57]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Generate all possible combinations of conditions
rules = []
anomalies_count = []

for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        
        for column in combination:
            if df[column].dtype == 'object':
                # Create individual categorical conditions for each variable
                unique_values = anomaly_df[column].unique()
                categorical_conditions = [
                    f"{column} == '{value}'" for value in unique_values
                ]
                conditions.append("(" + " | ".join(categorical_conditions) + ")")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)
        rules.append(rule)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))
        anomalies_count.append(count)

# Display rules along with anomalies count
for i, (rule, count) in enumerate(zip(rules, anomalies_count), 1):
    print(f"Rule {i}: {rule}")
    print(f"Anomalies Count using Rule {i}: {count}")
    print()


Rule 1: Age >= 35.0
Anomalies Count using Rule 1: 10

Rule 2: Income >= 70000.0
Anomalies Count using Rule 2: 10

Rule 3: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 3: 30

Rule 4: (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 4: 30

Rule 5: Age >= 35.0 & Income >= 70000.0
Anomalies Count using Rule 5: 10

Rule 6: Age >= 35.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 6: 10

Rule 7: Age >= 35.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 7: 10

Rule 8: Income >= 70000.0 & (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Rule 8: 10

Rule 9: Income >= 70000.0 & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 9: 10

Rule 10: (Education == 'Low' | Education == 'Medium' | Education == 'High') & (Gender == 'Female' | Gender == 'Male')
Anomalies Count using Rule 10: 30

Rule 11: Age >= 35

In [59]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Filter rows where 'Anomaly' is True
anomaly_df = df[df['Anomaly']]

# Calculate percentiles for numerical columns in 'anomaly_df'
percentiles = anomaly_df[['Age', 'Income']].quantile([0.25, 0.50, 0.75])

# Define the columns you want to consider for generating rules
columns_to_consider = ['Age', 'Income', 'Education', 'Gender']

# Initialize variables to track the combination with the most anomalies
max_anomalies = 0
best_combination = []

# Generate all possible combinations of conditions
for r in range(1, len(columns_to_consider) + 1):
    for combination in itertools.combinations(columns_to_consider, r):
        conditions = []
        for column in combination:
            if df[column].dtype == 'object':
                # Create individual categorical conditions for each variable
                unique_values = anomaly_df[column].unique()
                categorical_conditions = [
                    f"{column} == '{value}'" for value in unique_values
                ]
                conditions.append("(" + " | ".join(categorical_conditions) + ")")
            else:
                # Use 75th percentile as cutoff for Age and Income
                percentile_75 = percentiles.loc[0.75, column]
                conditions.append(f"{column} >= {percentile_75}")
        
        # Combine all conditions with "&" in the rule
        rule = " & ".join(conditions)

        # Count anomalies matching the rule
        count = len(anomaly_df.query(rule))

        # Check if this combination produces more anomalies
        if count > max_anomalies:
            max_anomalies = count
            best_combination = combination

# Generate the rule based on the best combination
best_conditions = []
for column in best_combination:
    if df[column].dtype == 'object':
        unique_values = anomaly_df[column].unique()
        categorical_conditions = [f"{column} == '{value}'" for value in unique_values]
        best_conditions.append("(" + " | ".join(categorical_conditions) + ")")
    else:
        percentile_75 = percentiles.loc[0.75, column]
        best_conditions.append(f"{column} >= {percentile_75}")

best_rule = " & ".join(best_conditions)

# Display the best rule and the number of anomalies it captures
print(f"Best Rule: {best_rule}")
print(f"Anomalies Count using Best Rule: {max_anomalies}")


Best Rule: (Education == 'Low' | Education == 'Medium' | Education == 'High')
Anomalies Count using Best Rule: 30
