In [1]:
import pandas as pd
import itertools

# Create the dataframe
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Anomaly': [False, True, False, True, True, False, True, False, True, True] * 5
}
df = pd.DataFrame(data)

# Initialize variables
all_rules = []
max_rules = 20  # Generate more rules

# Define which numerical features should be greater than while creating combinations
numerical_features_greater = ['Income']

# Generate combinations of features, including both numerical and categorical variables
numerical_features = ['Age', 'Income']
categorical_features = ['Education', 'Gender']

# Helper function to generate all combinations of AND and OR conditions for categorical variables
def generate_categorical_combinations(categories, variable_name):
    combinations = []
    # Generate all possible OR conditions with one unique value for each categorical variable
    for category in categories:
        combinations.append(f"({variable_name} == '{category}')")

    # Generate all possible AND conditions by adding ' AND ' between OR conditions
    and_condition = ' AND '.join(combinations)
    combinations.append(and_condition)
    return combinations

# Initialize a dictionary to store the total number of rows captured by each rule
rows_captured_by_rule = {}

# Generate combinations of categorical features first
for cat_features in range(1, len(categorical_features) + 1):
    for num_features in itertools.combinations(categorical_features, cat_features):
        feature_combination = list(num_features) + numerical_features

        # Check if the combination is valid (Income should be greater)
        if all(feature in feature_combination for feature in numerical_features_greater):
            # Apply the combination of features
            subset_df = df[feature_combination + ['Anomaly']]

            # Filter the subset where the rule is satisfying (Anomaly is True)
            satisfying_subset = subset_df[subset_df['Anomaly']]

            # Calculate the number of anomalies in this subset
            num_anomalies = satisfying_subset['Anomaly'].sum()

            # Determine dynamic cutoffs for numerical features based on this subset
            cutoffs = {}
            for feature in numerical_features:
                if feature in numerical_features_greater:
                    # Use the average value where anomalies are present
                    cutoff = satisfying_subset[feature].mean()
                else:
                    cutoff = 0  # Default cutoff for other numerical features
                cutoffs[feature] = cutoff

            # Generate rules for the current feature combination with dynamic cutoffs
            current_rules = []
            categorical_rule_added = set()  # To track which categorical variables have been added
            for feature in feature_combination:
                if feature in numerical_features:
                    cutoff = cutoffs.get(feature, 0)  # Get dynamic cutoff or default to 0
                    rule_condition = f"({feature} > {cutoff})"
                else:
                    variable_name = feature
                    categories = df[feature].unique()
                    if feature in categorical_features and variable_name not in categorical_rule_added:
                        categorical_rule_added.add(variable_name)
                        # Include only a single category in the rule
                        rule_condition = f"({variable_name} == '{categories[0]}')"
                    else:
                        # Include all categories in the rule
                        category_combinations = generate_categorical_combinations(categories, variable_name)
                        rule_condition = ' AND '.join(category_combinations)
                current_rules.append(rule_condition)

            # Combine the rules into a single rule for the current feature combination
            combined_rule = " AND ".join(current_rules)

            # Add the rule to the list
            rule = {
                'Features': ', '.join(feature_combination),
                'Num Anomalies Captured': num_anomalies,
                'Rule Condition': combined_rule,
                'Total Rows Captured': len(satisfying_subset),
                'Percentage Anomalies Captured': (num_anomalies / len(satisfying_subset)) * 100
            }
            all_rules.append(rule)

            # Store the total number of rows captured by this rule
            rows_captured_by_rule[combined_rule] = len(satisfying_subset)

# Sort all the rules by the number of anomalies detected in descending order
all_rules.sort(key=lambda x: x['Num Anomalies Captured'], reverse=True)

# Print the top 20 rules with anomalies counts, total rows captured, and percentage anomalies captured
for i, rule in enumerate(all_rules[:max_rules]):
    num_anomalies = rule['Num Anomalies Captured']
    rows_captured = rule['Total Rows Captured']
    percentage_anomalies = rule['Percentage Anomalies Captured']
    print(f"Rule {i + 1}:")
    print(f"Features: {rule['Features']}")
    print(f"Num Anomalies Captured: {num_anomalies}")
    print(f"Total Rows Captured: {rows_captured}")
    print(f"Percentage Anomalies Captured: {percentage_anomalies:.2f}%")
    print(f"Rule Condition: {rule['Rule Condition']}\n")


Rule 1:
Features: Education, Age, Income
Num Anomalies Captured: 30
Total Rows Captured: 30
Percentage Anomalies Captured: 100.00%
Rule Condition: (Education == 'High') AND (Age > 0) AND (Income > 66500.0)

Rule 2:
Features: Gender, Age, Income
Num Anomalies Captured: 30
Total Rows Captured: 30
Percentage Anomalies Captured: 100.00%
Rule Condition: (Gender == 'Male') AND (Age > 0) AND (Income > 66500.0)

Rule 3:
Features: Education, Gender, Age, Income
Num Anomalies Captured: 30
Total Rows Captured: 30
Percentage Anomalies Captured: 100.00%
Rule Condition: (Education == 'High') AND (Gender == 'Male') AND (Age > 0) AND (Income > 66500.0)



In [4]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

# Define the input text
text = "Education is High, Gender is Male."

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text and remove stopwords
nltk.download("punkt")
nltk.download("stopwords")

text_tokens = word_tokenize(text)
stop_words = set(stopwords.words("english"))
filtered_text_tokens = [word for word in text_tokens if word.lower() not in stop_words]

# Create a dictionary to store the similarity scores for each rule
rule_similarity = {}

# Calculate Jaccard similarity between the input text and each rule
for rule_name, rule_text in Anomaly_rule_list.items():
    rule_tokens = word_tokenize(rule_text)
    filtered_rule_tokens = [word for word in rule_tokens if word.lower() not in stop_words]
    
    # Calculate Jaccard similarity
    intersection = len(set(filtered_text_tokens).intersection(filtered_rule_tokens))
    union = len(set(filtered_text_tokens).union(filtered_rule_tokens))
    jaccard_similarity = intersection / union
    
    rule_similarity[rule_name] = jaccard_similarity

# Find the rule with the highest similarity score
most_similar_rule = max(rule_similarity, key=rule_similarity.get)

# Get the corresponding rule text
matched_rule_text = Anomaly_rule_list[most_similar_rule]

print(f"Input Text: {text}")
print(f"Most Similar Rule: {most_similar_rule}")
print(f"Matched Rule Text: {matched_rule_text}")


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


Input Text: Education is High, Gender is Male.
Most Similar Rule: Rule 1
Matched Rule Text: (Education == 'High') AND (Gender == 'Male')


In [5]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import FreqDist

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male."""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text and remove stopwords
nltk.download("punkt")
nltk.download("stopwords")

text_tokens = word_tokenize(text)
stop_words = set(stopwords.words("english"))
filtered_text_tokens = [word for word in text_tokens if word.lower() not in stop_words]

# Create a dictionary to store the similarity scores for each rule
rule_similarity = {}

# Calculate Jaccard similarity between the input text and each rule
for rule_name, rule_text in Anomaly_rule_list.items():
    rule_tokens = word_tokenize(rule_text)
    filtered_rule_tokens = [word for word in rule_tokens if word.lower() not in stop_words]
    
    # Calculate Jaccard similarity
    intersection = len(set(filtered_text_tokens).intersection(filtered_rule_tokens))
    union = len(set(filtered_text_tokens).union(filtered_rule_tokens))
    jaccard_similarity = intersection / union
    
    rule_similarity[rule_name] = jaccard_similarity

# Find the rule with the highest similarity score
most_similar_rule = max(rule_similarity, key=rule_similarity.get)

# Get the corresponding rule text
matched_rule_text = Anomaly_rule_list[most_similar_rule]

print(f"Input Text: {text}")
print(f"Most Similar Rule: {most_similar_rule}")
print(f"Matched Rule Text: {matched_rule_text}")


Input Text: Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come 

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male and age 0"""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text and remove stopwords
nltk.download("punkt")
nltk.download("stopwords")

text_sentences = sent_tokenize(text)
stop_words = set(stopwords.words("english"))

# Create a dictionary to store the matched sentences for each rule
matched_sentences = {}

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence)
    filtered_sentence_tokens = [word for word in sentence_tokens if word.lower() not in stop_words]
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_tokens = word_tokenize(rule_text)
        filtered_rule_tokens = [word for word in rule_tokens if word.lower() not in stop_words]
        
        # Calculate Jaccard similarity
        intersection = len(set(filtered_sentence_tokens).intersection(filtered_rule_tokens))
        union = len(set(filtered_sentence_tokens).union(filtered_rule_tokens))
        jaccard_similarity = intersection / union
        print(jaccard_similarity)
        # If the similarity is above a threshold (e.g., 0.5), consider it a match
        if jaccard_similarity > 0.15:
            if rule_name in matched_sentences:
                matched_sentences[rule_name].append(sentence)
            else:
                matched_sentences[rule_name] = [sentence]

# Print matched sentences for each rule
for rule_name, matched_sentence_list in matched_sentences.items():
    print(f"Rule '{rule_name}' Matches:")
    for sentence in matched_sentence_list:
        print(f"- {sentence}")
    print()


0.23076923076923078
0.11764705882352941
0.0
0.05263157894736842
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.15384615384615385
0.125
Rule 'Rule 1' Matches:
- Education is 'Low' , 'Medium' and Gender is male.
- Education is High , Gender is Male and age 0



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [24]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male age 0"""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text and remove stopwords
nltk.download("punkt")
nltk.download("stopwords")

text_sentences = sent_tokenize(text)
stop_words = set(stopwords.words("english"))

# Create dictionaries to store the matched sentences and partial matches for each rule
matched_sentences = {}
partial_matches = {}

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence)
    filtered_sentence_tokens = [word for word in sentence_tokens if word.lower() not in stop_words]
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_tokens = word_tokenize(rule_text)
        filtered_rule_tokens = [word for word in rule_tokens if word.lower() not in stop_words]
        
        # Calculate Jaccard similarity
        intersection = len(set(filtered_sentence_tokens).intersection(filtered_rule_tokens))
        union = len(set(filtered_sentence_tokens).union(filtered_rule_tokens))
        jaccard_similarity = intersection / union
        #print("jaccard_similarity", jaccard_similarity)
        # If the similarity is above a threshold (e.g., 0.5), consider it a match
        if jaccard_similarity > 0.06:
            if jaccard_similarity >= 0.23:
                if rule_name in matched_sentences:
                    matched_sentences[rule_name].append(sentence)
                else:
                    matched_sentences[rule_name] = [sentence]
            else:
                if rule_name in partial_matches:
                    partial_matches[rule_name].append(sentence)
                else:
                    partial_matches[rule_name] = [sentence]

# Print exact rule matches
print("Exact Rule Matches:")
for rule_name, matched_sentence_list in matched_sentences.items():
    print(f"Rule '{rule_name}' Matches:")
    for sentence in matched_sentence_list:
        print(f"- {sentence}")
    print()

# Print partial rule matches
print("Partial Rule Matches:")
for rule_name, partial_match_sentence_list in partial_matches.items():
    print(f"Rule '{rule_name}' Partial Matches:")
    for sentence in partial_match_sentence_list:
        print(f"- {sentence}")
    print()


Exact Rule Matches:
Rule 'Rule 1' Matches:
- Education is 'Low' , 'Medium' and Gender is male.

Partial Rule Matches:
Rule 'Rule 2' Partial Matches:
- Education is 'Low' , 'Medium' and Gender is male.
- Education is High , Gender is Male age 0

Rule 'Rule 1' Partial Matches:
- Education is High , Gender is Male age 0



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pixel\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male age 0"""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text and remove stopwords
# nltk.download("punkt")
# nltk.download("stopwords")

text_sentences = sent_tokenize(text)
stop_words = set(stopwords.words("english"))

# Create dictionaries to store the matched sentences and partial matches for each rule
matched_sentences = {}
partial_matches = {}

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence)
    filtered_sentence_tokens = [word for word in sentence_tokens if word.lower() not in stop_words]
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_tokens = word_tokenize(rule_text)
        filtered_rule_tokens = [word for word in rule_tokens if word.lower() not in stop_words]
        
        # Calculate Jaccard similarity
        intersection = len(set(filtered_sentence_tokens).intersection(filtered_rule_tokens))
        union = len(set(filtered_sentence_tokens).union(filtered_rule_tokens))
        jaccard_similarity = intersection / union
        #print("jaccard_similarity", jaccard_similarity)
        # If the similarity is above a threshold (e.g., 0.5), consider it a match
        if jaccard_similarity > 0.06:
            if jaccard_similarity >= 0.23:
                if rule_name in matched_sentences:
                    matched_sentences[rule_name].append(sentence)
                else:
                    matched_sentences[rule_name] = [sentence]
            else:
                if rule_name in partial_matches:
                    partial_matches[rule_name].append(sentence)
                else:
                    partial_matches[rule_name] = [sentence]

# Print exact rule matches
print("Exact Rule Matches:")
for rule_name, matched_sentence_list in matched_sentences.items():
    print(f"Rule '{rule_name}' Matches:")
    for sentence in matched_sentence_list:
        print(f"- {sentence}")
    print()

# Print partial rule matches
print("Partial Rule Matches:")
for rule_name, partial_match_sentence_list in partial_matches.items():
    print(f"Rule '{rule_name}' Partial Matches:")
    for sentence in partial_match_sentence_list:
        print(f"- {sentence}")
    print()


Exact Rule Matches:
Rule 'Rule 1' Matches:
- Education is 'Low' , 'Medium' and Gender is male.

Partial Rule Matches:
Rule 'Rule 2' Partial Matches:
- Education is 'Low' , 'Medium' and Gender is male.
- Education is High , Gender is Male age 0

Rule 'Rule 1' Partial Matches:
- Education is High , Gender is Male age 0



In [31]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
#from nltk.corpus import stopwords

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male age 0"""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text and remove stopwords
# nltk.download("punkt")
# nltk.download("stopwords")

# Define a list of common stopwords to remove
stop_words = [
    "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves",
    "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their",
    "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was",
    "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and",
    "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between",
    "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off",
    "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
    "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so",
    "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"
]

text_sentences = sent_tokenize(text)
#stop_words = set(stopwords.words("english"))
stop_words = stop_words

# Create dictionaries to store the matched sentences and partial matches for each rule
matched_sentences = {}
partial_matches = {}

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence)
    filtered_sentence_tokens = [word for word in sentence_tokens if word.lower() not in stop_words]
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_tokens = word_tokenize(rule_text)
        filtered_rule_tokens = [word for word in rule_tokens if word.lower() not in stop_words]
        
        # Calculate Jaccard similarity
        intersection = len(set(filtered_sentence_tokens).intersection(filtered_rule_tokens))
        union = len(set(filtered_sentence_tokens).union(filtered_rule_tokens))
        jaccard_similarity = intersection / union
        #print("jaccard_similarity", jaccard_similarity)
        # If the similarity is above a threshold (e.g., 0.5), consider it a match
        if jaccard_similarity > 0.06:
            if jaccard_similarity >= 0.23:
                if rule_name in matched_sentences:
                    matched_sentences[rule_name].append(sentence)
                else:
                    matched_sentences[rule_name] = [sentence]
            else:
                if rule_name in partial_matches:
                    partial_matches[rule_name].append(sentence)
                else:
                    partial_matches[rule_name] = [sentence]

# Print exact rule matches
print("Exact Rule Matches:")
for rule_name, matched_sentence_list in matched_sentences.items():
    print(f"Rule '{rule_name}' Matches:")
    for sentence in matched_sentence_list:
        print(f"- {sentence}")
    print()

# Print partial rule matches
print("Partial Rule Matches:")
for rule_name, partial_match_sentence_list in partial_matches.items():
    print(f"Rule '{rule_name}' Partial Matches:")
    for sentence in partial_match_sentence_list:
        print(f"- {sentence}")
    print()


Exact Rule Matches:
Rule 'Rule 1' Matches:
- Education is 'Low' , 'Medium' and Gender is male.

Partial Rule Matches:
Rule 'Rule 2' Partial Matches:
- Education is 'Low' , 'Medium' and Gender is male.
- Education is High , Gender is Male age 0

Rule 'Rule 1' Partial Matches:
- Education is High , Gender is Male age 0



In [35]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male."""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text
text_sentences = text.split('.')  # Split the text into sentences based on periods

# Define a list of common stopwords to remove
stop_words = set(stopwords.words("english"))

# Create dictionaries to store the matched sentences and partial matches for each rule
matched_sentences = {}
partial_matches = {}

# Define a function to preprocess and extract main words from a rule
def extract_main_words(rule_text):
    # Remove extra spaces and 'AND' from the rule text
    rule_text = re.sub(r'\s+', ' ', rule_text)  # Remove extra spaces
    rule_text = re.sub(r'\bAND\b', '', rule_text)  # Remove 'AND' (word boundary)

    # Tokenize the rule text and remove stopwords
    rule_tokens = word_tokenize(rule_text)
    filtered_rule_tokens = [word.lower() for word in rule_tokens if word.lower() not in stop_words]

    return set(filtered_rule_tokens)

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence.lower())
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_main_words = extract_main_words(rule_text.lower())
        #print("rule_main_words", rule_main_words)
        # Calculate Jaccard similarity
        intersection = len(set(sentence_tokens).intersection(rule_main_words))
        union = len(set(sentence_tokens).union(rule_main_words))
        jaccard_similarity = intersection / union
        
        # If the similarity is above a threshold (e.g., 0.5), consider it a match
        if jaccard_similarity > 0.06:
            if jaccard_similarity > 0.2:
                if rule_name in matched_sentences:
                    matched_sentences[rule_name].append(sentence)
                else:
                    matched_sentences[rule_name] = [sentence]
            else:
                if rule_name in partial_matches:
                    partial_matches[rule_name].append(sentence)
                else:
                    partial_matches[rule_name] = [sentence]

# Print exact rule matches
print("Exact Rule Matches:")
for rule_name, matched_sentence_list in matched_sentences.items():
    print(f"Rule '{rule_name}' Matches:")
    for sentence in matched_sentence_list:
        print(f"- {sentence.strip()}")
    print()

# Print partial rule matches
print("Partial Rule Matches:")
for rule_name, partial_match_sentence_list in partial_matches.items():
    print(f"Rule '{rule_name}' Partial Matches:")
    for sentence in partial_match_sentence_list:
        print(f"- {sentence.strip()}")
    print()


Exact Rule Matches:
Rule 'Rule 1' Matches:
- Education is 'Low' , 'Medium' and Gender is male

Partial Rule Matches:
Rule 'Rule 2' Partial Matches:
- Education is 'Low' , 'Medium' and Gender is male
- Education is High , Gender is Male

Rule 'Rule 1' Partial Matches:
- Education is High , Gender is Male



In [38]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male."""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text
text_sentences = text.split('.')  # Split the text into sentences based on periods

# Define a list of common stopwords to remove
stop_words = set(stopwords.words("english"))

# Define a function to preprocess and extract main words and operators from a rule
def extract_main_words(rule_text):
    # Remove extra spaces, 'AND', and punctuation
    rule_text = re.sub(r'\s+', ' ', rule_text)  # Remove extra spaces
    rule_text = re.sub(r'\bAND\b', '', rule_text)  # Remove 'AND' (word boundary)
    rule_text = re.sub(r'[^\w\s]', '', rule_text)  # Remove punctuation
    
    # Tokenize the rule text and remove stopwords
    rule_tokens = word_tokenize(rule_text)
    filtered_rule_tokens = [word.lower() for word in rule_tokens if word.lower() not in stop_words]
    
    # Extract main words and operators
    main_words = []
    for i, token in enumerate(filtered_rule_tokens):
        if token in ['is', '=', '==']:  # Consider 'is', '=', or '==' as operators
            if i > 0:
                main_words.append(filtered_rule_tokens[i - 1])
                main_words.append(token)
            if i < len(filtered_rule_tokens) - 1:
                main_words.append(filtered_rule_tokens[i + 1])
    
    return set(main_words)

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence.lower())
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_main_words = extract_main_words(rule_text.lower())
        
        # Calculate Jaccard similarity
        intersection = len(set(sentence_tokens).intersection(rule_main_words))
        union = len(set(sentence_tokens).union(rule_main_words))
        
        # Avoid division by zero
        jaccard_similarity = intersection / union if union != 0 else 0
        
        # If the similarity is above a threshold (e.g., 0.5), consider it a match
        if jaccard_similarity > 0.5:
            if jaccard_similarity > 0.3:
                if rule_name in matched_sentences:
                    matched_sentences[rule_name].append(sentence)
                else:
                    matched_sentences[rule_name] = [sentence]
            else:
                if rule_name in partial_matches:
                    partial_matches[rule_name].append(sentence)
                else:
                    partial_matches[rule_name] = [sentence]

# Print exact rule matches
print("Exact Rule Matches:")
for rule_name, matched_sentence_list in matched_sentences.items():
    print(f"Rule '{rule_name}' Matches:")
    for sentence in matched_sentence_list:
        print(f"- {sentence.strip()}")
    print()

# Print partial rule matches
print("Partial Rule Matches:")
for rule_name, partial_match_sentence_list in partial_matches.items():
    print(f"Rule '{rule_name}' Partial Matches:")
    for sentence in partial_match_sentence_list:
        print(f"- {sentence.strip()}")
    print()


Exact Rule Matches:
Rule 'Rule 1' Matches:
- Education is 'Low' , 'Medium' and Gender is male

Partial Rule Matches:
Rule 'Rule 2' Partial Matches:
- Education is 'Low' , 'Medium' and Gender is male
- Education is High , Gender is Male

Rule 'Rule 1' Partial Matches:
- Education is High , Gender is Male



In [40]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male."""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text
text_sentences = text.split('.')  # Split the text into sentences based on periods

# Define a list of common stopwords to remove
stop_words = set(stopwords.words("english"))

# Define a function to preprocess and extract main words and operators from a rule
def extract_main_words(rule_text):
    # Remove extra spaces, 'AND', and punctuation
    rule_text = re.sub(r'\s+', ' ', rule_text)  # Remove extra spaces
    rule_text = re.sub(r'\bAND\b', '', rule_text)  # Remove 'AND' (word boundary)
    rule_text = re.sub(r'[^\w\s]', '', rule_text)  # Remove punctuation
    
    # Tokenize the rule text and remove stopwords
    rule_tokens = word_tokenize(rule_text)
    filtered_rule_tokens = [word.lower() for word in rule_tokens if word.lower() not in stop_words]
    
    # Extract main words and operators
    main_words = []
    for i, token in enumerate(filtered_rule_tokens):
        if token in ['is', '=', '==']:  # Consider 'is', '=', or '==' as operators
            if i > 0:
                main_words.append(filtered_rule_tokens[i - 1])
                main_words.append(token)
            if i < len(filtered_rule_tokens) - 1:
                main_words.append(filtered_rule_tokens[i + 1])
    
    return set(main_words)

# Define a function to check if a rule matches a sentence
def rule_matches_sentence(rule_main_words, sentence_tokens):
    for i in range(len(sentence_tokens) - len(rule_main_words) + 1):
        window = sentence_tokens[i:i + len(rule_main_words)]
        if all(word in window for word in rule_main_words):
            return True
    return False

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence.lower())
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_main_words = extract_main_words(rule_text.lower())
        
        # Check if the rule matches the sentence
        if rule_matches_sentence(rule_main_words, sentence_tokens):
            if rule_name in matched_sentences:
                matched_sentences[rule_name].append(sentence)
            else:
                matched_sentences[rule_name] = [sentence]

# Print exact rule matches
print("Exact Rule Matches:")
for rule_name, matched_sentence_list in matched_sentences.items():
    print(f"Rule '{rule_name}' Matches:")
    for sentence in matched_sentence_list:
        print(f"- {sentence.strip()}")
    print()


Exact Rule Matches:
Rule 'Rule 1' Matches:
- Education is 'Low' , 'Medium' and Gender is male
- Education is 'Low' , 'Medium' and Gender is male
- Appendix
- 
- 
- Age is 1 2 4 6 89
- XYZ ltd
- Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines
- We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely
- The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier
- Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools
- Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to

In [41]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male."""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text into sentences
text_sentences = sent_tokenize(text)

# Define a list of common stopwords to remove
stop_words = set(stopwords.words("english"))

# Define a function to preprocess and extract main words and operators from a rule
def extract_main_words(rule_text):
    # Remove extra spaces, 'AND', and punctuation
    rule_text = re.sub(r'\s+', ' ', rule_text)  # Remove extra spaces
    rule_text = re.sub(r'\bAND\b', '', rule_text)  # Remove 'AND' (word boundary)
    rule_text = re.sub(r'[^\w\s]', '', rule_text)  # Remove punctuation
    
    # Tokenize the rule text and remove stopwords
    rule_tokens = word_tokenize(rule_text)
    filtered_rule_tokens = [word.lower() for word in rule_tokens if word.lower() not in stop_words]
    
    # Extract main words and operators
    main_words = []
    for i, token in enumerate(filtered_rule_tokens):
        if token in ['is', '=', '==']:  # Consider 'is', '=', or '==' as operators
            if i > 0:
                main_words.append(filtered_rule_tokens[i - 1])
                main_words.append(token)
            if i < len(filtered_rule_tokens) - 1:
                main_words.append(filtered_rule_tokens[i + 1])
    
    return set(main_words)

# Define a function to check if a rule matches a sentence
def rule_matches_sentence(rule_main_words, sentence_tokens):
    for i in range(len(sentence_tokens) - len(rule_main_words) + 1):
        window = sentence_tokens[i:i + len(rule_main_words)]
        if all(word in window for word in rule_main_words):
            return True
    return False

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence.lower())
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_main_words = extract_main_words(rule_text.lower())
        
        # Check if the rule matches the sentence
        if rule_matches_sentence(rule_main_words, sentence_tokens):
            print(f"Sentence: {sentence.strip()}")  # Print the matching sentence
            print(f"Rule '{rule_name}' Context: {rule_text}")  # Print the matching rule context
            print()


Sentence: Education is 'Low' , 'Medium' and Gender is male.
Rule 'Rule 1' Context: (Education == 'High') AND (Gender == 'Male')

Sentence: Education is 'Low' , 'Medium' and Gender is male.
Rule 'Rule 2' Context: (Education == 'High') AND (Age > 0) AND (Income > 66500.0)

Sentence: Appendix...
Age is 1 2 4 6 89.
Rule 'Rule 1' Context: (Education == 'High') AND (Gender == 'Male')

Sentence: Appendix...
Age is 1 2 4 6 89.
Rule 'Rule 2' Context: (Education == 'High') AND (Age > 0) AND (Income > 66500.0)

Sentence: XYZ ltd.
Rule 'Rule 1' Context: (Education == 'High') AND (Gender == 'Male')

Sentence: XYZ ltd.
Rule 'Rule 2' Context: (Education == 'High') AND (Age > 0) AND (Income > 66500.0)

Sentence: Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines.
Rule 'Rule 1' Context: (

In [45]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
import re

# Define the input text
text = """Education is 'Low' , 'Medium' and Gender is male. Appendix...
Age is 1 2 4 6 89. XYZ ltd. Among the largest university presses in the world,
The MIT Press publishes over 200 new books each year along with 30 journals
in the arts and humanities, economics, international affairs, history, political
science, science and technology along with other disciplines. We were among
the first university presses to offer titles electronically and we continue to
adopt technologies that allow us to better support the scholarly mission and
disseminate our content widely. The Press's enthusiasm for innovation is
reflected in our continuing exploration of this frontier. Since the late 1960s,
we have experimented with generation after generation of electronic publishing tools.
Through our commitment to new products—whether digital journals or entirely
new forms of communication—we have continued to look for the most efficient
and effective means to serve our readership. Our readers have come to expect
excellence from our products, and they can count on us to maintain a commitment
to producing rigorous and innovative information products in whatever forms
the future of publishing may bring. Education is High , Gender is Male."""

# Define the list of anomaly rules
Anomaly_rule_list = {
    "Rule 1": "(Education == 'High') AND (Gender == 'Male')",
    "Rule 2": "(Education == 'High') AND (Age > 0) AND (Income > 66500.0)",
}

# Tokenize the input text into sentences
text_sentences = sent_tokenize(text)

# Define a list of common stopwords to remove
stop_words = set(stopwords.words("english"))

# Define a function to preprocess and extract main words and operators from a rule
def extract_main_words(rule_text):
    # Remove extra spaces, 'AND', and punctuation
    rule_text = re.sub(r'\s+', ' ', rule_text)  # Remove extra spaces
    rule_text = re.sub(r'\bAND\b', '', rule_text)  # Remove 'AND' (word boundary)
    rule_text = re.sub(r'[^\w\s]', '', rule_text)  # Remove punctuation
    
    # Tokenize the rule text and remove stopwords
    rule_tokens = word_tokenize(rule_text)
    filtered_rule_tokens = [word.lower() for word in rule_tokens if word.lower() not in stop_words]
    
    # Extract main words and operators
    main_words = []
    for i, token in enumerate(filtered_rule_tokens):
        if token in ['is', '=', '==']:  # Consider 'is', '=', or '==' as operators
            if i > 0:
                main_words.append(filtered_rule_tokens[i - 1])
                main_words.append(token)
            if i < len(filtered_rule_tokens) - 1:
                main_words.append(filtered_rule_tokens[i + 1])
    
    return set(main_words)

# Define a function to calculate Jaccard similarity
def calculate_jaccard_similarity(rule_main_words, sentence_tokens):
    intersection = len(set(sentence_tokens).intersection(rule_main_words))
    union = len(set(sentence_tokens).union(rule_main_words))
    
    # Avoid division by zero
    jaccard_similarity = intersection / union if union != 0 else 0
    
    return jaccard_similarity

# Define a function to check if a rule matches a sentence
def rule_matches_sentence(rule_main_words, sentence_tokens, min_jaccard_similarity=0.5):
    for i in range(len(sentence_tokens) - len(rule_main_words) + 1):
        window = sentence_tokens[i:i + len(rule_main_words)]
        if all(word in window for word in rule_main_words):
            jaccard_similarity = calculate_jaccard_similarity(rule_main_words, sentence_tokens)
            if jaccard_similarity >= min_jaccard_similarity:
                return True, jaccard_similarity
    return False, 0

# Define the minimum Jaccard similarity threshold
min_jaccard_similarity_threshold = 0.01 # Adjust as needed

# Iterate through sentences and check if they match any rule
for sentence in text_sentences:
    sentence_tokens = word_tokenize(sentence.lower())
    
    for rule_name, rule_text in Anomaly_rule_list.items():
        rule_main_words = extract_main_words(rule_text.lower())
        print("rule_main_words", rule_main_words)
        # Check if the rule matches the sentence with the specified Jaccard similarity threshold
        matched, jaccard_similarity = rule_matches_sentence(rule_main_words, sentence_tokens, min_jaccard_similarity_threshold)
        if matched:
            print(f"Sentence: {sentence.strip()}")  # Print the matching sentence
            print(f"Rule '{rule_name}' Context: {rule_text}")  # Print the matching rule context
            print(f"Jaccard Similarity: {jaccard_similarity:.2f}")  # Print Jaccard similarity score
            print()


rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
rule_main_words set()
