In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
import itertools
import re

# Create the dataframe with more records and additional variables
data = {
    'Age': [25, 30, 22, 35, 28, 40, 45, 27, 33, 29] * 5,
    'Income': [50000, 60000, 45000, 70000, 55000, 80000, 90000, 52000, 65000, 59000] * 5,
    'Education': ['High', 'Low', 'Medium', 'Medium', 'High', 'Low', 'Medium', 'High', 'Low', 'Medium'] * 5,
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'] * 5,
    'Target': [0, 1, 0, 1, 1, 0, 1, 0, 1, 1] * 5
}
df = pd.DataFrame(data)

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['Education', 'Gender'], drop_first=True)

# Fit a decision tree model
X = df.drop('Target', axis=1)
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

# Generate text-based rules from the decision tree
tree_rules = export_text(tree_model, feature_names=X.columns.tolist())

# Define combinations of variables
variable_combinations = list(itertools.combinations(X.columns, 2))

# Extract complex rule conditions based on variable combinations
complex_rule_conditions = []
for line in tree_rules.split('\n'):
    if line.startswith('|---') and 'class:' not in line:
        for var1, var2 in variable_combinations:
            condition = line.replace('|--- ', '').strip()  # Modify condition format
            condition = re.sub(r'([A-Za-z_]+)', r'row["\1"]', condition)  # Replace feature names
            condition = condition.replace(f'row["{var1}"]', var1).replace(f'row["{var2}"]', var2)
            complex_rule_conditions.append(condition)

# Custom function to evaluate conditions row-wise
def evaluate_condition(row, condition):
    try:
        return int(eval(condition, {'__builtins__': None, 'row': row}))
    except:
        return 0

# Create binary features based on the complex rule conditions
for idx, condition in enumerate(complex_rule_conditions):
    feature_name = f'complex_rule_{idx}'
    df[feature_name] = df.apply(lambda row, condition=condition: evaluate_condition(row, condition), axis=1)

# Split the dataset for anomaly detection
X = df.drop('Target', axis=1)
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use X_train for training your anomaly detection model
# Use X_test for evaluating the model's performance


In [2]:
X_train.head()

Unnamed: 0,Age,Income,Education_Low,Education_Medium,Gender_Male,complex_rule_0,complex_rule_1,complex_rule_2,complex_rule_3,complex_rule_4,...,complex_rule_10,complex_rule_11,complex_rule_12,complex_rule_13,complex_rule_14,complex_rule_15,complex_rule_16,complex_rule_17,complex_rule_18,complex_rule_19
12,22,45000,0,1,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
4,28,55000,0,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1
37,27,52000,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8,33,65000,1,0,1,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1
3,35,70000,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,1,1,1,1


In [5]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Fit the Isolation Forest model
isolation_forest_model = IsolationForest(contamination=0.1, random_state=42)
isolation_forest_model.fit(X_train)

# Predict anomalies
anomaly_scores = isolation_forest_model.decision_function(X)

# Add 'anomaly' score column to the original dataset
df['anomaly_score'] = anomaly_scores

# Set a threshold for anomaly classification
anomaly_threshold = -0.2  # Adjust this threshold as needed

# Add 'anomaly' column as True or False
df['anomaly'] = df['anomaly_score'] < anomaly_threshold

# Display the dataset with 'anomaly' column
print(df)

# Display the dataset with 'anomaly' scores
#print(df)



    Age  Income  Target  Education_Low  Education_Medium  Gender_Male  \
0    25   50000       0              0                 0            1   
1    30   60000       1              1                 0            0   
2    22   45000       0              0                 1            1   
3    35   70000       1              0                 1            0   
4    28   55000       1              0                 0            1   
5    40   80000       0              1                 0            0   
6    45   90000       1              0                 1            1   
7    27   52000       0              0                 0            0   
8    33   65000       1              1                 0            1   
9    29   59000       1              0                 1            0   
10   25   50000       0              0                 0            1   
11   30   60000       1              1                 0            0   
12   22   45000       0              0             

In [7]:
df['Target'].value_counts()

1    30
0    20
Name: Target, dtype: int64

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text
import itertools
import re

# Create the dataframe with more records and additional variables
import pandas as pd
import numpy as np

# Create a new dataset with 1000 records
np.random.seed(42)  # For reproducibility
n_records = 1000

age = np.random.randint(20, 60, size=n_records)
income = np.random.randint(30000, 100000, size=n_records)
education = np.random.choice(['High', 'Medium', 'Low'], size=n_records)
gender = np.random.choice(['Male', 'Female'], size=n_records)
target = np.random.choice([0, 1], size=n_records, p=[0.9, 0.1])  # 3% anomalies

data = {
    'Age': age,
    'Income': income,
    'Education': education,
    'Gender': gender,
    'Target': target
}

df = pd.DataFrame(data)

# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['Education', 'Gender'], drop_first=True)

# Display the new dataset
print(df)


# Fit a decision tree model
X = df.drop('Target', axis=1)
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

# Generate text-based rules from the decision tree
tree_rules = export_text(tree_model, feature_names=X.columns.tolist())

# Define combinations of variables
variable_combinations = list(itertools.combinations(X.columns, 2))

# Extract complex rule conditions based on variable combinations
complex_rule_conditions = []
for line in tree_rules.split('\n'):
    if line.startswith('|---') and 'class:' not in line:
        for var1, var2 in variable_combinations:
            condition = line.replace('|--- ', '').strip()  # Modify condition format
            condition = re.sub(r'([A-Za-z_]+)', r'row["\1"]', condition)  # Replace feature names
            condition = condition.replace(f'row["{var1}"]', var1).replace(f'row["{var2}"]', var2)
            complex_rule_conditions.append(condition)

# Custom function to evaluate conditions row-wise
def evaluate_condition(row, condition):
    try:
        return int(eval(condition, {'__builtins__': None, 'row': row}))
    except:
        return 0

# Create binary features based on the complex rule conditions
for idx, condition in enumerate(complex_rule_conditions):
    feature_name = f'complex_rule_{idx}'
    df[feature_name] = df.apply(lambda row, condition=condition: evaluate_condition(row, condition), axis=1)

# Split the dataset for anomaly detection
X = df.drop('Target', axis=1)
y = df['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use X_train for training your anomaly detection model
# Use X_test for evaluating the model's performance


     Age  Income  Target  Education_Low  Education_Medium  Gender_Male
0     58   74327       0              1                 0            0
1     48   98904       0              0                 0            1
2     34   33797       0              1                 0            0
3     27   77882       0              1                 0            1
4     40   43718       0              0                 0            1
..   ...     ...     ...            ...               ...          ...
995   27   77076       0              0                 0            0
996   47   83420       0              0                 0            0
997   50   30671       0              0                 0            0
998   28   30163       0              0                 0            1
999   48   71424       0              0                 0            1

[1000 rows x 6 columns]


In [17]:
X_train.head()

Unnamed: 0,Age,Income,Education_Low,Education_Medium,Gender_Male,complex_rule_0,complex_rule_1,complex_rule_2,complex_rule_3,complex_rule_4,...,complex_rule_10,complex_rule_11,complex_rule_12,complex_rule_13,complex_rule_14,complex_rule_15,complex_rule_16,complex_rule_17,complex_rule_18,complex_rule_19
29,34,66368,0,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
535,57,92768,0,0,0,0,0,0,0,0,...,0,1,1,1,0,0,0,1,1,1
695,57,49508,0,1,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
557,39,66877,0,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
836,34,46456,0,1,1,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Fit the Isolation Forest model
isolation_forest_model = IsolationForest(contamination=0.1, random_state=42)
isolation_forest_model.fit(X_train)

# Predict anomalies
anomaly_scores = isolation_forest_model.decision_function(X)

# Add 'anomaly' score column to the original dataset
df['anomaly_score'] = anomaly_scores

# Set a threshold for anomaly classification
anomaly_threshold = -0.2  # Adjust this threshold as needed

# Add 'anomaly' column as True or False
df['anomaly'] = df['anomaly_score'] < anomaly_threshold

# Display the dataset with 'anomaly' column
print(df)

# Display the dataset with 'anomaly' scores
#print(df)



     Age  Income  Target  Education_Low  Education_Medium  Gender_Male  \
0     58   74327       0              1                 0            0   
1     48   98904       0              0                 0            1   
2     34   33797       0              1                 0            0   
3     27   77882       0              1                 0            1   
4     40   43718       0              0                 0            1   
..   ...     ...     ...            ...               ...          ...   
995   27   77076       0              0                 0            0   
996   47   83420       0              0                 0            0   
997   50   30671       0              0                 0            0   
998   28   30163       0              0                 0            1   
999   48   71424       0              0                 0            1   

     complex_rule_0  complex_rule_1  complex_rule_2  complex_rule_3  ...  \
0                 0               1

In [19]:
df['Target'].value_counts()

0    911
1     89
Name: Target, dtype: int64

In [20]:
df[(df['anomaly'] == True) & (df['Target']==1)]

Unnamed: 0,Age,Income,Target,Education_Low,Education_Medium,Gender_Male,complex_rule_0,complex_rule_1,complex_rule_2,complex_rule_3,...,complex_rule_12,complex_rule_13,complex_rule_14,complex_rule_15,complex_rule_16,complex_rule_17,complex_rule_18,complex_rule_19,anomaly_score,anomaly


In [21]:
df['anomaly'].value_counts()

False    1000
Name: anomaly, dtype: int64

In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Sample dataset
data = pd.DataFrame({
    'maturity_value': [100, 50, 10, 500, 10, 500, 20, 0],
    'category': ['S.A', 'D.A', 'J.K', 'J.K', 'S.A', 'I.T', 'D.A', 'J.K'],
    'prod_category': ['S.A', 'D.A', 'R.Y', 'S.A', 'R.Y', 'D.A', 'R.Y', 'I.T'],
    'amount': [100, 150, 100, 200, 50, 300, 120, 0]
})

# Generate 2000 records
n_records = 2000
random_indices = np.random.choice(data.index, size=n_records, replace=True)
anomaly_probabilities = np.random.uniform(0, 1, size=n_records)
response_variable = [1 if p < 0.1 else 0 for p in anomaly_probabilities]  # 3% anomalies

anomaly_data = data.iloc[random_indices].copy()
anomaly_data['response'] = response_variable

# Use LabelEncoder for categorical variables
label_encoder = LabelEncoder()

categorical_columns = ['category', 'prod_category']
for column in categorical_columns:
    anomaly_data[column] = label_encoder.fit_transform(anomaly_data[column])

# Display the new anomaly dataset with encoded categorical variables
print(anomaly_data)


    maturity_value  category  prod_category  amount  response
0              100         3              3     100         1
0              100         3              3     100         0
5              500         1              0     300         0
3              500         2              3     200         0
6               20         0              2     120         1
..             ...       ...            ...     ...       ...
4               10         3              2      50         0
2               10         2              2     100         0
4               10         3              2      50         0
2               10         2              2     100         0
1               50         0              0     150         0

[2000 rows x 5 columns]


In [34]:
df = anomaly_data

In [35]:
# Fit a decision tree model
X = df.drop('response', axis=1)
y = df['response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

# Generate text-based rules from the decision tree
tree_rules = export_text(tree_model, feature_names=X.columns.tolist())

# Define combinations of variables
variable_combinations = list(itertools.combinations(X.columns, 2))

# Extract complex rule conditions based on variable combinations
complex_rule_conditions = []
for line in tree_rules.split('\n'):
    if line.startswith('|---') and 'class:' not in line:
        for var1, var2 in variable_combinations:
            condition = line.replace('|--- ', '').strip()  # Modify condition format
            condition = re.sub(r'([A-Za-z_]+)', r'row["\1"]', condition)  # Replace feature names
            condition = condition.replace(f'row["{var1}"]', var1).replace(f'row["{var2}"]', var2)
            complex_rule_conditions.append(condition)

# Custom function to evaluate conditions row-wise
def evaluate_condition(row, condition):
    try:
        return int(eval(condition, {'__builtins__': None, 'row': row}))
    except:
        return 0

# Create binary features based on the complex rule conditions
for idx, condition in enumerate(complex_rule_conditions):
    feature_name = f'complex_rule_{idx}'
    df[feature_name] = df.apply(lambda row, condition=condition: evaluate_condition(row, condition), axis=1)

# Split the dataset for anomaly detection
X = df.drop('response', axis=1)
y = df['response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use X_train for training your anomaly detection model
# Use X_test for evaluating the model's performance


In [36]:
X_train.head()

Unnamed: 0,maturity_value,category,prod_category,amount,complex_rule_0,complex_rule_1,complex_rule_2,complex_rule_3,complex_rule_4,complex_rule_5,complex_rule_6,complex_rule_7,complex_rule_8,complex_rule_9,complex_rule_10,complex_rule_11
4,10,3,2,50,0,0,0,0,0,0,0,1,1,0,0,1
3,500,2,3,200,0,0,0,0,0,0,0,1,1,0,0,1
2,10,2,2,100,0,0,0,0,0,0,0,1,1,0,0,1
2,10,2,2,100,0,0,0,0,0,0,0,1,1,0,0,1
4,10,3,2,50,0,0,0,0,0,0,0,1,1,0,0,1


In [37]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Fit the Isolation Forest model
isolation_forest_model = IsolationForest(contamination=0.1, random_state=42)
isolation_forest_model.fit(X_train)

# Predict anomalies
anomaly_scores = isolation_forest_model.decision_function(X)

# Add 'anomaly' score column to the original dataset
df['anomaly_score'] = anomaly_scores

# Set a threshold for anomaly classification
anomaly_threshold = -0.0  # Adjust this threshold as needed

# Add 'anomaly' column as True or False
df['anomaly'] = df['anomaly_score'] < anomaly_threshold

# Display the dataset with 'anomaly' column
print(df)

# Display the dataset with 'anomaly' scores
#print(df)



    maturity_value  category  prod_category  amount  response  complex_rule_0  \
0              100         3              3     100         1               0   
0              100         3              3     100         0               0   
5              500         1              0     300         0               0   
3              500         2              3     200         0               0   
6               20         0              2     120         1               0   
..             ...       ...            ...     ...       ...             ...   
4               10         3              2      50         0               0   
2               10         2              2     100         0               0   
4               10         3              2      50         0               0   
2               10         2              2     100         0               0   
1               50         0              0     150         0               0   

    complex_rule_1  complex

In [38]:
df['anomaly'].value_counts()

False    2000
Name: anomaly, dtype: int64

In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Create a new dataset with anomalies
n_records = 2000

# Create random data
data = {
    'feature_1': np.random.normal(0, 1, n_records),
    'feature_2': np.random.uniform(0, 1, n_records),
    'feature_3': np.random.choice(['A', 'B', 'C'], size=n_records),
    'feature_4': np.random.randint(1, 11, n_records)
}

# Introduce anomalies in the dataset
anomaly_indices = np.random.choice(np.arange(n_records), size=int(0.1 * n_records), replace=False)  # 3% anomalies
data['feature_1'][anomaly_indices] = np.random.normal(10, 1, len(anomaly_indices))
data['feature_2'][anomaly_indices] = np.random.uniform(5, 10, len(anomaly_indices))
data['feature_3'][anomaly_indices] = 'D'
data['feature_4'][anomaly_indices] = np.random.randint(20, 30, len(anomaly_indices))

# Create a DataFrame
anomaly_df = pd.DataFrame(data)

# Use LabelEncoder for categorical variable
label_encoder = LabelEncoder()
anomaly_df['feature_3'] = label_encoder.fit_transform(anomaly_df['feature_3'])

# Create the target variable (response)
anomaly_df['anomaly'] = 0
anomaly_df['anomaly'].iloc[anomaly_indices] = 1

# Display the anomaly dataset
print(anomaly_df)


      feature_1  feature_2  feature_3  feature_4  anomaly
0     -1.175870   0.852645          2          9        0
1      0.345086   0.993827          0          7        0
2     -0.341503   0.455026          0         10        0
3      1.139486   0.511764          2          3        0
4      9.480229   7.709887          3         22        1
...         ...        ...        ...        ...      ...
1995   0.145909   0.403033          0          6        0
1996   0.920770   0.694385          1          7        0
1997  10.125692   8.750591          3         25        1
1998  -0.787343   0.036730          0          7        0
1999   8.906708   7.567314          3         20        1

[2000 rows x 5 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  anomaly_df['anomaly'].iloc[anomaly_indices] = 1


In [53]:
df = anomaly_df
df.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,anomaly
0,-1.17587,0.852645,2,9,0
1,0.345086,0.993827,0,7,0
2,-0.341503,0.455026,0,10,0
3,1.139486,0.511764,2,3,0
4,9.480229,7.709887,3,22,1


In [54]:
df['anomaly'].value_counts()

0    1800
1     200
Name: anomaly, dtype: int64

In [55]:
# Fit a decision tree model
X = df.drop('anomaly', axis=1)
y = df['anomaly']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)

# Generate text-based rules from the decision tree
tree_rules = export_text(tree_model, feature_names=X.columns.tolist())

# Define combinations of variables
variable_combinations = list(itertools.combinations(X.columns, 2))

# Extract complex rule conditions based on variable combinations
complex_rule_conditions = []
for line in tree_rules.split('\n'):
    if line.startswith('|---') and 'class:' not in line:
        for var1, var2 in variable_combinations:
            condition = line.replace('|--- ', '').strip()  # Modify condition format
            condition = re.sub(r'([A-Za-z_]+)', r'row["\1"]', condition)  # Replace feature names
            condition = condition.replace(f'row["{var1}"]', var1).replace(f'row["{var2}"]', var2)
            complex_rule_conditions.append(condition)

# Custom function to evaluate conditions row-wise
def evaluate_condition(row, condition):
    try:
        return int(eval(condition, {'__builtins__': None, 'row': row}))
    except:
        return 0

# Create binary features based on the complex rule conditions
for idx, condition in enumerate(complex_rule_conditions):
    feature_name = f'complex_rule_{idx}'
    df[feature_name] = df.apply(lambda row, condition=condition: evaluate_condition(row, condition), axis=1)

# Split the dataset for anomaly detection
X = df.drop('anomaly', axis=1)
y = df['anomaly']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Use X_train for training your anomaly detection model
# Use X_test for evaluating the model's performance


In [56]:
X_train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,complex_rule_0,complex_rule_1,complex_rule_2,complex_rule_3,complex_rule_4,complex_rule_5,complex_rule_6,complex_rule_7,complex_rule_8,complex_rule_9,complex_rule_10,complex_rule_11
968,-0.990089,0.648779,0,6,0,0,0,0,0,0,0,0,0,0,0,0
240,1.137692,0.211066,2,7,0,0,0,0,0,0,0,0,0,0,0,0
819,0.33514,0.862149,2,2,0,0,0,0,0,0,0,0,0,0,0,0
692,1.886712,0.830365,0,2,0,0,0,0,0,0,0,0,0,0,0,0
420,0.852342,0.30796,1,1,0,0,0,0,0,0,0,0,0,0,0,0


In [57]:
import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Fit the Isolation Forest model
isolation_forest_model = IsolationForest(contamination=0.1, random_state=42)
isolation_forest_model.fit(X_train)

# Predict anomalies
anomaly_scores = isolation_forest_model.decision_function(X)

# Add 'anomaly' score column to the original dataset
df['anomaly_score'] = anomaly_scores

# Set a threshold for anomaly classification
anomaly_threshold = -0.0  # Adjust this threshold as needed

# Add 'anomaly' column as True or False
df['anomaly_new'] = df['anomaly_score'] < anomaly_threshold

# Display the dataset with 'anomaly' column
print(df)

# Display the dataset with 'anomaly' scores
#print(df)



      feature_1  feature_2  feature_3  feature_4  anomaly  complex_rule_0  \
0     -1.175870   0.852645          2          9        0               0   
1      0.345086   0.993827          0          7        0               0   
2     -0.341503   0.455026          0         10        0               0   
3      1.139486   0.511764          2          3        0               0   
4      9.480229   7.709887          3         22        1               0   
...         ...        ...        ...        ...      ...             ...   
1995   0.145909   0.403033          0          6        0               0   
1996   0.920770   0.694385          1          7        0               0   
1997  10.125692   8.750591          3         25        1               0   
1998  -0.787343   0.036730          0          7        0               0   
1999   8.906708   7.567314          3         20        1               0   

      complex_rule_1  complex_rule_2  complex_rule_3  complex_rule_4  \
0  

In [58]:
df['anomaly'].value_counts()

0    1800
1     200
Name: anomaly, dtype: int64

In [60]:
df[(df['anomaly_new'] == True) & (df['anomaly']==1)].shape

(194, 19)