In [12]:
import itertools
import random
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

dataset_size = 200000  # Total number of observations in the dataset
fraud_rate = 0.01  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'age': random.randint(18, 65),
        'transaction_count': random.randint(1, 100),
        'address_change30day': random.randint(0, 10),
        'var2': random.randint(1.0, 10.0),
        'var3': random.randint(10.0, 100.0),
        'var4': random.randint(100.0, 1000.0),
        'var5': random.randint(1, 30),
        'var6': random.randint(100, 500),
        'var7': random.randint(18, 65),
        'var8': random.randint(10.0, 100.0),
        'var9': random.randint(50, 300),
        'var10': random.randint(10, 100),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

# Train a decision tree classifier on the dataset
X = df.drop('fraud', axis=1)
y = df['fraud']
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# Extract the important variables from the decision tree model
importance_scores = model.feature_importances_
important_variables = X.columns[importance_scores>0.05]
important_variables

Index(['amount', 'last_amount_60day', 'score', 'age', 'transaction_count',
       'var3', 'var4', 'var5', 'var6', 'var7', 'var8', 'var9', 'var10'],
      dtype='object')

In [13]:

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        if var == 'score':
            rule_parts.append(f"{var} > 500")
        else:
            if variables[var]['value'] is not None:
                rule_parts.append(f"{var} > {variables[var]['value']}")
            else:
                rule_parts.append(f"{var} is not None")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = list(important_variables)  # Use the important variables obtained from the decision tree model

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': data[var]} for var in combo}
            rule = create_strategy_rule(strategy_variables)
            if data['fraud'] == 1:
                fraud_count += 1
            fraud_percentage = (fraud_count / num_fraud_cases) * 100
            rules_with_fraud_counts.append((rule, fraud_percentage))

# Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured", fraud_percentage)
    print()


Rule: amount > 202 and last_amount_60day > 96 and score > 500 and age > 61 and transaction_count > 64 and var3 > 70 and var4 > 808 and var5 > 20 and var6 > 122 and var7 > 50 and var8 > 88 and var9 > 75 and var10 > 96
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 173 and last_amount_60day > 80 and score > 500 and age > 65 and transaction_count > 63 and var3 > 33 and var4 > 587 and var5 > 22 and var6 > 446 and var7 > 53 and var8 > 97 and var9 > 191 and var10 > 34
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 297 and last_amount_60day > 69 and score > 500 and age > 53 and transaction_count > 33 and var3 > 77 and var4 > 723 and var5 > 25 and var6 > 143 and var7 > 61 and var8 > 55 and var9 > 182 and var10 > 16
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 91 and last_amount_60day > 43 and score > 500 and age > 58 and transaction_count > 55 and var3 > 54 and var4 > 114 and var5 > 9 and var6 > 385 and var7 > 48 and 

In [4]:
import itertools
import random
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 

df = pd.DataFrame(dataset)

# Train a decision tree classifier on the dataset
X = df.drop('fraud', axis=1)
y = df['fraud']
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# Extract the important variables from the decision tree model
importance_scores = model.feature_importances_
important_variables = X.columns[importance_scores > 0]

important_variables

Index(['amount', 'last_amount_60day', 'address_change60day', 'score'], dtype='object')

In [2]:
import itertools
import random
import pandas as pd

dataset_size = 200000  # Total number of observations in the dataset
fraud_rate = 0.01  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        if var == 'score':
            rule_parts.append(f"{var} > 500")
        else:
            if variables[var]['value'] is not None:
                rule_parts.append(f"{var} > {variables[var]['value']}")
            else:
                rule_parts.append(f"{var} is not None")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
#rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        rules_with_fraud_counts = []
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': data[var]} for var in combo}
            rule = create_strategy_rule(strategy_variables)
            #print("rule", rule)
            if data['fraud'] == 1:
                fraud_count += 1
            rule_fraud_captured = (str(rule) + "|" + str(fraud_count))
            #print("rule_fraud_captured", rule_fraud_captured)
            fraud_percentage = (fraud_count / num_fraud_cases) * 100
            rules_with_fraud_counts.append((rule, fraud_percentage))
#print("rules_with_fraud_counts",rules_with_fraud_counts)


#Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured", fraud_percentage)
    print()


Rule: amount > 27 and last_amount_60day > 74 and address_change60day > 1 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 268 and last_amount_60day > 128 and address_change60day > 0 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 68 and last_amount_60day > 134 and address_change60day > 2 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 464 and last_amount_60day > 96 and address_change60day > 3 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 282 and last_amount_60day > 134 and address_change60day > 0 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 14 and last_amount_60day > 110 and address_change60day > 2 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 186 and last_amount_60day > 186 and address_change60day > 2 and score > 500
Total frauds captured: 200