In [12]:
import itertools
import random
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

dataset_size = 200000  # Total number of observations in the dataset
fraud_rate = 0.01  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'age': random.randint(18, 65),
        'transaction_count': random.randint(1, 100),
        'address_change30day': random.randint(0, 10),
        'var2': random.randint(1.0, 10.0),
        'var3': random.randint(10.0, 100.0),
        'var4': random.randint(100.0, 1000.0),
        'var5': random.randint(1, 30),
        'var6': random.randint(100, 500),
        'var7': random.randint(18, 65),
        'var8': random.randint(10.0, 100.0),
        'var9': random.randint(50, 300),
        'var10': random.randint(10, 100),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

# Train a decision tree classifier on the dataset
X = df.drop('fraud', axis=1)
y = df['fraud']
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# Extract the important variables from the decision tree model
importance_scores = model.feature_importances_
important_variables = X.columns[importance_scores>0.05]
important_variables

Index(['amount', 'last_amount_60day', 'score', 'age', 'transaction_count',
       'var3', 'var4', 'var5', 'var6', 'var7', 'var8', 'var9', 'var10'],
      dtype='object')

In [13]:

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        if var == 'score':
            rule_parts.append(f"{var} > 500")
        else:
            if variables[var]['value'] is not None:
                rule_parts.append(f"{var} > {variables[var]['value']}")
            else:
                rule_parts.append(f"{var} is not None")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = list(important_variables)  # Use the important variables obtained from the decision tree model

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': data[var]} for var in combo}
            rule = create_strategy_rule(strategy_variables)
            if data['fraud'] == 1:
                fraud_count += 1
            fraud_percentage = (fraud_count / num_fraud_cases) * 100
            rules_with_fraud_counts.append((rule, fraud_percentage))

# Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured", fraud_percentage)
    print()


Rule: amount > 202 and last_amount_60day > 96 and score > 500 and age > 61 and transaction_count > 64 and var3 > 70 and var4 > 808 and var5 > 20 and var6 > 122 and var7 > 50 and var8 > 88 and var9 > 75 and var10 > 96
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 173 and last_amount_60day > 80 and score > 500 and age > 65 and transaction_count > 63 and var3 > 33 and var4 > 587 and var5 > 22 and var6 > 446 and var7 > 53 and var8 > 97 and var9 > 191 and var10 > 34
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 297 and last_amount_60day > 69 and score > 500 and age > 53 and transaction_count > 33 and var3 > 77 and var4 > 723 and var5 > 25 and var6 > 143 and var7 > 61 and var8 > 55 and var9 > 182 and var10 > 16
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 91 and last_amount_60day > 43 and score > 500 and age > 58 and transaction_count > 55 and var3 > 54 and var4 > 114 and var5 > 9 and var6 > 385 and var7 > 48 and 

In [4]:
import itertools
import random
import pandas as pd
from sklearn.tree import DecisionTreeClassifier 

df = pd.DataFrame(dataset)

# Train a decision tree classifier on the dataset
X = df.drop('fraud', axis=1)
y = df['fraud']
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# Extract the important variables from the decision tree model
importance_scores = model.feature_importances_
important_variables = X.columns[importance_scores > 0]

important_variables

Index(['amount', 'last_amount_60day', 'address_change60day', 'score'], dtype='object')

In [2]:
import itertools
import random
import pandas as pd

dataset_size = 200000  # Total number of observations in the dataset
fraud_rate = 0.01  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        if var == 'score':
            rule_parts.append(f"{var} > 500")
        else:
            if variables[var]['value'] is not None:
                rule_parts.append(f"{var} > {variables[var]['value']}")
            else:
                rule_parts.append(f"{var} is not None")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
#rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        rules_with_fraud_counts = []
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': data[var]} for var in combo}
            rule = create_strategy_rule(strategy_variables)
            #print("rule", rule)
            if data['fraud'] == 1:
                fraud_count += 1
            rule_fraud_captured = (str(rule) + "|" + str(fraud_count))
            #print("rule_fraud_captured", rule_fraud_captured)
            fraud_percentage = (fraud_count / num_fraud_cases) * 100
            rules_with_fraud_counts.append((rule, fraud_percentage))
#print("rules_with_fraud_counts",rules_with_fraud_counts)


#Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured", fraud_percentage)
    print()


Rule: amount > 27 and last_amount_60day > 74 and address_change60day > 1 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 268 and last_amount_60day > 128 and address_change60day > 0 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 68 and last_amount_60day > 134 and address_change60day > 2 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 464 and last_amount_60day > 96 and address_change60day > 3 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 282 and last_amount_60day > 134 and address_change60day > 0 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 14 and last_amount_60day > 110 and address_change60day > 2 and score > 500
Total frauds captured: 2000/2000
% fraud captured 100.0

Rule: amount > 186 and last_amount_60day > 186 and address_change60day > 2 and score > 500
Total frauds captured: 200

In [4]:
import itertools
import random
import pandas as pd

dataset_size = 200000  # Total number of observations in the dataset
fraud_rate = 0.03  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 1500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        if var == 'score':
            rule_parts.append(f"{var} > 500")
        elif var == 'amount':
            rule_parts.append(f"{var} > 500")
        else:
            if variables[var]['value'] is not None:
                rule_parts.append(f"{var} > {variables[var]['value']}")
            else:
                rule_parts.append(f"{var} is not None")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
#rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        rules_with_fraud_counts = []
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': data[var]} for var in combo}
            rule = create_strategy_rule(strategy_variables)
            #print("rule", rule)
            if data['fraud'] == 1:
                fraud_count += 1
            rule_fraud_captured = (str(rule) + "|" + str(fraud_count))
            #print("rule_fraud_captured", rule_fraud_captured)
            fraud_percentage = (fraud_count / num_fraud_cases) * 100
            rules_with_fraud_counts.append((rule, fraud_percentage))
#print("rules_with_fraud_counts",rules_with_fraud_counts)


#Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured", fraud_percentage)
    print()


Rule: amount > 500 and last_amount_60day > 22 and address_change60day > 1 and score > 500
Total frauds captured: 6000/6000
% fraud captured 100.0

Rule: amount > 500 and last_amount_60day > 88 and address_change60day > 0 and score > 500
Total frauds captured: 6000/6000
% fraud captured 100.0

Rule: amount > 500 and last_amount_60day > 59 and address_change60day > 3 and score > 500
Total frauds captured: 6000/6000
% fraud captured 100.0

Rule: amount > 500 and last_amount_60day > 37 and address_change60day > 1 and score > 500
Total frauds captured: 6000/6000
% fraud captured 100.0

Rule: amount > 500 and last_amount_60day > 42 and address_change60day > 2 and score > 500
Total frauds captured: 6000/6000
% fraud captured 100.0

Rule: amount > 500 and last_amount_60day > 51 and address_change60day > 3 and score > 500
Total frauds captured: 6000/6000
% fraud captured 100.0

Rule: amount > 500 and last_amount_60day > 144 and address_change60day > 2 and score > 500
Total frauds captured: 6000

In [6]:
import itertools
import random
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

dataset_size = 200000  # Total number of observations in the dataset
fraud_rate = 0.01  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'age': random.randint(18, 65),
        'transaction_count': random.randint(1, 100),
        'address_change30day': random.randint(0, 10),
        'var2': random.randint(1.0, 10.0),
        'var3': random.randint(10.0, 100.0),
        'var4': random.randint(100.0, 1000.0),
        'var5': random.randint(1, 30),
        'var6': random.randint(100, 500),
        'var7': random.randint(18, 65),
        'var8': random.randint(10.0, 100.0),
        'var9': random.randint(50, 300),
        'var10': random.randint(10, 100),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
dt = pd.DataFrame(dataset)

In [7]:
list(dt)

['amount',
 'last_amount_60day',
 'address_change60day',
 'score',
 'age',
 'transaction_count',
 'address_change30day',
 'var2',
 'var3',
 'var4',
 'var5',
 'var6',
 'var7',
 'var8',
 'var9',
 'var10',
 'fraud']

In [8]:
import pandas as pd
from sklearn.ensemble import IsolationForest

# Load the dataset
df = dt  # Replace 'your_dataset.csv' with the actual filename or path

# Select the variables of interest
variables = ['amount',
 'last_amount_60day',
 'address_change60day',
 'score',
 'age',
 'transaction_count',
 'address_change30day',
 'var2',
 'var3',
 'var4',
 'var5',
 'var6',
 'var7',
 'var8',
 'var9',
 'var10',
 'fraud']

# Extract the feature matrix from the dataset
X = df[variables].values

# Fit the Isolation Forest model
model = IsolationForest(contamination=0.01)  # Adjust the contamination parameter based on the expected fraud rate
model.fit(X)

# Predict the anomaly scores for each observation
anomaly_scores = model.decision_function(X)

# Add the anomaly scores to the dataset
df['anomaly_score'] = anomaly_scores

# Sort the dataset by anomaly scores in ascending order
sorted_df = df.sort_values(by='anomaly_score')

# Display the observations with the lowest anomaly scores (most unusual patterns)
top_unusual_patterns = sorted_df.head(10)
print(top_unusual_patterns)


        amount  last_amount_60day  address_change60day  score  age  \
136007     480                 50                    3      1   22   
66037      334                 74                    0    954   19   
82887       26                191                    0    809   22   
89015       98                 17                    1    602   29   
101652     416                 18                    3    852   53   
148520     498                157                    0    146   20   
102364     500                 28                    3     39   26   
151470     458                  7                    0     18   41   
116902      18                 30                    3    101   18   
182151     478                 85                    3    893   62   

        transaction_count  address_change30day  var2  var3  var4  var5  var6  \
136007                 13                    9     1    11   392    25   110   
66037                   8                    0     1    57   917    2

In [9]:
df['fraud'].value_counts()

0    198000
1      2000
Name: fraud, dtype: int64

In [10]:
import pandas as pd

# Load the dataset
df = pd.DataFrame(dataset)

# Step 1: Analyze the distribution of each variable
fraud_data = df[df['fraud'] == 1]
non_fraud_data = df[df['fraud'] == 0]

# Example analysis for 'amount' variable
amount_fraud_dist = fraud_data['amount'].describe()
amount_non_fraud_dist = non_fraud_data['amount'].describe()

# Step 2: Identify potential threshold values
amount_thresholds = [100, 200, 300, 400]

# Step 3: Evaluate fraud rates captured
fraud_rates = []
for threshold in amount_thresholds:
    fraud_count = len(fraud_data[fraud_data['amount'] > threshold])
    fraud_rate = (fraud_count / num_fraud_cases) * 100
    fraud_rates.append(fraud_rate)

# Step 4: Consider business impact
# Evaluate other factors (false positive rate, false negative rate, operational costs, etc.)

# Step 5: Iterate and fine-tune if necessary

# Example of printing the results
print("Amount Threshold | Fraud Rate Captured")
for i in range(len(amount_thresholds)):
    print(f"{amount_thresholds[i]} | {fraud_rates[i]}%")


Amount Threshold | Fraud Rate Captured
100 | 80.9%
200 | 61.8%
300 | 41.8%
400 | 20.45%


In [11]:
import pandas as pd

# Load the dataset
df = pd.DataFrame(dataset)

# Step 1: Analyze the distribution of each variable
fraud_data = df[df['fraud'] == 1]
non_fraud_data = df[df['fraud'] == 0]

# Example analysis for 'amount' variable
amount_fraud_dist = fraud_data['last_amount_60day'].describe()
amount_non_fraud_dist = non_fraud_data['last_amount_60day'].describe()

# Step 2: Identify potential threshold values
amount_thresholds = [100, 200, 300, 400]

# Step 3: Evaluate fraud rates captured
fraud_rates = []
for threshold in amount_thresholds:
    fraud_count = len(fraud_data[fraud_data['last_amount_60day'] > threshold])
    fraud_rate = (fraud_count / num_fraud_cases) * 100
    fraud_rates.append(fraud_rate)

# Step 4: Consider business impact
# Evaluate other factors (false positive rate, false negative rate, operational costs, etc.)

# Step 5: Iterate and fine-tune if necessary

# Example of printing the results
print("'last_amount_60day' | Fraud Rate Captured")
for i in range(len(amount_thresholds)):
    print(f"{amount_thresholds[i]} | {fraud_rates[i]}%")


'last_amount_60day' | Fraud Rate Captured
100 | 48.949999999999996%
200 | 0.0%
300 | 0.0%
400 | 0.0%


In [12]:
import pandas as pd

# Load the dataset
df = pd.DataFrame(dataset)

# Step 1: Analyze the distribution of each variable
fraud_data = df[df['fraud'] == 1]
non_fraud_data = df[df['fraud'] == 0]

# Define variables of interest
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Dictionary to store threshold values
thresholds = {}

# Iterate over variables
for var in variables:
    # Calculate descriptive statistics for fraud and non-fraud cases
    fraud_stats = fraud_data[var].describe()
    non_fraud_stats = non_fraud_data[var].describe()

    # Set the threshold as the median of non-fraud cases
    threshold = non_fraud_stats['50%']
    thresholds[var] = threshold

# Step 2: Evaluate fraud rates captured
fraud_rates = []
for var in variables:
    threshold = thresholds[var]
    fraud_count = len(fraud_data[fraud_data[var] > threshold])
    fraud_rate = (fraud_count / num_fraud_cases) * 100
    fraud_rates.append(fraud_rate)

# Step 3: Consider business impact
# Evaluate other factors (false positive rate, false negative rate, operational costs, etc.)

# Example of printing the results
print("Variable | Threshold | Fraud Rate Captured")
for i in range(len(variables)):
    var = variables[i]
    threshold = thresholds[var]
    print(f"{var} | {threshold} | {fraud_rates[i]}%")


Variable | Threshold | Fraud Rate Captured
amount | 250.0 | 51.7%
last_amount_60day | 100.0 | 48.949999999999996%
address_change60day | 2.0 | 24.25%
score | 500.0 | 48.0%


In [13]:
import pandas as pd

# Load the dataset
df = pd.DataFrame(dataset)

# Step 1: Analyze the distribution of each variable
fraud_data = df[df['fraud'] == 1]
non_fraud_data = df[df['fraud'] == 0]

# Define variables of interest
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Dictionary to store threshold ranges
threshold_ranges = {}

# Iterate over variables
for var in variables:
    # Calculate descriptive statistics for fraud and non-fraud cases
    fraud_stats = fraud_data[var].describe()
    non_fraud_stats = non_fraud_data[var].describe()

    # Set the threshold range based on percentiles
    lower_percentile = 25  # Lower percentile (e.g., 25%)
    upper_percentile = 75  # Upper percentile (e.g., 75%)
    lower_threshold = non_fraud_data[var].quantile(lower_percentile / 100)
    upper_threshold = non_fraud_data[var].quantile(upper_percentile / 100)
    threshold_ranges[var] = (lower_threshold, upper_threshold)

# Step 2: Evaluate fraud rates captured within the threshold range
fraud_rates = []
for var in variables:
    lower_threshold, upper_threshold = threshold_ranges[var]
    fraud_count = len(fraud_data[(fraud_data[var] > lower_threshold) & (fraud_data[var] < upper_threshold)])
    fraud_rate = (fraud_count / num_fraud_cases) * 100
    fraud_rates.append(fraud_rate)

# Step 3: Consider business impact
# Evaluate other factors (false positive rate, false negative rate, operational costs, etc.)

# Example of printing the results
print("Variable | Threshold Range | Fraud Rate Captured")
for i in range(len(variables)):
    var = variables[i]
    lower_threshold, upper_threshold = threshold_ranges[var]
    print(f"{var} | {lower_threshold}-{upper_threshold} | {fraud_rates[i]}%")


Variable | Threshold Range | Fraud Rate Captured
amount | 125.0-376.0 | 50.349999999999994%
last_amount_60day | 50.0-150.0 | 49.45%
address_change60day | 1.0-2.0 | 0.0%
score | 249.0-750.0 | 49.3%


In [17]:
list(df)

['amount',
 'last_amount_60day',
 'address_change60day',
 'score',
 'age',
 'transaction_count',
 'address_change30day',
 'var2',
 'var3',
 'var4',
 'var5',
 'var6',
 'var7',
 'var8',
 'var9',
 'var10',
 'fraud',
 'anomaly_score',
 'amount_bin',
 'last_amount_60day_bin',
 'address_change60day_bin',
 'score_bin']

In [16]:
dt.head()

Unnamed: 0,amount,last_amount_60day,address_change60day,score,age,transaction_count,address_change30day,var2,var3,var4,...,var7,var8,var9,var10,fraud,anomaly_score,amount_bin,last_amount_60day_bin,address_change60day_bin,score_bin
0,47,6,0,223,54,78,5,5,73,667,...,44,30,197,65,0,0.081231,,,,
1,259,64,1,574,64,82,2,6,35,387,...,34,100,214,25,0,0.095451,2.0,0.0,0.0,3.0
2,428,30,2,575,24,95,0,10,59,260,...,56,76,206,69,0,0.026609,,,4.0,3.0
3,210,63,2,512,26,51,8,1,94,227,...,25,46,194,36,0,0.070326,1.0,0.0,4.0,2.0
4,47,2,3,43,64,75,8,5,18,668,...,63,74,171,66,0,0.048869,,,,


In [15]:
import pandas as pd

# Load the dataset
df = dt

# Step 1: Analyze the distribution of each variable
fraud_data = df[df['fraud'] == 1]
non_fraud_data = df[df['fraud'] == 0]

# Define variables of interest
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Dictionary to store threshold ranges and bins
threshold_ranges = {}
bins = {}

# Iterate over variables
for var in variables:
    # Calculate descriptive statistics for fraud and non-fraud cases
    fraud_stats = fraud_data[var].describe()
    non_fraud_stats = non_fraud_data[var].describe()

    # Set the threshold range based on percentiles
    lower_percentile = 25  # Lower percentile (e.g., 25%)
    upper_percentile = 75  # Upper percentile (e.g., 75%)
    lower_threshold = non_fraud_data[var].quantile(lower_percentile / 100)
    upper_threshold = non_fraud_data[var].quantile(upper_percentile / 100)
    threshold_ranges[var] = (lower_threshold, upper_threshold)

    # Create bins based on the threshold range
    bin_edges = [lower_threshold]  # Start with lower threshold as the first bin edge
    num_bins = 5  # Number of bins (adjust as needed)
    bin_size = (upper_threshold - lower_threshold) / num_bins
    for i in range(1, num_bins):
        bin_edge = lower_threshold + i * bin_size
        bin_edges.append(bin_edge)
    bin_edges.append(upper_threshold)  # Add upper threshold as the last bin edge
    bins[var] = bin_edges

# Step 2: Assign data points to bins
for var in variables:
    bin_edges = bins[var]
    df[f'{var}_bin'] = pd.cut(df[var], bins=bin_edges, labels=False, include_lowest=True)

# Step 3: Calculate fraud rates for each bin
fraud_rates = {}
for var in variables:
    fraud_rates[var] = df.groupby(f'{var}_bin')['fraud'].mean() * 100

# Example of printing the results
print("Variable | Bin | Fraud Rate")
for var in variables:
    bin_edges = bins[var]
    fraud_rate = fraud_rates[var]
    for i in range(len(bin_edges) - 1):
        bin_label = f"{bin_edges[i]} - {bin_edges[i+1]}"
        rate = fraud_rate.get(i, 0)
        print(f"{var} | {bin_label} | {rate}%")


Variable | Bin | Fraud Rate
amount | 125.0 - 175.2 | 0.9106847180286355%
amount | 175.2 - 225.4 | 1.01171992387058%
amount | 225.4 - 275.6 | 0.9653290205195594%
amount | 275.6 - 325.8 | 1.075914527348246%
amount | 325.8 - 376.0 | 1.11439842209073%
last_amount_60day | 50.0 - 70.0 | 1.0189924891163948%
last_amount_60day | 70.0 - 90.0 | 1.10803324099723%
last_amount_60day | 90.0 - 110.0 | 0.8919176228892118%
last_amount_60day | 110.0 - 130.0 | 0.9265787088327123%
last_amount_60day | 130.0 - 150.0 | 1.0523652921831521%
address_change60day | 1.0 - 1.2 | 1.0458117538843008%
address_change60day | 1.2 - 1.4 | 0%
address_change60day | 1.4 - 1.6 | 0%
address_change60day | 1.6 - 1.8 | 0%
address_change60day | 1.8 - 2.0 | 0.9872021973210198%
score | 249.0 - 349.2 | 1.0084033613445378%
score | 349.2 - 449.4 | 1.0880515817046141%
score | 449.4 - 549.6 | 1.064099315936154%
score | 549.6 - 649.8 | 0.9025448806470705%
score | 649.8 - 750.0 | 0.8836816760164821%


In [21]:
import pandas as pd

# Load the dataset
df = dt

# Step 1: Analyze the distribution of each variable
fraud_data = df[df['fraud'] == 1]
non_fraud_data = df[df['fraud'] == 0]

# Define variables of interest
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Dictionary to store threshold ranges and bins
threshold_ranges = {}
bins = {}

# Iterate over variables
for var in variables:
    # Calculate descriptive statistics for fraud and non-fraud cases
    fraud_stats = fraud_data[var].describe()
    non_fraud_stats = non_fraud_data[var].describe()

    # Set the threshold range based on percentiles
    lower_percentile = 25  # Lower percentile (e.g., 25%)
    upper_percentile = 75  # Upper percentile (e.g., 75%)
    lower_threshold = non_fraud_stats['25%']
    upper_threshold = non_fraud_stats['75%']
    threshold_ranges[var] = (lower_threshold, upper_threshold)

    # Create bins based on the threshold range
    bin_edges = [lower_threshold]  # Start with lower threshold as the first bin edge
    num_bins = 5  # Number of bins (adjust as needed)
    bin_size = (upper_threshold - lower_threshold) / num_bins
    for i in range(1, num_bins + 1):  # Adjusted the range to include upper threshold
        bin_edge = lower_threshold + i * bin_size
        bin_edges.append(bin_edge)
    bins[var] = bin_edges

# Step 2: Assign data points to bins
for var in variables:
    bin_edges = bins[var]
    df[f'{var}_bin'] = pd.cut(df[var], bins=bin_edges, labels=False, include_lowest=True)

# Step 3: Calculate fraud rates, count, and amount for each bin
fraud_summary = {}
for var in variables:
    bin_edges = bins[var]
    grouped_data = df.groupby(f'{var}_bin')
    fraud_summary[var] = grouped_data.agg(
        fraud_count=('fraud', 'sum'),
        fraud_rate=('fraud', lambda x: x.sum() / x.count()),
        fraud_amount=('amount', 'sum')
    )

# Example of printing the results
print("Variable | Bin | Fraud Count | Fraud Rate | Fraud Amount")
for var in variables:
    bin_edges = bins[var]
    summary = fraud_summary[var]
    for i in range(len(bin_edges) - 1):
        bin_label = f"{bin_edges[i]} - {bin_edges[i+1]}"
        count = summary['fraud_count'].iloc[i] if i < len(summary['fraud_count']) else 0
        rate = summary['fraud_rate'].iloc[i] * 100 if i < len(summary['fraud_rate']) else 0
        amount = summary['fraud_amount'].iloc[i] if i < len(summary['fraud_amount']) else 0
        print(f"{var} | {bin_label} | {count} | {rate:.2f}% | {amount}")


Variable | Bin | Fraud Count | Fraud Rate | Fraud Amount
amount | 125.0 - 175.2 | 187 | 0.91% | 3081076
amount | 175.2 - 225.4 | 202 | 1.01% | 4001354
amount | 225.4 - 275.6 | 191 | 0.97% | 4956515
amount | 275.6 - 325.8 | 215 | 1.08% | 6003265
amount | 325.8 - 376.0 | 226 | 1.11% | 7120198
last_amount_60day | 50.0 - 70.0 | 213 | 1.02% | 5221393
last_amount_60day | 70.0 - 90.0 | 220 | 1.11% | 4953157
last_amount_60day | 90.0 - 110.0 | 178 | 0.89% | 4978885
last_amount_60day | 110.0 - 130.0 | 184 | 0.93% | 4942331
last_amount_60day | 130.0 - 150.0 | 208 | 1.05% | 4937479
address_change60day | 1.0 - 1.2 | 523 | 1.05% | 12474861
address_change60day | 1.2 - 1.4 | 496 | 0.99% | 12594833
address_change60day | 1.4 - 1.6 | 0 | 0.00% | 0
address_change60day | 1.6 - 1.8 | 0 | 0.00% | 0
address_change60day | 1.8 - 2.0 | 0 | 0.00% | 0
score | 249.0 - 349.2 | 204 | 1.01% | 5030917
score | 349.2 - 449.4 | 216 | 1.09% | 4981972
score | 449.4 - 549.6 | 210 | 1.06% | 4926560
score | 549.6 - 649.8 | 183

In [22]:
df['fraud'].value_counts()

0    198000
1      2000
Name: fraud, dtype: int64

In [39]:
import itertools
import random
import pandas as pd

dataset_size = 20000  # Total number of observations in the dataset
fraud_rate = 0.09  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 1500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        if var == 'score':
            rule_parts.append(f"{var} > 500")
        elif var == 'amount':
            rule_parts.append(f"{var} > 500")
        else:
            if variables[var]['value'] is not None:
                rule_parts.append(f"{var} > {variables[var]['value']}")
            else:
                rule_parts.append(f"{var} is not None")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
#rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        rules_with_fraud_counts = []
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': data[var]} for var in combo}
            rule = create_strategy_rule(strategy_variables)
            #print("rule", rule)
            if data['fraud'] == 1:
                fraud_count += 1
            rule_fraud_captured = (str(rule) + "|" + str(fraud_count))
            #print("rule_fraud_captured", rule_fraud_captured)
            fraud_percentage = (fraud_count / num_fraud_cases) * 100
            rules_with_fraud_counts.append((rule, fraud_percentage))
#print("rules_with_fraud_counts",rules_with_fraud_counts)


#Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured", fraud_percentage)
    print()


Rule: amount > 500 and last_amount_60day > 140 and address_change60day > 0 and score > 500
Total frauds captured: 1800/1800
% fraud captured 100.0

Rule: amount > 500 and last_amount_60day > 137 and address_change60day > 1 and score > 500
Total frauds captured: 1799/1800
% fraud captured 99.94444444444444

Rule: amount > 500 and last_amount_60day > 39 and address_change60day > 1 and score > 500
Total frauds captured: 1799/1800
% fraud captured 99.94444444444444

Rule: amount > 500 and last_amount_60day > 126 and address_change60day > 3 and score > 500
Total frauds captured: 1799/1800
% fraud captured 99.94444444444444

Rule: amount > 500 and last_amount_60day > 1 and address_change60day > 2 and score > 500
Total frauds captured: 1799/1800
% fraud captured 99.94444444444444

Rule: amount > 500 and last_amount_60day > 64 and address_change60day > 0 and score > 500
Total frauds captured: 1799/1800
% fraud captured 99.94444444444444

Rule: amount > 500 and last_amount_60day > 33 and addres

In [44]:
dt['fraud'].value_counts()

0    198000
1      2000
Name: fraud, dtype: int64

In [46]:
non_fraud_data.shape

(198000, 22)

In [None]:
import numpy as np

# Load the dataset
df = dt

# Step 1: Analyze the distribution of each variable
fraud_data = df[df['fraud'] == 1]
non_fraud_data = df[df['fraud'] == 0]

# Define variables of interest
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Dictionary to store threshold ranges and bins
threshold_ranges = {}
bins = {}

# Iterate over variables
for var in variables:
    # Calculate descriptive statistics for fraud and non-fraud cases
    fraud_stats = fraud_data[var].describe()
    non_fraud_stats = non_fraud_data[var].describe()

    # Set the threshold range based on percentiles
    lower_percentile = 25  # Lower percentile (e.g., 25%)
    upper_percentile = 75  # Upper percentile (e.g., 75%)
    lower_threshold = non_fraud_stats['25%']
    upper_threshold = non_fraud_stats['75%']
    threshold_ranges[var] = (lower_threshold, upper_threshold)

    # Create bins based on the threshold range
    bin_edges = [lower_threshold]  # Start with lower threshold as the first bin edge
    num_bins = 5  # Number of bins (adjust as needed)
    bin_size = (upper_threshold - lower_threshold) / num_bins
    for i in range(1, num_bins + 1):  # Adjusted the range to include upper threshold
        bin_edge = lower_threshold + i * bin_size
        bin_edges.append(bin_edge)
    bins[var] = bin_edges

# Step 2: Replace missing values with a default value
default_value = -1  # Choose a suitable default value
df['last_amount_60day'] = df['last_amount_60day'].fillna(default_value)

# Step 3: Assign data points to bins
for var in variables:
    bin_edges = bins[var]
    df[f'{var}_bin'] = pd.cut(df[var], bins=bin_edges, labels=False, include_lowest=True)

# Step 4: Calculate fraud rates, count, and amount for each bin
fraud_summary = {}
for var in variables:
    bin_edges = bins[var]
    grouped_data = df.groupby(f'{var}_bin')
    fraud_summary[var] = grouped_data.agg(
        fraud_count=('fraud', 'sum'),
        fraud_rate=('fraud', lambda x: x.sum() / x.count()),
        fraud_amount=('amount', 'sum')
    )

# Define the list of variables to create rules on
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': None, 'bin_values': []} for var in combo}
            for var in combo:
                bin_value = data[f'{var}_bin']
                if not np.isnan(bin_value):
                    bin_values = (bins[var][int(bin_value)], bins[var][int(bin_value) + 1])
                    strategy_variables[var]['value'] = bin_values
                    strategy_variables[var]['bin_values'].extend(bin_values)
            rule = create_strategy_rule(strategy_variables)
            if data['fraud'] == 1:
                fraud_count += 1
            print("fraud_count",fraud_count)
            print("num_fraud_cases", num_fraud_cases)
            rule_fraud_captured = (str(rule) + "|" + str(fraud_count))
            fraud_percentage = (fraud_count / num_fraud_cases) * 100
            rules_with_fraud_counts.append((rule, fraud_percentage))

# Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]
print(top_10_rules)
# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured", fraud_percentage)
    print()


In [23]:
import itertools
import random
import pandas as pd

dataset_size = 200000  # Total number of observations in the dataset
fraud_rate = 0.01  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        if var == 'score':
            rule_parts.append(f"{var} > {threshold_ranges[var][1]}")
        else:
            rule_parts.append(f"{var} > {threshold_ranges[var][0]}")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
rules_with_fraud_counts = []
for combo in combinations:
    if set(variables).issubset(set(combo)):  # Check if all variables are present in the combination
        fraud_count = 0
        for idx, data in df.iterrows():
            strategy_variables = {var: {'value': data[var]} for var in combo}
            rule = create_strategy_rule(strategy_variables)
            if data['fraud'] == 1:
                fraud_count += 1
        fraud_percentage = (fraud_count / num_fraud_cases) * 100
        rules_with_fraud_counts.append((combo, fraud_percentage))

# Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts percentage
for rule, fraud_percentage in top_10_rules:
    rule_str = " and ".join(rule)
    print(f"Rule: {rule_str}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print("% fraud captured: {fraud_percentage:.2f}%")
    print()


Rule: amount and last_amount_60day and address_change60day and score
Total frauds captured: 2000/2000
% fraud captured: {fraud_percentage:.2f}%



In [38]:
import itertools
import random
import pandas as pd

dataset_size = 20000  # Total number of observations in the dataset
fraud_rate = 0.05  # Desired fraud rate (2% of the dataset)

# Calculate the number of fraud cases based on the fraud rate
num_fraud_cases = int(dataset_size * fraud_rate)

# Generate a dataset of observations with random variable values and fraud cases
dataset = []
for _ in range(dataset_size):
    data = {
        'amount': random.randint(1, 500),
        'last_amount_60day': random.randint(0, 200),
        'address_change60day': random.randint(0, 3),
        'score': random.randint(0, 999),
        'fraud': 0
    }
    dataset.append(data)

# Set the fraud cases in the dataset
fraud_indices = random.sample(range(dataset_size), num_fraud_cases)
for idx in fraud_indices:
    dataset[idx]['fraud'] = 1

# Convert the dataset to a pandas DataFrame
df = pd.DataFrame(dataset)

# Define the number of bins for each variable
num_bins = {
    'amount': 5,
    'last_amount_60day': 5,
    'address_change60day': 4,
    'score': 5
}

# Calculate the bin ranges for each variable
bin_ranges = {}
for variable in num_bins:
    min_value = df[variable].min()
    max_value = df[variable].max()
    bin_edges = pd.cut(df[variable], bins=num_bins[variable], retbins=True)[1]
    bin_ranges[variable] = list(zip(bin_edges[:-1], bin_edges[1:]))

def create_strategy_rule(variables):
    rule_parts = []
    for var in variables:
        ranges = bin_ranges[var]
        for bin_range in ranges:
            rule_parts.append(f"{bin_range[0]} < {var} <= {bin_range[1]}")
    rule = " and ".join(rule_parts)
    return rule

# Define the list of variables to create rules on
variables = ['amount', 'last_amount_60day', 'address_change60day', 'score']

# Generate combinations of variables
combinations = []
for r in range(1, len(variables) + 1):
    combinations.extend(itertools.combinations(variables, r))

# Apply the strategy rules to the dataset
rules_with_fraud_counts = []
for combo in combinations:
    fraud_count = 0
    fraud_amount = 0
    for idx, data in df.iterrows():
        strategy_variables = {var: {'value': data[var]} for var in combo}
        rule = create_strategy_rule(strategy_variables)
        print("r", rule)
        if eval(rule) and data['fraud'] == 1:
            fraud_count += 1
            fraud_amount += data['amount']
        fraud_percentage = (fraud_count / num_fraud_cases) * 100
        rules_with_fraud_counts.append((combo, fraud_percentage, fraud_count, fraud_amount))

# Sort the rules based on the fraud counts percentage in descending order
sorted_rules = sorted(rules_with_fraud_counts, key=lambda x: x[1], reverse=True)

# Retrieve the top 10 rules
top_10_rules = sorted_rules[:10]

# Display the top 10 rules and their fraud counts, percentages, and amounts
for rule, fraud_percentage, fraud_count, fraud_amount in top_10_rules:
    print(f"Rule: {rule}")
    print(f"Total frauds captured: {int(fraud_percentage * num_fraud_cases / 100)}/{num_fraud_cases}")
    print(f"Fraud count: {fraud_count}")
    print(f"Fraud amount: ${fraud_amount}")
    print()


r 0.501 < amount <= 100.8 and 100.8 < amount <= 200.6 and 200.6 < amount <= 300.4 and 300.4 < amount <= 400.2 and 400.2 < amount <= 500.0
r 0.501 < amount <= 100.8 and 100.8 < amount <= 200.6 and 200.6 < amount <= 300.4 and 300.4 < amount <= 400.2 and 400.2 < amount <= 500.0
r 0.501 < amount <= 100.8 and 100.8 < amount <= 200.6 and 200.6 < amount <= 300.4 and 300.4 < amount <= 400.2 and 400.2 < amount <= 500.0
r 0.501 < amount <= 100.8 and 100.8 < amount <= 200.6 and 200.6 < amount <= 300.4 and 300.4 < amount <= 400.2 and 400.2 < amount <= 500.0
r 0.501 < amount <= 100.8 and 100.8 < amount <= 200.6 and 200.6 < amount <= 300.4 and 300.4 < amount <= 400.2 and 400.2 < amount <= 500.0
r 0.501 < amount <= 100.8 and 100.8 < amount <= 200.6 and 200.6 < amount <= 300.4 and 300.4 < amount <= 400.2 and 400.2 < amount <= 500.0
r 0.501 < amount <= 100.8 and 100.8 < amount <= 200.6 and 200.6 < amount <= 300.4 and 300.4 < amount <= 400.2 and 400.2 < amount <= 500.0
r 0.501 < amount <= 100.8 and 100.

NameError: name 'last_amount_60day' is not defined