In [24]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler

# Create a synthetic dataset
np.random.seed(42)
n_samples = 1000

data = {
    'transaction_time': pd.date_range(start='2022-01-01', periods=n_samples, freq='H'),
    'amount': np.random.uniform(10, 1000, n_samples),
    'user_id': np.random.choice(10, 1000, n_samples),
    'fraud_tag': np.random.choice([0, 1], size=n_samples),
    'time_since_last_expiration': np.random.randint(1, 30, n_samples),
    'count_apple_ids_24hr': np.random.randint(0, 5, n_samples),
    'count_apple_ids_30days': np.random.randint(0, 20, n_samples),
    'days_since_card_active': np.random.randint(1, 365, n_samples),
    'avs_address_mismatch': np.random.choice([0, 1], size=n_samples),
    'days_since_address_change': np.random.randint(1, 365, n_samples),
    'days_since_phone_change': np.random.randint(1, 365, n_samples),
}

df = pd.DataFrame(data)

# Example feature engineering for a few features

# 1. Transaction Velocity
df['transaction_velocity'] = df.groupby('fraud_tag')['transaction_time'].diff().dt.total_seconds()

# 2. User Behavior Consistency Index
df['user_behavior_consistency'] = df.groupby('fraud_tag')['amount'].pct_change()

# 3. Sequential Pattern Indicator
df['sequential_pattern'] = df['amount'].rolling(window=3).mean()

# 4. Social Network Features
df['transaction_density_social_network'] = df.groupby('user_id')['amount'].transform('mean')

# 5. Temporal Anomaly Score
df['temporal_anomaly_score'] = df.groupby('user_id')['amount'].transform(lambda x: (x - x.mean()) / x.std())

# Standardize numeric features
numeric_cols = ['transaction_velocity', 'user_behavior_consistency', 'sequential_pattern',
                'transaction_density_social_network', 'temporal_anomaly_score']

scaler = StandardScaler()
df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

# Display the updated DataFrame
print(df.head())


     transaction_time      amount  user_id  fraud_tag  \
0 2022-01-01 00:00:00  380.794718        7          0   
1 2022-01-01 01:00:00  951.207163        2          1   
2 2022-01-01 02:00:00  734.674002        7          1   
3 2022-01-01 03:00:00  602.671899        4          0   
4 2022-01-01 04:00:00  164.458454        0          1   

   time_since_last_expiration  count_apple_ids_24hr  count_apple_ids_30days  \
0                          17                     1                      10   
1                          10                     3                       1   
2                          11                     0                      15   
3                          22                     1                       8   
4                           9                     0                      14   

   days_since_card_active  avs_address_mismatch  days_since_address_change  \
0                     225                     0                        129   
1                     313 

In [26]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

np.random.seed(42)

# Generate random data for the sample dataset
data = {
    'amount': np.random.uniform(10, 1000, 2000),
    'transaction_time': [datetime.now() - timedelta(days=np.random.randint(1, 365)) for _ in range(2000)],
    'fraud_tag': np.random.choice([0, 1], size=2000, p=[0.95, 0.05]),
    'time_since_last_expiration_date': np.random.randint(1, 365, 2000),
    'count_apple_ids_24hr': np.random.randint(0, 10, 2000),
    'count_apple_ids_30days': np.random.randint(0, 20, 2000),
    'days_since_card_active': np.random.randint(1, 365, 2000),
    'avs_address_mismatch': np.random.choice([0, 1], size=2000, p=[0.8, 0.2]),
    'days_since_address_change': np.random.randint(1, 365, 2000),
    'days_since_phone_change': np.random.randint(1, 365, 2000),
}

df = pd.DataFrame(data)


In [27]:
df['hour_of_day'] = df['transaction_time'].dt.hour
df['day_of_week'] = df['transaction_time'].dt.dayofweek
df['amount_category'] = pd.cut(df['amount'], bins=[0, 100, 500, 1000], labels=['low', 'medium', 'high'])
df['transaction_count_24hr'] = df['transaction_time'].apply(lambda x: df[(df['transaction_time'] > x - timedelta(hours=24)) & (df['transaction_time'] <= x)].shape[0])
df['transaction_count_7days'] = df['transaction_time'].apply(lambda x: df[(df['transaction_time'] > x - timedelta(days=7)) & (df['transaction_time'] <= x)].shape[0])
df['card_age_category'] = pd.cut(df['days_since_card_active'], bins=[0, 30, 180, 365], labels=['new', 'medium', 'old'])


In [28]:
df['fraud_tag'].value_counts()

0    1923
1      77
Name: fraud_tag, dtype: int64

In [29]:
df.head()

Unnamed: 0,amount,transaction_time,fraud_tag,time_since_last_expiration_date,count_apple_ids_24hr,count_apple_ids_30days,days_since_card_active,avs_address_mismatch,days_since_address_change,days_since_phone_change,hour_of_day,day_of_week,amount_category,transaction_count_24hr,transaction_count_7days,card_age_category
0,380.794718,2023-06-10 08:59:29.208167,0,17,9,15,187,0,232,225,8,5,medium,7,35,old
1,951.207163,2023-08-06 08:59:29.208167,0,196,1,13,302,0,89,310,8,6,high,8,44,old
2,734.674002,2023-06-10 08:59:29.208167,0,327,9,0,200,1,77,61,8,5,high,7,35,old
3,602.671899,2023-04-21 08:59:29.208167,0,318,5,15,39,0,136,16,8,4,high,7,41,medium
4,164.458454,2023-09-18 08:59:29.208167,0,297,5,17,220,1,110,229,8,0,medium,4,42,old


In [30]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define features and target variable
features = df.drop(['fraud_tag', 'transaction_time'], axis=1)
target = df['fraud_tag']

# Convert categorical variables to dummy/indicator variables
features = pd.get_dummies(features)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Build a Random Forest Classifier
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.96

Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98       384
           1       0.00      0.00      0.00        16

    accuracy                           0.96       400
   macro avg       0.48      0.50      0.49       400
weighted avg       0.92      0.96      0.94       400


Confusion Matrix:
 [[384   0]
 [ 16   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
# Get variable importance
importances = model.feature_importances_

# Create a DataFrame to display the feature importance
feature_importance_df = pd.DataFrame({'Feature': features.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)

# Print the variable importance
print(feature_importance_df)


                            Feature  Importance
0                            amount    0.131527
6         days_since_address_change    0.130677
7           days_since_phone_change    0.114428
4            days_since_card_active    0.112385
1   time_since_last_expiration_date    0.112023
11          transaction_count_7days    0.085893
3            count_apple_ids_30days    0.071396
10           transaction_count_24hr    0.065104
2              count_apple_ids_24hr    0.060014
9                       day_of_week    0.051540
16         card_age_category_medium    0.011766
13           amount_category_medium    0.010860
14             amount_category_high    0.010739
17            card_age_category_old    0.009958
5              avs_address_mismatch    0.009740
15            card_age_category_new    0.006314
12              amount_category_low    0.005634
8                       hour_of_day    0.000000


In [38]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE, SelectKBest, mutual_info_classif
from sklearn.linear_model import Lasso
from sklearn.decomposition import KernelPCA
from sklearn.manifold import TSNE
from sklearn.feature_selection import RFECV
#from boruta import BorutaPy
#import umap
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Generating synthetic data for demonstration
np.random.seed(42)
data = {
    'amount': np.random.uniform(10, 1000, 2000),
    'transaction_time': np.random.uniform(0, 1, 2000),
    'fraud_tag': np.random.choice([0, 1], size=2000, p=[0.95, 0.05]),
    'time_since_last_expiration_date': np.random.randint(1, 365, 2000),
    # Add other features as needed
}

df = pd.DataFrame(data)

def evaluate_model(X_train_reduced, X_test_reduced, model):
    model.fit(X_train_reduced, y_train)
    y_pred = model.predict(X_test_reduced)

    print("Model Performance on Reduced Features:")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:\n", classification_report(y_test, y_pred))
    print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

    
# Assuming 'fraud_tag' is the target variable
features = df.drop(['fraud_tag'], axis=1)
target = df['fraud_tag']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a list of feature reduction methods
feature_reduction_methods = [
    ("RFE", RFE(RandomForestClassifier(n_estimators=100, random_state=42), n_features_to_select=5)),
    ("Mutual Information", SelectKBest(score_func=mutual_info_classif, k=3)),
    ("LASSO", Lasso(alpha=0.1)),
    ("Kernel PCA", KernelPCA(n_components=2, kernel='rbf')),
    ("t-SNE", TSNE(n_components=2, random_state=42)),
    ("RFECV", RFECV(RandomForestClassifier(n_estimators=100, random_state=42), step=1, cv=5, scoring='accuracy')),
    #("Boruta", BorutaPy(RandomForestClassifier(n_estimators='auto', random_state=42), n_estimators='auto', verbose=2)),
    #("UMAP", umap.UMAP(n_components=2, random_state=42))
]

# Evaluate each feature reduction method
for method_name, method in feature_reduction_methods:
    print(f"\n{'='*40}\n{method_name}\n{'='*40}")

    # Fit the feature reduction method
    X_train_reduced = method.fit_transform(X_train_scaled, y_train)
    X_test_reduced = method.transform(X_test_scaled)

    # Display the variable importance or feature selection scores if available
    if hasattr(method, 'support_') and method_name != "t-SNE":
        # For methods with support (excluding t-SNE)
        if hasattr(method, 'estimator_'):
            # For methods with an estimator (e.g., RFE)
            try:
                importances = method.estimator_.feature_importances_
            except AttributeError:
                importances = None
        else:
            # For methods without an estimator (e.g., SelectKBest)
            importances = method.scores_
        if importances is not None:
            feature_names = X_train.columns[method.support_]
            variable_importance = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
            variable_importance = variable_importance.sort_values(by='Importance', ascending=False)
            print(f"Variable Importance or Score with {method_name}:\n{variable_importance}")
        else:
            print(f"{method_name} does not provide variable importance information.")
    else:
        print(f"{method_name} does not provide variable importance information.")

    # Evaluate the model with reduced features
    evaluate_model(X_train_reduced, X_test_reduced, RandomForestClassifier(n_estimators=100, random_state=42))



RFE
Variable Importance or Score with RFE:
                           Feature  Importance
1                 transaction_time    0.367466
0                           amount    0.348696
2  time_since_last_expiration_date    0.283838
Model Performance on Reduced Features:
Accuracy: 0.94

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       377
           1       0.00      0.00      0.00        23

    accuracy                           0.94       400
   macro avg       0.47      0.50      0.48       400
weighted avg       0.89      0.94      0.91       400


Confusion Matrix:
 [[376   1]
 [ 23   0]]

Mutual Information
Mutual Information does not provide variable importance information.
Model Performance on Reduced Features:
Accuracy: 0.94

Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97       377
           1       0.00      0.00      0.0

AttributeError: 'Lasso' object has no attribute 'fit_transform'

In [43]:
import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

# Generating synthetic data for demonstration
np.random.seed(42)
data = {
    'amount': np.random.uniform(10, 1000, 2000),
    'transaction_time': np.random.uniform(0, 1, 2000),
    'fraud_tag': np.random.choice([0, 1], size=2000, p=[0.95, 0.05]),
    'time_since_last_expiration_date': np.random.randint(1, 365, 2000),
    # Add other features as needed
}

data = {
    'amount': np.random.uniform(10, 1000, 2000),
   # 'transaction_time': [datetime.now() - timedelta(days=np.random.randint(1, 365)) for _ in range(2000)],
    'fraud_tag': np.random.choice([0, 1], size=2000, p=[0.95, 0.05]),
    'time_since_last_expiration_date': np.random.randint(1, 365, 2000),
    'count_apple_ids_24hr': np.random.randint(0, 10, 2000),
    'count_apple_ids_30days': np.random.randint(0, 20, 2000),
    'days_since_card_active': np.random.randint(1, 365, 2000),
    'avs_address_mismatch': np.random.choice([0, 1], size=2000, p=[0.8, 0.2]),
    'days_since_address_change': np.random.randint(1, 365, 2000),
    'days_since_phone_change': np.random.randint(1, 365, 2000),
}

df = pd.DataFrame(data)

# Assuming 'fraud_tag' is the target variable
features = df.drop(['fraud_tag'], axis=1)
target = df['fraud_tag']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Select top k features based on mutual information
k_best = 3  # Choose the desired number of features
selector_mi = SelectKBest(score_func=mutual_info_classif, k=k_best)
X_train_mi = selector_mi.fit_transform(X_train_scaled, y_train)
X_test_mi = selector_mi.transform(X_test_scaled)

# Display the selected features
selected_features_mi = X_train.columns[selector_mi.get_support()]
print(f"Selected Features based on Mutual Information: {selected_features_mi}")

# Display the selected features and their values
selected_features_mi = X_train.columns[selector_mi.get_support()]
selected_values_mi = X_train_mi[0]  # Assuming you want the values of the first instance, adjust as needed
print(f"Selected Features based on Mutual Information: {selected_features_mi}")
print(f"Values of the Selected Features: {selected_values_mi}")


# Evaluate a model with the selected features
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)
model_rf.fit(X_train_mi, y_train)
y_pred_mi = model_rf.predict(X_test_mi)

# Evaluate model performance
print("\nModel Performance with Mutual Information-based Feature Selection:")
print("Accuracy:", accuracy_score(y_test, y_pred_mi))
print("\nClassification Report:\n", classification_report(y_test, y_pred_mi))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_mi))


Selected Features based on Mutual Information: Index(['count_apple_ids_24hr', 'count_apple_ids_30days',
       'days_since_card_active'],
      dtype='object')
Selected Features based on Mutual Information: Index(['count_apple_ids_24hr', 'count_apple_ids_30days',
       'days_since_card_active'],
      dtype='object')
Values of the Selected Features: [1.57110071 0.24603328 1.14458295]

Model Performance with Mutual Information-based Feature Selection:
Accuracy: 0.9425

Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.99      0.97       379
           1       0.00      0.00      0.00        21

    accuracy                           0.94       400
   macro avg       0.47      0.50      0.49       400
weighted avg       0.90      0.94      0.92       400


Confusion Matrix:
 [[377   2]
 [ 21   0]]


In [None]:
import pandas as pd
import numpy as np

# Create a synthetic dataset
np.random.seed(42)
n_samples = 1000

data = {
    'user_id': np.random.choice(range(1, 101), size=n_samples),
    'transaction_type': np.random.choice(['purchase', 'withdrawal', 'transfer'], size=n_samples),
    'amount': np.random.uniform(10, 1000, n_samples),
    'fraud_tag': np.random.choice([0, 1], size=n_samples),
    'transaction_time': pd.date_range(start='2022-01-01', periods=n_samples, freq='H'),
}

df = pd.DataFrame(data)

# Sort the DataFrame by user_id and transaction_time
df = df.sort_values(by=['user_id', 'transaction_type'])

# User Behavior Consistency Index
df['user_behavior_consistency'] = df.groupby('user_id')['amount'].pct_change()

# Transaction Diversity
transaction_diversity = df.groupby('user_id')['transaction_type'].nunique().reset_index()
transaction_diversity.columns = ['user_id', 'transaction_diversity']

# Merge the Transaction Diversity feature back to the main DataFrame
df = pd.merge(df, transaction_diversity, on='user_id', how='left')

# Fill NaN values with 0 (for users with only one type of transaction)
df['transaction_diversity'].fillna(0, inplace=True)

# Display the DataFrame with the new features
print(df[['user_id', 'fraud_tag','transaction_type', 'amount', 'user_behavior_consistency', 'transaction_diversity']])
