In [4]:
from data_pipeline import ETL_Pipeline
from dataset import Fraud_Dataset
from model import Fraud_Detector_Model
from metrics import Metrics

In [21]:
# Build the ETL helper and read the raw transactions CSV into a DataFrame.
etl_pipeline = ETL_Pipeline()
transactions_df = etl_pipeline.extract('experimental_transaction.csv')

# Apply the transformation steps in order; each one takes the current frame
# and hands back the frame used by the next step:
#   1. categorize_time            - features for the time (hour) of a transaction
#   2. categorize_year            - features for the part of the year of a transaction
#   3. calculate_avg_transactions - average transactions as a feature
#   4. remove_columns             - drop the columns that are no longer needed
transform_steps = (
    etl_pipeline.categorize_time,
    etl_pipeline.categorize_year,
    etl_pipeline.calculate_avg_transactions,
    etl_pipeline.remove_columns,
)
for transform_step in transform_steps:
    transactions_df = transform_step(transactions_df)

# Persist the transformed data as a CSV file.
# NOTE(review): transactions_df is rebound to whatever load() returns; its
# return value is not visible from this notebook - confirm it is the frame.
transactions_df = etl_pipeline.load(transactions_df)

In [6]:
# Dataset wrapper over the transformed CSV (presumably the file written by the
# ETL cell - confirm), set up with k_folds=5 and random_state=42.
# NOTE: data_path is a relative path; it may need adjusting when this is run
# from Jupyter.
fraud_dataset = Fraud_Dataset(
    data_path='transformed_data.csv',
    k_folds=5,
    random_state=42,
)

# Prepare the data: load it, balance it (choosing n=5000 observations),
# then split it into the k folds.
fraud_dataset.load_data()
fraud_dataset.balance_data(n=5000)
fraud_dataset.split_data()

# Training and validation sets are taken from the same fold, so the fold
# index is named once and reused for both lookups.
selected_fold = 0
training_data = fraud_dataset.get_training_dataset(fold=selected_fold)
validation_data = fraud_dataset.get_validation_dataset(fold=selected_fold)

# The testing dataset is the entire dataset.
# NOTE(review): if so, it overlaps the training fold above - confirm that
# evaluating on it is intended.
testing_data = fraud_dataset.get_testing_dataset()


In [7]:
# Model types to compare; each name is passed to Fraud_Detector_Model.train/validate
# and used as the model name for the generated report.
candidate_models = ['logistic_regression', 'random_forest', 'ensemble']

# Separate features from the 'is_fraud' label once, up front. These splits do
# not depend on the model being evaluated, so they are built outside the loop.
# NOTE(review): assumes train()/validate() do not modify their inputs in
# place - confirm against the model module.
X_train = training_data.drop(columns=['is_fraud'])
y_train = training_data['is_fraud']

X_validate = validation_data.drop(columns=['is_fraud'])
y_validate = validation_data['is_fraud']

# NOTE(review): testing_data is described upstream as the entire dataset, so it
# presumably contains the training rows too - confirm the reported metrics are
# meant to be measured on data the model has already seen.
X_test = testing_data.drop(columns=['is_fraud'])
y_test = testing_data['is_fraud']

for model_type in candidate_models:
    # A new detector instance is created for every candidate model type.
    fraud_detector_model = Fraud_Detector_Model()

    # Train; the object returned by train() is what produces predictions below.
    trained_model = fraud_detector_model.train(X_train, y_train, model_type)

    # Validate on the held-out fold. The return value is not used anywhere in
    # this cell, so it is not bound to a name.
    fraud_detector_model.validate(X_validate, y_validate, model_type)

    # Generate predictions for the testing dataset.
    y_pred = trained_model.predict(X_test)

    # Write the metrics report for this model.
    # NOTE(review): arguments are passed as (y_pred, y_test); many metric APIs
    # expect (y_true, y_pred) - confirm the order against
    # Metrics.generate_report's signature.
    metrics = Metrics(model_name=model_type)
    metrics.generate_report(y_pred, y_test)

Report generated and saved to logistic_regression_report_2024-02-28_15-59-11.txt
Report generated and saved to random_forest_report_2024-02-28_15-59-15.txt
Report generated and saved to ensemble_report_2024-02-28_15-59-16.txt
