In [1]:
# Import the ETL_Pipeline class
from data_pipeline import ETL_Pipeline  

# File path to the raw transaction data CSV file
csv_file = 'transactions.csv'

# Create an instance of the ETL_Pipeline class
etl_pipeline = ETL_Pipeline()

# Transform the raw transaction data
transformed_filename = etl_pipeline.transform(csv_file)

print("Transformed CSV file:", transformed_filename)


Transformed CSV file: transformed_data.csv


In [3]:
import pandas as pd
df = pd.read_csv('transformed_data.csv'); df.head()

Unnamed: 0,amt,cc_num,is_night,is_holidays,is_post_holidays,is_summer,is_fraud
0,4.97,2703186189652095,1,1,1,0,0
1,107.23,630423337322,1,1,1,0,0
2,220.11,38859492057661,1,1,1,0,0
3,45.0,3534093764340240,1,1,1,0,0
4,41.96,375534208663984,1,1,1,0,0


In [5]:
from dataset import Fraud_Dataset

# Instantiate the Fraud_Dataset class
fraud_dataset = Fraud_Dataset(data_path='transformed_data.csv', k_folds=5, random_state=42, balance_samples=1000)

# Split data into k folds
fraud_dataset.split_data()

# Get training and validation datasets for a specific fold (e.g., fold 0)
fold = 4
train_dataset, val_dataset = fraud_dataset.get_training_validation_datasets(fold)

# Get the entire dataset as the testing dataset
test_dataset = fraud_dataset.get_testing_dataset()


In [7]:
print(test_dataset.shape)
print(train_dataset.shape)
print(val_dataset.shape)

(999999, 7)
(800000, 7)
(199999, 7)


In [9]:
val_dataset.head()

Unnamed: 0,amt,cc_num,is_night,is_holidays,is_post_holidays,is_summer,is_fraud
13,96.29,5559857416065248,1,1,1,0,0
15,3.26,6011999606625827,1,1,1,0,0
16,327.0,6011860238257910,1,1,1,0,0
18,63.07,2348245054386329,1,1,1,0,0
23,9.55,630441765090,1,1,1,0,0


In [13]:
from model import Fraud_Detector_Model

In [17]:
fdm_lr = Fraud_Detector_Model()
model_type='logistic_regression'
trained_lr_model = fdm_lr.train(train_dataset, train_datafile=None, model_type=model_type)
validated_lr_model = fdm_lr.validate(val_dataset, val_datafile=None, model_type=model_type)
tested_lr_model = fdm_lr.test(test_df=test_dataset, test_datafile=None, model_type=model_type)
print(f'The accuracy of {model_type} is {tested_lr_model[2]}')

The accuracy of logistic_regression is 0.8714978714978715


In [19]:
fdm_ensemble = Fraud_Detector_Model()
model_type='ensemble'
trained_ensemble_model = fdm_ensemble.train(train_dataset, train_datafile=None, model_type=model_type)
validated_ensemble_model = fdm_ensemble.validate(val_dataset, val_datafile=None, model_type=model_type)
tested_ensemble_model = fdm_ensemble.test(test_dataset, test_datafile=None, model_type=model_type)
print(f'The accuracy of {model_type} is {tested_ensemble_model[2]}')

The accuracy of ensemble is 0.8714978714978715


In [23]:
print(f'The accuracy of {model_type} is {tested_ensemble_model[2]}')

The accuracy of random_forest is 0.9955989955989956


In [None]:
fdm_rf = Fraud_Detector_Model()
model_type='random_forest'
trained_rf_model = fdm_rf.train(train_dataset, train_datafile=None, model_type=model_type)
validated_rf_model = fdm_rf.validate(val_dataset, val_datafile=None, model_type=model_type)
tested_rf_model = fdm_rf.test(test_df=test_dataset, test_datafile=None, model_type=model_type)
print(f'The accuracy of {model_type} is {tested_lr_model[2]}')

In [27]:
print(f'The accuracy of {model_type} is {tested_rf_model[2]}')

The accuracy of random_forest is 0.9956719956719957


In [39]:
from metrics import Metrics
rf_metrics = Metrics(model_name='random_forest', model_dir='None')
rf_metrics.generate_report(tested_rf_model[0], tested_rf_model[1])

Report generated and saved to random_forest_report_2024-03-22_17-33-52.txt


In [41]:
from metrics import Metrics
lr_metrics = Metrics(model_name='logistic_regression', model_dir='None')
lr_metrics.generate_report(tested_lr_model[0], tested_lr_model[1])

Report generated and saved to logistic_regression_report_2024-03-22_17-34-35.txt


In [43]:
from metrics import Metrics
ensemble_metrics = Metrics(model_name='ensemble', model_dir='None')
ensemble_metrics.generate_report(tested_ensemble_model[0], tested_ensemble_model[1])

Report generated and saved to ensemble_report_2024-03-22_17-41-18.txt


In [52]:
from model import Fraud_Detector_Model

In [57]:
rf_model = Fraud_Detector_Model()
determination = rf_model.infer_model('check_transaction.csv', 'random_forest')
print(determination)

No, It is a legitimate transaction


In [59]:
lr_model = Fraud_Detector_Model()
determination = lr_model.infer_model('check_transaction.csv', 'logistic_regression')
print(determination)

Yes, It is a fraudulent transaction, take immediate action


In [61]:
ensemble_model = Fraud_Detector_Model()
determination = ensemble_model.infer_model('check_transaction.csv', 'ensemble')
print(determination)

No, It is a legitimate transaction
