In [2]:
import pandas as pd
import numpy as np
import os
from sklearn.metrics import roc_auc_score
import pathlib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import resample
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib

In [3]:
path = "/data/mlproject22" if os.path.exists("/data/mlproject22") else "."
train_data = pd.read_csv(os.path.join(path, "transactions.csv.zip"))

# Split the dataset into features (X) and target (y)
X = train_data.drop(columns="Class")
y = train_data["Class"]

In [12]:
# CLASS WEIGHTS METHOD (high precision on frauds)

# Split the dataset into features (X) and target (y)
X = train_data.drop(columns="Class")
y = train_data["Class"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Perform data normalization using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Calculate class weights
class_weights = dict(1 / y_train.value_counts())

# Train Random Forest classifier with class weights
rf_classifier = RandomForestClassifier(random_state=123, class_weight=class_weights)
rf_classifier.fit(X_train_scaled, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_scaled)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC Score:", auc_roc)

# Print classification report
print(classification_report(y_test, y_pred))

# Save the trained model and scaler
joblib.dump(rf_classifier, "random_forest_params.pkl")
joblib.dump(scaler, "random_forest_scaler.pkl")


AUC-ROC Score: 0.873395738665613
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     45490
           1       0.97      0.75      0.84        79

    accuracy                           1.00     45569
   macro avg       0.98      0.87      0.92     45569
weighted avg       1.00      1.00      1.00     45569



['random_forest_scaler.pkl']

In [13]:
import joblib
def leader_board_predict_fn(values):
    
    # Load the trained model parameters
    rf_classifier = joblib.load("/home/csaz7668/random_forest_params.pkl")
    scaler = joblib.load("/home/csaz7668/random_forest_scaler.pkl")
    X = values
    X_scaled = scaler.transform(X)

    # Predict the likelihood of fraud (decision function values) for each transaction
    decision_function_values = rf_classifier.predict_proba(X_scaled)[:, 1]  # Get the probability of the positive class
    print(decision_function_values)
    return decision_function_values

In [29]:
def get_score():
    """
    Function to compute scores for train and test datasets.
    """

    import pandas as pd
    import numpy as np
    import os
    from sklearn.metrics import roc_auc_score
    import pathlib

    try:
        path = "/data/mlproject22" if os.path.exists("/data/mlproject22") else "."
        test_data = pd.read_csv(os.path.join(path, "transactions.csv.zip"))
        X_test = test_data.drop(columns = "Class")
        y_test = test_data["Class"]
        print("Shape of X_test:", X_test.shape)
        print("Shape of y_test:", y_test.shape)
        decision_function_values = leader_board_predict_fn(X_test)
        assert decision_function_values.shape == (X_test.shape[0],)
        dataset_score = roc_auc_score(y_test, decision_function_values)
        assert dataset_score >= 0.0 and dataset_score <= 1.0
    except Exception as e:
        print("Error:", str(e))
        dataset_score = float("nan")
    print(f"Train Dataset Score: {dataset_score}")

    import os
    import pwd
    import time
    import datetime
    import pandas as pd
    user_id = pwd.getpwuid( os.getuid() ).pw_name
    curtime = time.time()
    dt_now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

    try:
        HIDDEN_DATASET_PATH = os.path.expanduser("/data/mlproject22-test-data")
        test_data = pd.read_csv(os.path.join(HIDDEN_DATASET_PATH,"transactions_scoreboard.csv.zip"))
        X_test = test_data.drop(columns=["Class"])
        y_test = test_data["Class"]
        decision_function_values = leader_board_predict_fn(X_test)
        hiddendataset_score = roc_auc_score(y_test, decision_function_values)
        print(f"Test Dataset Score: {hiddendataset_score}")
        score_dict = dict(
            score_hidden=hiddendataset_score,
            score_train=dataset_score,
            unixtime=curtime,
            user=user_id,
            dt=dt_now,
            comment="",
        )
    except Exception as e:
        err = str(e)
        score_dict = dict(
            score_hidden=float("nan"),
            score_train=dataset_score,
            unixtime=curtime,
            user=user_id,
            dt=dt_now,
            comment=err
        )

    #if list(pathlib.Path(os.getcwd()).parents)[0].name == 'source':
    #    print("we are in the source directory... replacing values.")
    #    print(pd.DataFrame([score_dict]))
    #    score_dict["score_hidden"] = -1
    #    score_dict["score_train"] = -1
    #    print("new values:")
    #    print(pd.DataFrame([score_dict]))

    pd.DataFrame([score_dict]).to_csv("transactions_test.csv", index=False)
    
get_score()

Shape of X_test: (227845, 30)
Shape of y_test: (227845,)
[0. 0. 0. ... 0. 0. 0.]
Train Dataset Score: 0.9935252189198022
[0. 0. 0. ... 0. 0. 0.]
Test Dataset Score: 0.9527033889667291


In [92]:
# UNDERSAMPLING NON-FRAUDULENT TRANSACTIONS METHOD (low precision on frauds)
'''
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Perform data normalization using StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Undersample the majority class
X_train_resampled, y_train_resampled = resample(X_train_scaled[y_train == 0], y_train[y_train == 0],
                                               n_samples=np.sum(y_train == 1), random_state=42)

# Concatenate the minority class with the undersampled majority class
X_train_resampled = np.concatenate((X_train_scaled[y_train == 1], X_train_resampled), axis=0)
y_train_resampled = np.concatenate((y_train[y_train == 1], y_train_resampled), axis=0)

# Train a Random Forest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test_scaled)

# Calculate the AUC-ROC score
auc_roc = roc_auc_score(y_test, y_pred)
print("AUC-ROC Score:", auc_roc)

# Save parameters & scaler
# joblib.dump(rf_classifier, "random_forest_params.pkl")
# joblib.dump(scaler, "random_forest_scaler.pkl")

print(classification_report(y_test, y_pred))
'''

AUC-ROC Score: 0.9524828102434532
              precision    recall  f1-score   support

           0       1.00      0.97      0.98     45490
           1       0.05      0.94      0.09        79

    accuracy                           0.97     45569
   macro avg       0.52      0.95      0.54     45569
weighted avg       1.00      0.97      0.98     45569



In [26]:
print(train_data.shape[0])

227845


In [16]:
print(class_weights)

{0: 5.495683140892829e-06, 1: 0.0031746031746031746}


In [17]:
print(y_train.value_counts())

0    181961
1       315
Name: Class, dtype: int64


In [27]:
fraud_indices = y_test[y_test == 1].index
non_fraud_indices = y_test[y_test == 0].index

#plt.scatter(fraud_indices, y_test[y_test == 1], c='blue', label='Actual Fraud')
#plt.scatter(non_fraud_indices, y_test[y_test == 0], c='orange', label='Actual Non-Fraud')

print(y_pred[y_test == 0].sum())
print(y_pred[y_test == 1].sum())

#plt.scatter(non_fraud_indices, y_pred[y_test == 0], c='green', label='Predicted Non-Fraud')
plt.scatter(fraud_indices, y_pred[y_test == 1], c='red', label='Predicted Fraud')

plt.xlabel('Data Point')
plt.ylabel('Class (0: Non-Fraud, 1: Fraud)')
plt.title('Binary Classification Results')
plt.legend()
plt.show()
print(len(y_test))


2
59


NameError: name 'plt' is not defined

In [20]:
predicted_index = []
for i in range(len(y_pred)):
    if(y_pred[i] == 1):
        predicted_index.append(i)
print(predicted_index)
print("List has a length of:", len(predicted_index))

[485, 1165, 1249, 1927, 3134, 3555, 3887, 4707, 4724, 4885, 4897, 5149, 6248, 6771, 8512, 8718, 8788, 9321, 9628, 10361, 11062, 11652, 11855, 12046, 12376, 13627, 14217, 14446, 16067, 16158, 16487, 16719, 16959, 17750, 18694, 19778, 20431, 20968, 22537, 23085, 23420, 23895, 25370, 25432, 26090, 26894, 27424, 28269, 30956, 32349, 33638, 34035, 34098, 34440, 36131, 37255, 38665, 39195, 39978, 42833, 44057]
List has a length of: 61


In [21]:
found = 0
false_values = 0
frauds_as_non_frauds = 0
comparision_indices = 0
for i in range(len(y_test)):
    if(y_test.iloc[i] == 1 and i in predicted_index):
        found += 1
    if(y_test.iloc[i] == 0 and i in predicted_index):
        false_values += 1
    if(y_test.iloc[i] == 1 and i not in predicted_index):
        frauds_as_non_frauds += 1
        
print("Correct frauds detected by the model:", found)
print("Non-frauds predicted as frauds by the model:", false_values)
print("Frauds predicted as non-frauds by the model:", frauds_as_non_frauds)
print("Total frauds in the test dataset:", y_test[y_test == 1].size)

Correct frauds detected by the model: 59
Non-frauds predicted as frauds by the model: 2
Frauds predicted as non-frauds by the model: 20
Total frauds in the test dataset: 79


In [22]:
tp, tn, fp, fn = 0, 0, 0, 0
for t, p in zip(y_test, y_pred):
    if(t == 1 and p == 1):
        tp += 1
    if(t == 0 and p == 0):
        tn += 1
    if(t == 0 and p == 1):
        fp += 1
    if(t == 1 and p == 0):
        fn += 1
print("True positive values:", tp)
print("True negative values:", tn)
print("False positive values:", fp)
print("False negative values:", fn)
print("Total sum:", tp + tn + fp + fn)
print("Accuracy:", (tp + tn) / (tp + tn + fp + fn))

True positive values: 59
True negative values: 45488
False positive values: 2
False negative values: 20
Total sum: 45569
Accuracy: 0.9995172156509908


In [28]:
print(y.value_counts())

0    227451
1       394
Name: Class, dtype: int64
