In [92]:
import json
import os
from copy import deepcopy

import kagglehub
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [93]:
# Download the Kaggle dataset and read creditcard.csv from the download directory.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
csv_file = f"{path}/creditcard.csv"
df = pd.read_csv(csv_file)

In [None]:
# Summary statistics of the columns as loaded from the CSV.
df.describe()

In [95]:
# Drop the "Time" column and standardize every feature column (zero mean, unit
# variance); the "Class" target is left untouched.
# errors="ignore" keeps the cell re-runnable once "Time" has already been removed.
df = df.drop(columns="Time", errors="ignore")
feature_columns = [col for col in df.columns if col != "Class"]

# A single fit over all features: StandardScaler works column-wise, so the scaled
# values match a per-column fit, and `scaler` ends up holding the statistics of
# every feature instead of only those of the last column processed.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test rows influence the scaling statistics — confirm this is acceptable.
scaler = StandardScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

In [None]:
# Summary statistics again, to inspect the frame at this point of the notebook.
df.describe()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [98]:
# Separate the target column from the feature columns.
y = df["Class"]
X = df.drop(columns="Class")

### Undersampling

In [99]:
# Number of majority-class samples to keep for every minority-class sample.
desired_ratio = 256

# The least / most frequent label values and the size of the smaller class.
class_counts = y.value_counts()
minority_class, majority_class = class_counts.idxmin(), class_counts.idxmax()
minority_count = class_counts.min()
majority_count = desired_ratio * minority_count

# Target number of samples per class, keyed by class label: the minority class
# keeps all of its samples, the majority class gets desired_ratio times as many.
sampling_strategy = {}
sampling_strategy[minority_class] = minority_count
sampling_strategy[majority_class] = majority_count

In [100]:
# Undersample to the per-class counts given in sampling_strategy (seeded).
rus = RandomUnderSampler(
    random_state=17,
    sampling_strategy=sampling_strategy,
)
resampled = rus.fit_resample(X, y)
X_resampled, y_resampled = resampled

In [101]:
# Hold out 20% of the resampled data as the test set (seeded split).
# NOTE(review): the split is not stratified, so the class ratio of the two parts
# can differ — the next cell prints both ratios for checking.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    test_size=0.2,
    random_state=17,
)

In [None]:

def _zeros_per_one(labels):
    """Return (non-null label count - label sum) / label sum.

    For 0/1 labels this is the number of 0 labels per 1 label.
    """
    ones = np.sum(labels)
    return (labels.count() - ones) / ones


print(f"Imbalance ratio in training set: {_zeros_per_one(y_train)}")
print(f"Class proportion in testing set: {_zeros_per_one(y_test)}")

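Create a small initial labeled training set that contains both classes (`desired_ratio` majority-class samples and one minority-class sample); the selected rows are removed from the training pool: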

In [None]:
# Build the initial labeled set: the first `desired_ratio` class-0 rows and the first
# class-1 row of the training split (in split order). The selected rows are then
# removed from X_train / y_train, which from here on hold only the unlabeled pool.
# NOTE: this cell mutates X_train / y_train; re-running it without re-running the
# train/test split carves a further seed set out of the already reduced pool.
rng = np.random.RandomState(42)  # not used by the selection below; kept so the name stays defined

is_class_0 = y_train == 0
is_class_1 = y_train == 1
# cumsum() numbers the rows of each class in order of appearance, so "<= k" keeps
# exactly the first k rows of that class.
seed_mask = (is_class_0 & (is_class_0.cumsum() <= desired_ratio)) | (
    is_class_1 & (is_class_1.cumsum() <= 1)
)
chosen_indexes = y_train.index[seed_mask].tolist()
chosen_0_class = int((is_class_0 & seed_mask).sum())
chosen_1_class = int((is_class_1 & seed_mask).sum())

# Slicing once (instead of growing an empty Series/DataFrame with pd.concat row by
# row) keeps the original dtypes without relying on how concat treats empty inputs,
# and avoids copying the whole training pool for every selected row.
labeled_X = X_train.loc[chosen_indexes]
labeled_y = y_train.loc[chosen_indexes]
X_train = X_train.drop(index=chosen_indexes)
y_train = y_train.drop(index=chosen_indexes)
    
    

# while not (labeled_y_sum > 0 and len(labeled_y) != labeled_y_sum):
#     random_index = rng.choice(y_train.index.tolist())
#     labeled_y = pd.concat([labeled_y, y_train.loc[[random_index]]])
#     y_train = y_train.drop(random_index)
#     labeled_X = pd.concat([labeled_X, X_train.loc[[random_index]]])
#     X_train = X_train.drop(random_index)
#     labeled_y_sum = np.sum(labeled_y)


In [None]:
# Display the labels of the initial labeled set.
labeled_y

In [103]:
def get_one_class_probability(binary_proba):
    """Return entry 1 of every row, rounded to four decimal places.

    Parameters
    ----------
    binary_proba : iterable of indexable rows
        Rows with at least two entries; in this notebook, the output of
        ``predict_proba`` of a two-class model.

    Returns
    -------
    list
        One rounded value per input row, in input order.
    """
    return [round(row[1], 4) for row in binary_proba]

In [97]:
# Independent deep copies of the initial labeled set and of the unlabeled pool,
# one set for the active-learning runs and one for the passive (random) runs.
labeled_X_active, labeled_y_active = deepcopy(labeled_X), deepcopy(labeled_y)
labeled_X_passive, labeled_y_passive = deepcopy(labeled_X), deepcopy(labeled_y)

unlabeled_X_active, unlabeled_y_active = deepcopy(X_train), deepcopy(y_train)
unlabeled_X_passive, unlabeled_y_passive = deepcopy(X_train), deepcopy(y_train)


Active Learning


In [None]:
# Active learning with an RBF SVC: in every iteration the model is refitted on the
# labeled set, its test-set predictions are recorded, and the pool sample with the
# smallest gap between the two predicted class probabilities is moved to the labeled set.
ITERS = 300

# Start from a fresh copy of the initial labeled set and the unlabeled pool, so the
# run does not depend on which experiment cells were executed (or re-executed) before.
labeled_X_active, labeled_y_active = labeled_X.copy(), labeled_y.copy()
unlabeled_X_active, unlabeled_y_active = X_train.copy(), y_train.copy()

results = {
    "training set size": len(X_train),
    "count of minority class members": int(np.sum(y_train)),
    "true labels": [int(i) for i in y_test.values],
    "models": []}
# random_state makes the probability estimates (probability=True) repeatable.
active_model = SVC(kernel="rbf", probability=True, cache_size=1000, random_state=42)
for i in range(ITERS):
    active_model.fit(labeled_X_active, labeled_y_active)

    # Snapshot of the current model: labeled-set size, test predictions, and the
    # number of class-1 samples in the labeled set.
    test_proba = active_model.predict_proba(X_test)
    one_class_proba = get_one_class_probability(test_proba)
    model = {"training instances": len(labeled_y_active),
             "predictions": one_class_proba,
             # int(): Series.sum() returns a NumPy scalar, which json cannot encode
             "minority_class_examples_used": int(labeled_y_active.sum())}
    results["models"].append(model)

    # Query step: the smaller |p0 - p1|, the less certain the model is.
    probabilities = active_model.predict_proba(unlabeled_X_active)
    margin = np.abs(probabilities[:, 0] - probabilities[:, 1])
    least_confident_index = np.argmin(margin)  # position within the pool
    df_index_number = unlabeled_X_active.index[least_confident_index]  # index label

    # iloc[[pos]] keeps the row as a one-row frame with its original column dtypes.
    labeled_X_active = pd.concat(
        [labeled_X_active, unlabeled_X_active.iloc[[least_confident_index]]])
    labeled_y_active = pd.concat([labeled_y_active, unlabeled_y_active.loc[[df_index_number]]])
    unlabeled_X_active = unlabeled_X_active.drop(df_index_number)
    unlabeled_y_active = unlabeled_y_active.drop(df_index_number)
    print(f"{i}/{ITERS-1}")

In [126]:
# Write `results` as JSON. The output directory is created first: open() does not
# create missing directories and would otherwise fail after the long-running loop.
output_path = f"one_positive_class/rbf_active_learning_IR{desired_ratio}.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4, separators=(',', ': '))

In [127]:
# Passive (random-sampling) baseline with an RBF SVC: in every iteration the model is
# refitted on the labeled set, its test-set predictions are recorded, and one randomly
# drawn pool sample is moved to the labeled set.
ITERS = 300

# Start from a fresh copy of the initial labeled set and the unlabeled pool, so the
# run does not depend on which experiment cells were executed (or re-executed) before.
labeled_X_passive, labeled_y_passive = labeled_X.copy(), labeled_y.copy()
unlabeled_X_passive, unlabeled_y_passive = X_train.copy(), y_train.copy()
# Dedicated seeded generator so the sequence of random draws is repeatable.
sampling_rng = np.random.RandomState(42)

results = {
    "training set size": len(X_train),
    "count of minority class members": int(np.sum(y_train)),
    "true labels": [int(i) for i in y_test.values],
    "models": []}
# The variable name `active_model` is kept from the original cell although this is
# the passive run. random_state makes the probability estimates repeatable.
active_model = SVC(kernel="rbf", probability=True, cache_size=1000, random_state=42)
for i in range(ITERS):
    active_model.fit(labeled_X_passive, labeled_y_passive)

    # Snapshot of the current model: labeled-set size, test predictions, and the
    # number of class-1 samples in the labeled set.
    test_proba = active_model.predict_proba(X_test)
    one_class_proba = get_one_class_probability(test_proba)
    model = {"training instances": len(labeled_y_passive),
             "predictions": one_class_proba,
             # int(): Series.sum() returns a NumPy scalar, which json cannot encode
             "minority_class_examples_used": int(labeled_y_passive.sum())}
    results["models"].append(model)

    # Move one randomly chosen pool sample (features and label) to the labeled set.
    random_record = unlabeled_X_passive.sample(n=1, random_state=sampling_rng)
    labeled_X_passive = pd.concat([labeled_X_passive, random_record])
    labeled_y_passive = pd.concat([labeled_y_passive, unlabeled_y_passive.loc[random_record.index]])

    unlabeled_X_passive = unlabeled_X_passive.drop(random_record.index)
    unlabeled_y_passive = unlabeled_y_passive.drop(random_record.index)

    print(f"{i}/{ITERS-1}")

In [128]:
# Write `results` as JSON. The output directory is created first: open() does not
# create missing directories and would otherwise fail after the long-running loop.
output_path = f"one_positive_class/rbf_passive_learning_IR{desired_ratio}.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4, separators=(',', ': '))

Active learning for NN

In [None]:
# Active learning with an MLP: in every iteration the model is refitted on the
# labeled set, its test-set predictions are recorded, and the pool sample with the
# smallest gap between the two predicted class probabilities is moved to the labeled set.
ITERS = 300

# Start from a fresh copy of the initial labeled set and the unlabeled pool. Without
# this reset the cell would continue from whatever an earlier experiment cell left in
# the *_active variables instead of starting from the initial labeled set.
labeled_X_active, labeled_y_active = labeled_X.copy(), labeled_y.copy()
unlabeled_X_active, unlabeled_y_active = X_train.copy(), y_train.copy()

results = {
    "training set size": len(X_train),
    "count of minority class members": int(np.sum(y_train)),
    "true labels": [int(i) for i in y_test.values],
    "models": []
}

active_model = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000, random_state=42)

for i in range(ITERS):
    active_model.fit(labeled_X_active, labeled_y_active)

    # Snapshot of the current model: labeled-set size, test predictions, and the
    # number of class-1 samples in the labeled set.
    test_proba = active_model.predict_proba(X_test)
    one_class_proba = get_one_class_probability(test_proba)
    model = {"training instances": len(labeled_y_active),
             "predictions": one_class_proba,
             # int(): Series.sum() returns a NumPy scalar, which json cannot encode
             "minority_class_examples_used": int(labeled_y_active.sum())}
    results["models"].append(model)

    # Query step: the smaller |p0 - p1|, the less certain the model is.
    probabilities = active_model.predict_proba(unlabeled_X_active)
    margin = np.abs(probabilities[:, 0] - probabilities[:, 1])
    least_confident_index = np.argmin(margin)  # position within the pool
    df_index_number = unlabeled_X_active.index[least_confident_index]  # index label

    # iloc[[pos]] keeps the row as a one-row frame with its original column dtypes.
    labeled_X_active = pd.concat(
        [labeled_X_active, unlabeled_X_active.iloc[[least_confident_index]]])
    labeled_y_active = pd.concat([labeled_y_active, unlabeled_y_active.loc[[df_index_number]]])
    unlabeled_X_active = unlabeled_X_active.drop(df_index_number)
    unlabeled_y_active = unlabeled_y_active.drop(df_index_number)
    print(f"{i}/{ITERS-1}")

In [17]:
# Write `results` as JSON. The output directory is created first: open() does not
# create missing directories and would otherwise fail after the long-running loop.
output_path = f"one_positive_class_nn/active_learning_IR{desired_ratio}.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4, separators=(',', ': '))

In [None]:
# Passive (random-sampling) baseline with an MLP: in every iteration the model is
# refitted on the labeled set, its test-set predictions are recorded, and one randomly
# drawn pool sample is moved to the labeled set.
ITERS = 300

# Start from a fresh copy of the initial labeled set and the unlabeled pool. Without
# this reset the cell would continue from whatever an earlier experiment cell left in
# the *_passive variables instead of starting from the initial labeled set.
labeled_X_passive, labeled_y_passive = labeled_X.copy(), labeled_y.copy()
unlabeled_X_passive, unlabeled_y_passive = X_train.copy(), y_train.copy()
# Dedicated seeded generator so the sequence of random draws is repeatable.
sampling_rng = np.random.RandomState(42)

results = {
    "training set size": len(X_train),
    "count of minority class members": int(np.sum(y_train)),
    "true labels": [int(i) for i in y_test.values],
    "models": []
}

# The variable name `active_model` is kept from the original cell although this is
# the passive run.
active_model = MLPClassifier(
    hidden_layer_sizes=(10, 10), max_iter=1000, random_state=42)

for i in range(ITERS):
    active_model.fit(labeled_X_passive, labeled_y_passive)

    # Snapshot of the current model: labeled-set size, test predictions, and the
    # number of class-1 samples in the labeled set.
    test_proba = active_model.predict_proba(X_test)
    one_class_proba = get_one_class_probability(test_proba)
    model = {"training instances": len(labeled_y_passive),
             "predictions": one_class_proba,
             # int(): Series.sum() returns a NumPy scalar, which json cannot encode
             "minority_class_examples_used": int(labeled_y_passive.sum())}
    results["models"].append(model)

    # Move one randomly chosen pool sample (features and label) to the labeled set.
    random_record = unlabeled_X_passive.sample(n=1, random_state=sampling_rng)
    labeled_X_passive = pd.concat([labeled_X_passive, random_record])
    labeled_y_passive = pd.concat([labeled_y_passive, unlabeled_y_passive.loc[random_record.index]])

    unlabeled_X_passive = unlabeled_X_passive.drop(random_record.index)
    unlabeled_y_passive = unlabeled_y_passive.drop(random_record.index)

    print(f"{i}/{ITERS-1}")

In [19]:
# Write `results` as JSON. The output directory is created first: open() does not
# create missing directories and would otherwise fail after the long-running loop.
output_path = f"one_positive_class_nn/passive_learning_IR{desired_ratio}.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4, separators=(',', ': '))

In [104]:
# Baseline: fit the MLP once on all of X_train / y_train, record its test-set
# predictions, and write them to disk.
# NOTE(review): X_train / y_train no longer contain the rows that were moved into the
# initial labeled set earlier in the notebook, so this "full" training excludes
# them — confirm this is intended.
passive_model = MLPClassifier(
    hidden_layer_sizes=(10, 10), max_iter=1000, random_state=42)
passive_model.fit(X_train, y_train)

test_proba = passive_model.predict_proba(X_test)
one_class_proba = get_one_class_probability(test_proba)
# Sum of the training labels; stored under two keys of the result record.
minority_in_training = int(np.sum(y_train))
results = {
    "training_set_size": len(X_train),
    "test_set_size": len(X_test),
    "minority_class_count": minority_in_training,
    "true labels": [int(i) for i in y_test.values],
    "model": {
        "predictions": one_class_proba,
        "minority_class_examples_used": minority_in_training
    }
}

# Create the output directory first: open() does not create missing directories.
output_path = f"full_training/nn_passive_learning_IR{desired_ratio}.json"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4, separators=(',', ': '))