In [1]:
import json
from pathlib import Path

import kagglehub
import numpy as np
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [2]:
# Download the Kaggle credit-card fraud dataset; the returned value is used as
# the directory that contains the CSV file loaded into `df`.
path = kagglehub.dataset_download("mlg-ulb/creditcardfraud")
df = pd.read_csv(f"{path}/creditcard.csv")

In [None]:
# Summary statistics of every column as loaded, before "Time" is dropped and
# the features are standardised.
df.describe()

In [4]:
# Drop the "Time" column. errors="ignore" keeps the cell re-runnable: `df` is
# overwritten here, so on a second execution "Time" is already gone.
df = df.drop(columns="Time", errors="ignore")

# Standardise every column except the "Class" label in one fit, so `scaler`
# ends up holding the statistics of all features rather than only those of the
# last column it was refitted on.
# NOTE(review): the scaler is fitted on the whole frame before the train/test
# split further down, so test rows contribute to the scaling statistics.
feature_columns = [col for col in df.columns if col != "Class"]
scaler = StandardScaler()
df[feature_columns] = scaler.fit_transform(df[feature_columns])

In [None]:
# Summary statistics again, now that "Time" is dropped and the feature columns
# have been passed through StandardScaler.
df.describe()

In [None]:
# Print the frame's column dtypes and non-null counts.
df.info()

In [7]:
# Separate the "Class" target column from the remaining feature columns.
y = df["Class"]
X = df.drop(columns="Class")

### Undersampling

In [8]:
# Number of majority-class samples to keep per minority-class sample.
desired_ratio = 2

# Find the rarer and the more frequent label, and how often the rarer occurs.
class_counts = y.value_counts()
minority_class, majority_class = class_counts.idxmin(), class_counts.idxmax()
minority_count = class_counts.min()

# Target class sizes for the under-sampler: all minority samples and
# `desired_ratio` times as many majority samples.
majority_count = minority_count * desired_ratio
sampling_strategy = dict(
    [
        (minority_class, minority_count),
        (majority_class, majority_count),
    ]
)

In [9]:
# Seeded random under-sampling down to the class sizes in `sampling_strategy`.
rus = RandomUnderSampler(
    random_state=17,
    sampling_strategy=sampling_strategy,
)
X_resampled, y_resampled = rus.fit_resample(X, y)

In [10]:
# Hold out 20% of the under-sampled data as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    random_state=17,
    test_size=0.2,
)

In [None]:

# Report, for each split, (samples not counted by the label sum) / (label sum).
# With 0/1 labels this is the number of 0-labelled samples per 1-labelled one.
# Both splits report the same quantity, so both lines use the same wording.
for split_name, split_labels in (("training", y_train), ("testing", y_test)):
    positives = np.sum(split_labels)
    negatives = split_labels.count() - positives
    print(f"Imbalance ratio in {split_name} set: {negatives / positives}")

Create a small initial labeled set that contains both classes (the selected rows are removed from the training pool):

In [None]:
# Not used by the deterministic selection below; kept as a notebook-level name.
rng = np.random.RandomState(42)

# Walk the training labels in index order and remember the first
# `desired_ratio` rows of class 0 and the first row of class 1.
chosen_indexes = []
chosen_0_class = 0
chosen_1_class = 0
for i, class_value in y_train.items():
    if class_value == 0 and chosen_0_class < desired_ratio:
        chosen_0_class += 1
    elif class_value == 1 and chosen_1_class < 1:
        chosen_1_class += 1
    else:
        continue
    chosen_indexes.append(i)
    # Stop as soon as both quotas are filled.
    if chosen_0_class == desired_ratio and chosen_1_class == 1:
        break

assert chosen_0_class == desired_ratio and chosen_1_class == 1, (
    "training split does not contain enough samples of both classes "
    "to seed the labeled set"
)

# Build the seed set with one slice instead of growing it row by row; slicing
# also keeps the dtypes of X_train / y_train instead of depending on how an
# empty, untyped Series concatenates.
labeled_X = X_train.loc[chosen_indexes]
labeled_y = y_train.loc[chosen_indexes]

# The seed rows leave the pool so they cannot be queried again later.
X_train = X_train.drop(index=chosen_indexes)
y_train = y_train.drop(index=chosen_indexes)


In [13]:
def get_one_class_probability(binary_proba):
    """Return element ``[1]`` of every row, rounded to four decimals.

    Parameters
    ----------
    binary_proba : iterable of indexable rows
        One row per sample. The callers in this notebook pass the output of
        ``predict_proba``.

    Returns
    -------
    list
        ``round(row[1], 4)`` for each row, in input order.
    """
    return [round(pair[1], 4) for pair in binary_proba]

In [19]:
def active_learning(labeled_X, labeled_y, X_train, y_train, model, iterations, save_path):
    """Run pool-based active learning and store the per-iteration results as JSON.

    In every iteration ``model`` is refitted on the current labeled set and its
    probabilities on the notebook-level ``X_test`` are recorded. Then the pool
    sample with the smallest absolute difference between its two predicted
    class probabilities is moved, together with its label, from the pool into
    the labeled set.

    Parameters
    ----------
    labeled_X : pandas.DataFrame
        Features of the initially labeled samples.
    labeled_y : pandas.Series
        Labels of the initially labeled samples.
    X_train : pandas.DataFrame
        Pool of samples that can be queried.
    y_train : pandas.Series
        Labels of the pool, looked up by the index labels of ``X_train``.
    model : estimator
        Object providing ``fit(X, y)`` and ``predict_proba(X)``; columns 0 and 1
        of the probabilities are used. It is refitted in place.
    iterations : int
        Maximum number of fit/record rounds.
    save_path : str or path-like
        JSON file to write; missing parent directories are created.

    Returns
    -------
    dict
        The structure that is also written to ``save_path``.

    Notes
    -----
    Reads the notebook-level ``X_test`` / ``y_test`` and calls
    ``get_one_class_probability``. The frames passed in are not modified; only
    the local names are rebound.
    """
    # Create the output folder before training, so a missing directory cannot
    # make the final write fail after all iterations have been computed.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    # Result structure
    results = {
        "training_set_size": len(X_train),
        "test_set_size": len(X_test),
        "count_of_minority_class_members": int(y_train.sum()),
        "true_labels": [int(i) for i in y_test.values],
        "models": []
    }

    # Main learning loop
    for i in range(iterations):
        model.fit(labeled_X, labeled_y)

        test_proba = model.predict_proba(X_test)
        results["models"].append({
            "training instances": len(labeled_y),
            "predictions": get_one_class_probability(test_proba),
            # int(...) keeps the value a plain integer, like the counts above.
            "minority_class_examples_used": int(labeled_y.sum()),
        })

        # Nothing left to query: stop instead of failing on an empty pool.
        if len(X_train) == 0:
            print(f"{i}/{iterations-1}")
            break

        # The smaller |p0 - p1|, the less decided the model is about a sample.
        probabilities = model.predict_proba(X_train)
        margin = np.abs(probabilities[:, 0] - probabilities[:, 1])
        query_position = int(np.argmin(margin))
        query_label = X_train.index[query_position]

        # iloc[[...]] returns a one-row frame, so the column dtypes are kept.
        labeled_X = pd.concat([labeled_X, X_train.iloc[[query_position]]])
        labeled_y = pd.concat([labeled_y, y_train.loc[[query_label]]])
        X_train = X_train.drop(query_label)
        y_train = y_train.drop(query_label)
        print(f"{i}/{iterations-1}")

    # Save to file
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    return results

In [21]:
def passive_learning(labeled_X, labeled_y, X_train, y_train, model, iterations, save_path,
                     random_state=None):
    """Run the random-sampling baseline and store the per-iteration results as JSON.

    In every iteration ``model`` is refitted on the current labeled set and its
    probabilities on the notebook-level ``X_test`` are recorded. Then one
    randomly drawn pool sample is moved, together with its label, from the pool
    into the labeled set. The draw happens inside the loop, so each iteration
    trains on one more sample than the previous one.

    Parameters
    ----------
    labeled_X : pandas.DataFrame
        Features of the initially labeled samples.
    labeled_y : pandas.Series
        Labels of the initially labeled samples.
    X_train : pandas.DataFrame
        Pool of samples to draw from.
    y_train : pandas.Series
        Labels of the pool, looked up by the index labels of ``X_train``.
    model : estimator
        Object providing ``fit(X, y)`` and ``predict_proba(X)``. It is refitted
        in place.
    iterations : int
        Maximum number of fit/record rounds.
    save_path : str or path-like
        JSON file to write; missing parent directories are created.
    random_state : int, numpy random state or None, optional
        Seed or generator for the random draws. An integer is turned into one
        ``numpy.random.RandomState`` shared by all draws; any other value
        (including the default ``None``) is handed to ``DataFrame.sample``
        unchanged.

    Returns
    -------
    dict
        The structure that is also written to ``save_path``.

    Notes
    -----
    Reads the notebook-level ``X_test`` / ``y_test`` and calls
    ``get_one_class_probability``. The frames passed in are not modified; only
    the local names are rebound.
    """
    # Create the output folder before training, so a missing directory cannot
    # make the final write fail after all iterations have been computed.
    Path(save_path).parent.mkdir(parents=True, exist_ok=True)

    # A bare integer seed would restart the generator on every draw, so it is
    # wrapped once and the same generator is reused across iterations.
    if isinstance(random_state, (int, np.integer)):
        sampler_state = np.random.RandomState(random_state)
    else:
        sampler_state = random_state

    # Result structure
    results = {
        "training_set_size": len(X_train),
        "test_set_size": len(X_test),
        "count_of_minority_class_members": int(y_train.sum()),
        "true_labels": [int(i) for i in y_test.values],
        "models": []
    }

    # Main learning loop
    for i in range(iterations):
        model.fit(labeled_X, labeled_y)

        test_proba = model.predict_proba(X_test)
        results["models"].append({
            "training instances": len(labeled_y),
            "predictions": get_one_class_probability(test_proba),
            # int(...) keeps the value a plain integer, like the counts above.
            "minority_class_examples_used": int(labeled_y.sum()),
        })

        # Nothing left to draw: stop instead of failing on an empty pool.
        if len(X_train) == 0:
            print(f"{i}/{iterations-1}")
            break

        # Move one random pool row (and its label) into the labeled set.
        random_record = X_train.sample(n=1, random_state=sampler_state)
        labeled_X = pd.concat([labeled_X, random_record])
        labeled_y = pd.concat([labeled_y, y_train.loc[random_record.index]])

        X_train = X_train.drop(random_record.index)
        y_train = y_train.drop(random_record.index)

        print(f"{i}/{iterations-1}")

    # Save to file
    with open(save_path, "w", encoding="utf-8") as f:
        json.dump(results, f, indent=4, ensure_ascii=False)

    return results

Active Learning for SVM


In [None]:
# With probability=True, SVC shuffles data internally to produce its
# probability estimates; seeding it makes the recorded predictions
# reproducible, in line with the seeded MLP runs in this notebook.
active_learning(
    labeled_X=labeled_X,
    labeled_y=labeled_y,
    X_train=X_train,
    y_train=y_train,
    model=SVC(kernel="rbf", probability=True, cache_size=1000, random_state=42),
    iterations=300,
    save_path=f"one_positive_class/rbf_active_learning_IR{desired_ratio}.json"
)

Passive learning for SVM

In [None]:
# With probability=True, SVC shuffles data internally to produce its
# probability estimates; seeding it makes the recorded predictions
# reproducible, in line with the seeded MLP runs in this notebook.
passive_learning(
    labeled_X=labeled_X,
    labeled_y=labeled_y,
    X_train=X_train,
    y_train=y_train,
    model=SVC(kernel="rbf", probability=True, cache_size=1000, random_state=42),
    iterations=300,
    save_path=f"one_positive_class/rbf_passive_learning_IR{desired_ratio}.json"
)

Active learning for NN

In [None]:
# Active-learning run with a seeded MLP that has two hidden layers of 10 units.
active_learning(
    labeled_X=labeled_X,
    labeled_y=labeled_y,
    X_train=X_train,
    y_train=y_train,
    iterations=300,
    model=MLPClassifier(
        random_state=42,
        max_iter=1000,
        hidden_layer_sizes=(10, 10),
    ),
    save_path=f"one_positive_class_nn/active_learning_IR{desired_ratio}.json",
)

Passive learning for NN

In [None]:
# Passive-learning run with the same seeded MLP configuration.
passive_learning(
    labeled_X=labeled_X,
    labeled_y=labeled_y,
    X_train=X_train,
    y_train=y_train,
    iterations=300,
    model=MLPClassifier(
        random_state=42,
        max_iter=1000,
        hidden_layer_sizes=(10, 10),
    ),
    save_path=f"one_positive_class_nn/passive_learning_IR{desired_ratio}.json",
)

In [None]:
# Reference model: the same MLP configuration, fitted once on the whole pool.
# NOTE(review): X_train / y_train no longer contain the seed rows that were
# moved into labeled_X / labeled_y earlier in the notebook; confirm whether
# they should be part of this "full" training set.
passive_model = MLPClassifier(
    hidden_layer_sizes=(10, 10), max_iter=1000, random_state=42)
passive_model.fit(X_train, y_train)

test_proba = passive_model.predict_proba(X_test)
one_class_proba = get_one_class_probability(test_proba)

# NOTE(review): the keys "minority_class_count", "true labels" and "model"
# differ from the ones written by active_learning / passive_learning, and the
# file name says "rbf" although the model is an MLP. Both are left unchanged
# because code reading these files may depend on them.
results = {
        "training_set_size": len(X_train),
        "test_set_size": len(X_test),
        "minority_class_count": int(np.sum(y_train)),
        "true labels": [int(i) for i in y_test.values],
        "model": {
            "predictions": one_class_proba,
            "minority_class_examples_used": int(np.sum(y_train))
        }
    }

# Create the target folder if it is missing, so the write cannot fail after
# the model has already been trained.
output_path = Path(f"full_training/rbf_passive_learning_IR{desired_ratio}.json")
output_path.parent.mkdir(parents=True, exist_ok=True)
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(results, json_file, ensure_ascii=False, indent=4, separators=(',', ': '))