# Split the data into initial and unlabeled sets for pool-based active learning (AL)
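Input:
- processed data: one CSV file per float under `TRAIN_PATH`
- n_initial: number of samples in the initial set
- initial_method: how the initial samples are selected (`random`, `ocsvm`, or `lof`)

Output:
- Initial and unlabeled sets, saved as CSV files in the same directory as the input file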

In [18]:
import os
import random

import pandas as pd
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

In [19]:
# Directory containing the training CSV files that are split further below.
TRAIN_PATH = "../data/randomsplit/train"

# Strategy used to choose the initial set; it is passed to split_dataset.
# Keep exactly one of the following assignments uncommented.
initial_method = 'random'
# initial_method = 'ocsvm'
# initial_method = 'lof'


In [20]:
def comp_ratio(dataset):
    """Compute the anomaly ratio of a dataset.

    Args:
        dataset: DataFrame with a 'Label' column in which 1 marks an anomaly.

    Returns:
        A tuple ``(percentage, count)``: the share of rows labelled 1 as a
        percentage rounded to two decimals, and the number of such rows.
        An empty dataset yields ``(0.0, 0)``.
    """
    n_rows = len(dataset)
    # Count matches directly instead of materialising a filtered copy.
    n_anomalies = int((dataset['Label'] == 1).sum())
    if n_rows == 0:
        # Avoid a ZeroDivisionError when there is nothing to measure.
        return 0.0, 0
    return round(n_anomalies / n_rows * 100, 2), n_anomalies

### Split dataset

In [21]:
def split_dataset(initial_method, csv_file, n_initial, random_seed):
    """Split one CSV file into an initial set and an unlabeled set.

    The initial set holds ``n_initial`` rows chosen by ``initial_method``;
    all remaining rows form the unlabeled set. Both are written to the
    directory of ``csv_file`` as
    ``{initial_method}_{base}_{n_initial}_rand_{random_seed}_initial.csv`` and
    ``{initial_method}_{base}_{n_initial}_rand_{random_seed}_unlabeled.csv``,
    and a one-line summary is printed.

    Args:
        initial_method: Selection strategy.
            'random'  - uniform random sample.
            'ocsvm'   - a random half, plus the remaining rows that a
                        One-Class SVM trained on the normal rows of that half
                        scores as most anomalous.
            'lof'     - rows with the strongest Local Outlier Factor.
            'iforest' - rows with the lowest Isolation Forest scores.
        csv_file: Path of the input CSV. It needs a 'Label' column (1 marks an
            anomaly); the model-based methods also expect 'ID' and 'Date'
            columns, which are excluded from the features.
        n_initial: Number of rows in the initial set.
        random_seed: Seed for the random sampling and the Isolation Forest.

    Returns:
        None. For an unsupported ``initial_method`` a message is printed and
        nothing is written.
    """
    data_df = pd.read_csv(csv_file)
    error_ratio, _ = comp_ratio(data_df)
    error_ratio = error_ratio / 100

    if initial_method == 'random':
        initial_set = data_df.sample(n=n_initial, random_state=random_seed)
    elif initial_method == 'ocsvm':
        initial_set = _select_by_ocsvm(data_df, n_initial, random_seed)
    elif initial_method == 'lof':
        initial_set = _select_by_lof(data_df, n_initial)
    elif initial_method == 'iforest':
        initial_set = _select_by_iforest(data_df, n_initial, random_seed)
    else:
        print('Sorry the split method is not supported. (´-ω-`)')
        return

    # Everything that was not selected stays in the unlabeled pool.
    unlabeled_set = data_df.drop(initial_set.index)

    # Outputs live next to the input and encode the split settings in the name.
    csv_dir = os.path.dirname(csv_file)
    base_filename = os.path.splitext(os.path.basename(csv_file))[0]
    prefix = f'{initial_method}_{base_filename}_{n_initial}_rand_{random_seed}'
    initial_set.to_csv(os.path.join(csv_dir, f'{prefix}_initial.csv'), index=False)
    unlabeled_set.to_csv(os.path.join(csv_dir, f'{prefix}_unlabeled.csv'), index=False)

    print(f"{initial_method} split: {error_ratio} errors, {n_initial} initial samples, {initial_set.Label.sum()} anomalies.")


def _feature_frame(frame):
    """Return *frame* without the identifier, date and label columns."""
    return frame.drop(columns=['ID', 'Date', 'Label'])


def _select_by_ocsvm(data_df, n_initial, random_seed):
    """Return a random half plus the rows a One-Class SVM ranks most anomalous."""
    n_seed = int(n_initial / 2)
    n_anomalous = n_initial - n_seed
    seed_rows = data_df.sample(n=n_seed, random_state=random_seed)
    candidates = data_df.drop(seed_rows.index)
    normal_rows = seed_rows[seed_rows['Label'] == 0]

    scaler = StandardScaler()
    ocsvm = OneClassSVM(nu=0.01, kernel='rbf', gamma='scale')
    ocsvm.fit(scaler.fit_transform(_feature_frame(normal_rows)))

    # Reuse the scaler fitted on the training rows: refitting it on the
    # candidates would score them in a different feature space than the one
    # the model was trained in.
    candidate_scores = ocsvm.decision_function(
        scaler.transform(_feature_frame(candidates))
    )

    # Lower decision values are more anomalous, so ascending order puts the
    # strongest outliers first.
    most_anomalous = candidate_scores.argsort()[:n_anomalous]
    return pd.concat([seed_rows, candidates.iloc[most_anomalous]])


def _select_by_lof(data_df, n_initial):
    """Return the rows with the strongest Local Outlier Factor."""
    X_scaled = StandardScaler().fit_transform(_feature_frame(data_df))

    # The ranking below does not depend on `contamination` (it only moves the
    # inlier/outlier threshold), so the default is used; passing the observed
    # error ratio is rejected by scikit-learn when it is 0 or above 0.5.
    lof = LocalOutlierFactor(n_neighbors=2)
    lof.fit(X_scaled)

    # Rank by the continuous score rather than the +/-1 labels, whose ties
    # would make the selection arbitrary. More negative means more anomalous.
    most_anomalous = lof.negative_outlier_factor_.argsort()[:n_initial]
    return data_df.iloc[most_anomalous]


def _select_by_iforest(data_df, n_initial, random_seed):
    """Return the rows an Isolation Forest scores as most anomalous."""
    X_scaled = StandardScaler().fit_transform(_feature_frame(data_df))

    # Seed the forest so the split matches the seed recorded in the file name.
    isoforest = IsolationForest(contamination=0.1, random_state=random_seed)
    isoforest.fit(X_scaled)

    # Lower scores are more anomalous; a monotonic squashing of the scores
    # would not change this ordering, so the raw values are ranked directly.
    anomaly_scores = isoforest.decision_function(X_scaled)
    most_anomalous = anomaly_scores.argsort()[:n_initial]
    return data_df.iloc[most_anomalous]

In [22]:
# Identifiers of the floats to process; each one selects the input file
# TRAIN_PATH/PR_PF_<identifier>.csv.
float_numbers = [
    '4903217',
    '4903218',
    '4903220',
    '4903052',
    '4903054',
]

# Seeds for which a separate split is generated.
random_seeds = [84, 65, 34, 25, 3]

# Sizes of the initial set to generate for every (seed, float) pair.
n_initials = [100, 200, 300, 400]

for random_seed in random_seeds:
    for float_number in float_numbers:
        print(f'------ Float: {float_number} Random seed: {random_seed} ------')
        # The input path depends only on the float, so build it once per float.
        csv_file = os.path.join(TRAIN_PATH, f'PR_PF_{float_number}.csv')
        for n_initial in n_initials:
            # Write the initial/unlabeled split for this configuration.
            split_dataset(initial_method, csv_file, n_initial, random_seed)
    

------ Float: 4903217 Random seed: 84 ------
random split: 0.3372 errors, 100 initial samples, 30 anomalies.
random split: 0.3372 errors, 200 initial samples, 72 anomalies.
random split: 0.3372 errors, 300 initial samples, 101 anomalies.
random split: 0.3372 errors, 400 initial samples, 131 anomalies.
------ Float: 4903218 Random seed: 84 ------
random split: 0.0084 errors, 100 initial samples, 0 anomalies.
random split: 0.0084 errors, 200 initial samples, 3 anomalies.
random split: 0.0084 errors, 300 initial samples, 4 anomalies.
random split: 0.0084 errors, 400 initial samples, 5 anomalies.
------ Float: 4903220 Random seed: 84 ------
random split: 0.0016 errors, 100 initial samples, 1 anomalies.
random split: 0.0016 errors, 200 initial samples, 1 anomalies.
random split: 0.0016 errors, 300 initial samples, 1 anomalies.
random split: 0.0016 errors, 400 initial samples, 1 anomalies.
------ Float: 4903052 Random seed: 84 ------
random split: 0.0069 errors, 100 initial samples, 0 anomal