# Unsupervised learning models to select the initial set

In [58]:
import os
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

In [59]:
TRAIN_PATH = "../data/randomsplit/train"
VAL_PATH = "../data/randomsplit/val"

In [60]:
def comp_ratio(dataset):
    '''Compute the anomaly ratio of a labelled dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a 'Label' column; rows whose label equals 1 are counted
        as anomalies.

    Returns
    -------
    tuple
        ``(rate, count)`` where ``rate`` is the percentage of anomalous rows
        rounded to two decimals and ``count`` is the number of anomalous
        rows. An empty dataset yields ``(0.0, 0)`` instead of raising
        ZeroDivisionError.
    '''
    total = len(dataset)
    if total == 0:
        # No rows: the ratio is undefined, so report zero rather than dividing by zero.
        return 0.0, 0
    instance = dataset[(dataset['Label']==1)]
    rate = len(instance)/total*100
    return round(rate,2), len(instance)

### Load data

In [61]:
# Candidate identifiers; the selected one is embedded in the CSV file names.
float_numbers = [
    '4903052', '4903054', '4903058', '4903215',
    '4903217', '4903218', '4903220',
]
# Index 6 picks the last identifier of the list above.
float_number = float_numbers[6]

# Train and validation splits share the same file name in different folders.
csv_name = f'PR_PF_{float_number}.csv'
train_file = os.path.join(TRAIN_PATH, csv_name)
val_file = os.path.join(VAL_PATH, csv_name)

data_df, val_data_df = (pd.read_csv(path) for path in (train_file, val_file))

# comp_ratio reports a percentage (rounded to 2 decimals); convert it to a fraction.
percent_anomalous, _ = comp_ratio(data_df)
error_ratio = percent_anomalous / 100

error_ratio


0.0016

In [62]:
# Size of the initial set; later cells use it as the number k of top-ranked
# instances to keep.
n_initial = 400

In [63]:
# Drop any columns that are not features (identifier, date and label).
X = data_df.drop(columns=['ID', 'Date', 'Label'])
X_val = val_data_df.drop(columns=['ID', 'Date', 'Label'])

# Standardize the features. The scaler is fitted on the training features only
# and then applied unchanged to the validation features, so both sets are
# scaled with the same mean/variance and no validation statistics leak into
# the preprocessing.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_val_scaled = scaler.transform(X_val)

### iForest

In [64]:
from sklearn.ensemble import IsolationForest

# Number of top-ranked (most anomalous) instances to select.
k = n_initial

# Fit the Isolation Forest model. random_state is fixed so that the selected
# instances are reproducible across kernel restarts.
isoforest = IsolationForest(
    n_estimators=100,
    contamination=error_ratio,
    random_state=42,
)
isoforest.fit(X_scaled)

# Anomaly scores on the training features (negative scores indicate anomalies).
anomaly_scores = isoforest.decision_function(X_scaled)

# Ascending sort: the first k positions are the lowest-scoring, i.e. most
# anomalous, rows.
top_k_anomalies_indices = np.argsort(anomaly_scores)[:k]

# X_scaled keeps the row order of data_df, so positional indexing is valid here.
top_k_anomalies = data_df.iloc[top_k_anomalies_indices]

# Sum of Label over the selection (the number of true anomalies, assuming 0/1 labels).
top_k_anomalies.Label.sum()

0

### OCSVM

In [65]:
from sklearn.svm import OneClassSVM

# Half of the initial set is drawn at random; the other half is chosen below
# by OC-SVM ranking of the remaining rows.
n_random = int(n_initial/2)
initial_set = data_df.sample(n=n_random, random_state=42)
unlabeled_set = data_df.drop(initial_set.index)

# The one-class model is trained only on the randomly drawn rows labelled 0.
normal_data = initial_set[initial_set['Label']==0]

# Number of top-ranked instances to select from the remaining rows.
k = n_initial - n_random

# Drop any columns that are not features
X_norm = normal_data.drop(columns=['ID', 'Date', 'Label'])

# Standardize the features using statistics of the OC-SVM training rows.
scaler = StandardScaler()
X_norm_scaled = scaler.fit_transform(X_norm)

# Fit the OC-SVM model
ocsvm = OneClassSVM(nu=0.01, kernel='rbf', gamma='scale')
ocsvm.fit(X_norm_scaled)

Z = unlabeled_set.drop(columns=['ID', 'Date', 'Label'])

# Reuse the scaler fitted above (transform, not fit_transform) so the rows
# being scored live in the same feature space the model was trained in.
Z_scaled = scaler.transform(Z)

# Signed distance to the separating boundary and the corresponding +1/-1 labels.
distances = ocsvm.decision_function(Z_scaled)
labels = ocsvm.predict(Z_scaled)

# Ascending sort: the first k positions have the lowest decision values,
# i.e. are the most anomalous rows.
top_k_anomalies_indices = distances.argsort()[:k]

# The positions refer to rows of unlabeled_set (the frame that was scored),
# not to data_df, so the selection must be taken from unlabeled_set.
ocsvm_top_k_anomalies = unlabeled_set.iloc[top_k_anomalies_indices]

# Sum of Label over the selection (the number of true anomalies, assuming 0/1 labels).
ocsvm_top_k_anomalies.Label.sum()


0

### LOF

In [72]:
# How many of the top-ranked instances to keep.
k = n_initial

# Fit LOF on the scaled training features; fit_predict also yields the
# per-row inlier/outlier labels.
lof = LocalOutlierFactor(contamination=error_ratio, n_neighbors=2)
labels = lof.fit_predict(X_scaled)

# Per-row score exposed by the fitted model; lower values rank first below.
anomaly_scores = lof.negative_outlier_factor_

# Positions of the k lowest scores (ascending order).
top_k_anomalies_indices = anomaly_scores.argsort()[:k]

# X_scaled keeps the row order of data_df, so positions map directly onto it.
lof_top_k_anomalies = data_df.iloc[top_k_anomalies_indices]

# Sum of Label over the selected rows.
lof_top_k_anomalies['Label'].sum()

7

In [73]:
# Display the configured initial-set size.
n_initial

400

In [76]:
# Ad-hoc check: (3/100) divided by 0.0016, the error_ratio value displayed earlier.
# NOTE(review): presumably the lift of a selection containing 3 anomalies among
# 100 instances over the base anomaly rate — confirm, and derive it from variables
# instead of hard-coded numbers.
3/100/0.0016

18.75