# Unsupervised learning methods for classification
Methods: 
- LOF
- OCSVM
- iForest
- DBSCAN


### Global settings

In [3]:
import os
import pandas as pd

In [4]:
# Name of the split strategy; it selects the data and result sub-directories below.
split_type = 'random'

# Directories holding the per-float train/test CSV files for the chosen split.
TRAIN_DIR = f'../data/{split_type}split/train/'
TEST_DIR = f'../data/{split_type}split/test/'


# Identifiers of the floats for which data files are expected.
# NOTE(review): this list is not read by any cell visible here; only
# `float_number` below selects the files that are loaded.
float_numbers = [
    '4903052',
    '4903054',
    '4903058',
    '4903215',
    '4903217',
    '4903218',
    '4903220'
]

# Float to process in this run: uncomment exactly one line.
# The trailing tags (high / low1..low4) are the author's own grouping;
# presumably they refer to the anomaly rate of each float -- TODO confirm.
# float_number = '4903217' # high
float_number = '4903218' # low1
# float_number = '4903220' # low2
# float_number = '4903052' # low3
# float_number = '4903054' # low4



# Input files for the selected float.
TRAIN_FILE = os.path.join(TRAIN_DIR, f'PR_PF_{float_number}.csv')
TEST_FILE = os.path.join(TEST_DIR, f'PR_PF_{float_number}.csv')

# Per-split, per-float output directory.
RESULT_DIR = f'../results/{split_type}split/{float_number}'

# Create the output directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(RESULT_DIR, exist_ok=True)

In [5]:
def comp_ratio(dataset):
    '''Compute the anomaly ratio of a labelled dataset.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a ``'Label'`` column in which anomalies are marked ``1``.

    Returns
    -------
    tuple
        ``(rate, count)``: the fraction of rows whose label is ``1`` and the
        number of such rows. An empty dataset yields ``(0.0, 0)`` instead of
        raising ``ZeroDivisionError``.
    '''
    total = len(dataset)
    # int(...) keeps the count a plain Python int, as len() did before.
    count = int((dataset['Label'] == 1).sum())
    # Guard the division: an empty frame contains no anomalies.
    rate = count / total if total else 0.0
    return rate, count

### Load data

In [6]:
# Read each split and remove the 'ID' and 'Date' columns in the same expression,
# so the frames are never mutated in place after they are created.
train_data = pd.read_csv(TRAIN_FILE).drop(columns=['ID', 'Date'])
test_data = pd.read_csv(TEST_FILE).drop(columns=['ID', 'Date'])

# Preview the first training rows as the cell output.
train_data.head()


Unnamed: 0,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
0,-0.972335,-0.392211,-1.281755,-0.652866,0.678335,0.551029,0
1,1.071506,1.753434,0.774128,-0.836737,1.040375,1.151896,0
2,1.175936,1.035057,0.746482,-0.832563,0.89744,1.16094,0
3,1.154624,1.295163,0.776619,-0.859222,0.849481,1.193639,0
4,1.218561,0.949184,0.823358,-0.891487,0.956682,1.09276,0


In [7]:
# Display the test frame as the cell output, for comparison with the training preview.
test_data

Unnamed: 0,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
0,0.355416,0.579524,0.322254,-0.885756,1.002760,1.235034,0
1,1.576606,1.294321,1.093063,-0.898463,0.996178,0.982373,0
2,0.753954,0.365366,1.044996,0.926038,-1.132805,-1.136430,0
3,0.964945,0.878866,0.942539,-0.820480,0.947279,0.915585,0
4,1.261185,1.504443,0.953027,1.779428,-1.127163,-1.303981,0
...,...,...,...,...,...,...,...
58523,-0.531172,-0.468230,-0.065854,-0.882455,1.104320,1.255210,0
58524,-0.445923,-0.274119,-0.294182,1.667250,-1.138448,-1.300966,0
58525,0.502470,-0.138289,1.016256,-0.789773,0.691500,0.778181,0
58526,-1.289887,-1.175349,-1.067926,-0.829574,1.049778,1.213235,0


In [8]:
# Summarise both splits: row count and percentage of rows labelled as anomalies.
print(f'------- {float_number} ------')

train_anomaly_pct = round(comp_ratio(train_data)[0] * 100, 2)
print(f'Train: {train_data.shape[0]}; {train_anomaly_pct}%')

test_anomaly_pct = round(comp_ratio(test_data)[0] * 100, 2)
print(f'Test: {test_data.shape[0]}; {test_anomaly_pct}%')

------- 4903218 ------
Train: 175583; 0.84%
Test: 58528; 0.84%


### Classification

In [9]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, roc_auc_score, average_precision_score, confusion_matrix

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor
from sklearn.cluster import DBSCAN



In [10]:
def random_sampling(train_data, test_data, label_column, sampling_ratio):
    """Split both frames into features/labels and subsample the training set.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Frames containing the feature columns plus ``label_column``.
    label_column : str
        Name of the column holding the labels.
    sampling_ratio : float or int
        Forwarded to ``train_test_split`` as ``train_size``. A float of
        exactly ``1.0`` (which ``train_test_split`` rejects) returns the
        whole training set, in its original order.

    Returns
    -------
    tuple
        ``(train_features_sample, train_labels_sample, test_features,
        test_labels)``. The test set is only separated into features and
        labels; it is never subsampled.
    """
    # Separate features and labels
    train_labels = train_data[label_column]
    train_features = train_data.drop(label_column, axis=1)
    test_labels = test_data[label_column]
    test_features = test_data.drop(label_column, axis=1)

    # "Keep everything" cannot be expressed through train_test_split, whose
    # float train_size must lie strictly between 0 and 1, so handle it here.
    if isinstance(sampling_ratio, float) and sampling_ratio == 1.0:
        return train_features, train_labels, test_features, test_labels

    # Randomly select a subset of the train set (fixed seed for reproducibility)
    train_features_sample, _, train_labels_sample, _ = train_test_split(
        train_features, train_labels, train_size=sampling_ratio, random_state=42
    )

    return train_features_sample, train_labels_sample, test_features, test_labels


In [11]:
# Usage
label_column = 'Label'  # Name of the label column in the CSV files
sampling_ratio = 0.99 # Fraction of the training set to keep (0.99 = 99%)

# Perform random sampling: the training set is subsampled, the test set is only
# split into features and labels.
train_features_sample, train_labels_sample, test_features, test_labels = random_sampling(train_data, test_data, label_column, sampling_ratio)
# Preview the sampled training features as the cell output.
train_features_sample.head()

Unnamed: 0,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature
130108,-0.488548,-0.277046,-0.178464,-0.903508,1.191773,1.293822
50340,-0.761344,-0.844432,-0.742107,-0.799676,1.010283,0.949907
15829,-1.22595,-1.428635,-1.178019,1.271792,-1.137507,-1.227684
154238,-0.951023,-0.369676,-1.25761,-0.826086,1.092095,0.873958
18460,-1.247262,-1.366578,-1.105354,0.648863,-1.220259,-1.103499


In [12]:
# Expected share of anomalies, taken from the labelled training set.
# fit_predict_model reads this notebook-level variable for LOF and iForest.
contamination = comp_ratio(train_data)[0]
contamination

0.008372108917150293

In [16]:
def fit_predict_model(model_name, test_features, train_features=None, train_labels=None):
    """Fit one unsupervised anomaly detector and return 0/1 predictions.

    LOF, iForest and DBSCAN are fitted on the standardised ``test_features``
    and label those same rows. OCSVM is fitted on the normal (label ``0``)
    training rows when both ``train_features`` and ``train_labels`` are
    given; otherwise it is fitted on the standardised test features like the
    other models.

    Parameters
    ----------
    model_name : str
        One of ``'LOF'``, ``'iForest'``, ``'OCSVM'`` or ``'DBSCAN'``.
    test_features : array-like
        Feature matrix to score.
    train_features, train_labels : array-like, optional
        Training features and their labels; only used by ``'OCSVM'``.

    Returns
    -------
    numpy.ndarray
        ``1`` where the model flags a row of ``test_features`` as an anomaly,
        ``0`` otherwise.

    Raises
    ------
    ValueError
        If ``model_name`` is not supported. Raised before any scaling or
        fitting takes place.

    Notes
    -----
    LOF and iForest read the notebook-level ``contamination`` variable.
    """
    supported_models = ('LOF', 'iForest', 'OCSVM', 'DBSCAN')
    if model_name not in supported_models:
        # Fail fast: no point in scaling the data for an unknown model.
        raise ValueError(f"Invalid model name: {model_name}")

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(test_features)

    if model_name == 'LOF':
        model = LocalOutlierFactor(n_neighbors=20, contamination=contamination)
        # fit_predict returns -1 for outliers and 1 for inliers.
        predictions = np.where(model.fit_predict(X_scaled) == -1, 1, 0)

    elif model_name == 'iForest':
        model = IsolationForest(n_estimators=20, random_state=42, contamination=contamination)
        model.fit(X_scaled)
        predictions = np.where(model.predict(X_scaled) == -1, 1, 0)

    elif model_name == 'OCSVM':
        if train_features is not None and train_labels is not None:
            # Learn the boundary from known-normal training rows only, and
            # score the test rows with the scaler fitted on those rows.
            X_norm = train_features[np.asarray(train_labels) == 0]
            norm_scaler = StandardScaler()
            X_fit = norm_scaler.fit_transform(X_norm)
            X_eval = norm_scaler.transform(test_features)
        else:
            # No training data supplied: behave like the other detectors.
            X_fit = X_eval = X_scaled

        # Fit the OC-SVM model
        model = OneClassSVM(nu=0.01, kernel='rbf', gamma='scale')  # You can adjust parameters as needed
        model.fit(X_fit)
        predictions = np.where(model.predict(X_eval) == -1, 1, 0)

    else:  # 'DBSCAN'
        eps = 0.3  # The maximum distance between two samples for one to be considered as in the neighborhood of the other
        min_samples = 20  # The number of samples (or total weight) in a neighborhood for a point to be considered as a core point
        model = DBSCAN(eps=eps, min_samples=min_samples)
        # Rows DBSCAN leaves unclustered (label -1) are reported as anomalies.
        predictions = np.where(model.fit_predict(X_scaled) == -1, 1, 0)

    return predictions

def evaluate_model(predictions, test_labels):
    """Score binary predictions against the true labels.

    Returns the tuple ``(precision, recall, f1, kappa, confusion)``.
    Precision, recall and F1 are computed with ``zero_division=0``.
    """
    zero_safe_scorers = (precision_score, recall_score, f1_score)
    precision, recall, f1 = (
        scorer(test_labels, predictions, zero_division=0)
        for scorer in zero_safe_scorers
    )
    kappa = cohen_kappa_score(test_labels, predictions)
    confusion = confusion_matrix(test_labels, predictions)
    return precision, recall, f1, kappa, confusion

# Usage
sampling_ratio = 0.99 # Fraction of the training set to keep (0.99 = 99%)

# Perform random sampling (repeats the earlier sampling cell so this cell can be
# re-run on its own after changing sampling_ratio)
train_features_sample, train_labels_sample, test_features, test_labels = random_sampling(train_data, test_data, label_column, sampling_ratio)


# OCSVM is implemented in fit_predict_model but is not part of this run.
model_names = ['LOF', 'iForest', 'DBSCAN']  # Model names to evaluate
# model_names = ['LOF']  # Model names to evaluate
results = []
for model_name in model_names: 
    # Fit the detector on the test features and label those same rows;
    # the sampled training set is not passed to the model here.
    predictions = fit_predict_model(model_name, test_features)

    # Compare the predictions with the true test labels
    precision, recall, f1, kappa, confusion = evaluate_model(predictions, test_labels)
    # One record per model; the frame is built once from the list after the loop.
    result = {'Model': model_name, 'Sampling Ratio': sampling_ratio,
                                            'Precision': precision, 'Recall': recall, 'F1-score': f1, "Cohen's Kappa": kappa, 'Confusion Matrix': confusion}

    results.append(result)

results_df = pd.DataFrame(results)

# Show the metrics table as the cell output.
results_df

Unnamed: 0,Model,Sampling Ratio,Precision,Recall,F1-score,Cohen's Kappa,Confusion Matrix
0,LOF,0.99,0.036735,0.036735,0.036735,0.028602,"[[57566, 472], [472, 18]]"
1,iForest,0.99,0.0,0.0,0.0,-0.008443,"[[57548, 490], [490, 0]]"
2,DBSCAN,0.99,0.207792,0.032653,0.056437,0.054287,"[[57977, 61], [474, 16]]"


In [17]:
# Keep only the rows produced with the sampling ratio of the current run.
# Comparing against `sampling_ratio` rather than a repeated literal keeps the
# table in sync with the evaluation cell: with a hard-coded value, changing
# the ratio would silently produce an empty table.
filtered_results = results_df[results_df['Sampling Ratio'] == sampling_ratio]

# Select the desired columns
selected_columns = ['Model', 'Precision', 'Recall', 'F1-score', "Cohen's Kappa"]
filtered_results = filtered_results[selected_columns]

# Round numerical values to 4 decimals
filtered_results = filtered_results.round(4)

# Convert the results to LaTeX table format
latex_table = filtered_results.to_latex(index=False, escape=False)

# Print the LaTeX table, preceded by the float it belongs to
print(f"Float: {float_number}")

print(latex_table)


Float: 4903218
\begin{tabular}{lrrrr}
\toprule
  Model &  Precision &  Recall &  F1-score &  Cohen's Kappa \\
\midrule
    LOF &     0.0367 &  0.0367 &    0.0367 &         0.0286 \\
iForest &     0.0000 &  0.0000 &    0.0000 &        -0.0084 \\
 DBSCAN &     0.2078 &  0.0327 &    0.0564 &         0.0543 \\
\bottomrule
\end{tabular}



  latex_table = filtered_results.to_latex(index=False, escape=False)


In [15]:
# Save the results to a CSV file inside the per-float result directory.
# Run this after the evaluation cell so that results_df and RESULT_DIR refer to
# the same float_number.
# NOTE(review): the 'Confusion Matrix' column holds array objects, which the CSV
# stores as text -- confirm that downstream readers can parse it.
output_file = os.path.join(RESULT_DIR, 'unsupervised_random_sampling.csv')
results_df.to_csv(output_file, index=False)
print("Results saved to:", output_file)

Results saved to: ../results/randomsplit/4903054/unsupervised_random_sampling.csv
