# Supervised learning methods for classification
Methods: 
- KNN
- LR
- RF      
- XGBoost
- CatBoost
- LightGBM

### Global settings

In [5]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, cohen_kappa_score, roc_auc_score, average_precision_score, confusion_matrix


In [None]:
# --- Experiment selection -------------------------------------------------
# Name of the split strategy; it is embedded in the data/result directory names.
split_type = 'random'

# Identifier of the float whose CSV files are processed.
# Other identifiers previously used with this notebook:
#   '4903218' (low1), '4903220' (low2), '4903052' (low3), '4903054' (low4)
# NOTE(review): 'high' / 'lowN' presumably describe the anomaly ratio of each
# float -- confirm with the data owner.
float_number = '4903217' # high

# --- Input locations ------------------------------------------------------
TRAIN_DIR = f'../data/{split_type}split/train/'
TEST_DIR = f'../data/{split_type}split/test/'

# Train and test files share the same name and differ only by directory.
TRAIN_FILE = os.path.join(TRAIN_DIR, f'PR_PF_{float_number}.csv')
TEST_FILE = os.path.join(TEST_DIR, f'PR_PF_{float_number}.csv')

# --- Output location ------------------------------------------------------
# One result directory per (split type, float); created up front so later
# cells can write into it without checking.
RESULT_DIR = f'../results/{split_type}split/{float_number}'
os.makedirs(RESULT_DIR, exist_ok=True)

In [58]:
def comp_ratio(dataset):
    """Compute the anomaly ratio of a labelled dataset.

    Rows whose 'Label' column equals 1 are counted as anomalies.

    Args:
        dataset: DataFrame-like object with a 'Label' column.

    Returns:
        A tuple ``(rate, count)`` where ``rate`` is the percentage of rows
        labelled 1, rounded to two decimals, and ``count`` is the number of
        such rows. An empty dataset yields ``(0.0, 0)``.
    """
    total = len(dataset)
    if total == 0:
        # Avoid a ZeroDivisionError: an empty dataset contains no anomalies.
        return 0.0, 0

    # Summing the boolean mask counts the matches without copying the rows.
    anomaly_count = int((dataset['Label'] == 1).sum())
    rate = anomaly_count / total * 100
    return round(rate, 2), anomaly_count

### Load data

In [60]:
# Columns removed from both splits right after loading, so they are not
# part of the frames used by the later cells.
dropped_columns = ['ID', 'Date']

# Read each split and strip the dropped columns in one step.
train_data = pd.read_csv(TRAIN_FILE).drop(columns=dropped_columns)
test_data = pd.read_csv(TEST_FILE).drop(columns=dropped_columns)

# Preview the first rows of the training split.
train_data.head()


Unnamed: 0,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
0,0.024591,1.988715,-0.037768,-0.83818,0.946121,0.98013,0
1,-1.669044,0.861651,0.9842,-0.603728,0.309813,0.267241,0
2,-1.41425,0.368683,1.349974,-0.792556,0.762956,0.736119,0
3,-0.377941,-0.814887,0.648801,-0.750002,0.870756,0.689826,0
4,1.041629,-0.974327,-0.91316,1.834425,-0.795855,-1.299581,1


In [61]:
# Display the test split as loaded above (the 'ID' and 'Date' columns were dropped).
test_data

Unnamed: 0,Normalized_date,Latitude,Longitude,Pressure,Salinity,Temperature,Label
0,1.696816,-1.544808,-1.982873,2.044185,-0.762466,-1.345333,1
1,-0.673418,-0.046107,0.689347,-0.851279,0.633214,0.994408,0
2,1.337106,-0.060530,-1.387324,-0.850777,1.427883,0.994083,1
3,0.574862,-0.103651,-0.310668,-0.786226,1.303865,0.917397,1
4,1.463432,-1.463739,-1.625058,0.753162,-0.872174,-1.151941,1
...,...,...,...,...,...,...,...
59842,-0.165969,1.340966,0.186850,-0.765670,1.054875,0.804152,0
59843,0.320068,1.949964,-0.372045,1.605488,-1.220378,-1.253396,0
59844,1.589759,-0.506737,-1.790191,-0.834608,1.157905,1.150592,1
59845,-0.377941,-0.814887,0.648801,-0.791867,0.943259,0.901605,0


In [62]:
# Report, for each split, the number of rows and the percentage of rows
# labelled 1 (first element of comp_ratio's return value).
print(f'------- {float_number} ------')

train_rows = train_data.shape[0]
train_ratio = comp_ratio(train_data)[0]
print(f'Train: {train_rows}; {train_ratio}%')

test_rows = test_data.shape[0]
test_ratio = comp_ratio(test_data)[0]
print(f'Test: {test_rows}; {test_ratio}%')

------- 4903217 ------
Train: 179539; 33.72%
Test: 59847; 33.72%


### Classification with all training data

In [None]:
def random_sampling(train_data, test_data, label_column, sampling_ratio):
    """Split both datasets into features/labels and subsample the train set.

    Args:
        train_data: Training DataFrame containing ``label_column``.
        test_data: Test DataFrame containing ``label_column``.
        label_column: Name of the column holding the labels.
        sampling_ratio: Passed to ``train_test_split`` as ``train_size``;
            controls how much of the training data is kept.

    Returns:
        ``(train_features_sample, train_labels_sample, test_features,
        test_labels)``. The test set is returned in full; only the training
        set is subsampled.
    """
    # Features are everything except the label column.
    y_train = train_data[label_column]
    X_train = train_data.drop(label_column, axis=1)
    y_test = test_data[label_column]
    X_test = test_data.drop(label_column, axis=1)

    # train_test_split returns [X_kept, X_rest, y_kept, y_rest]; only the kept
    # part is used. The fixed random_state makes the subsample reproducible.
    parts = train_test_split(X_train, y_train, train_size=sampling_ratio, random_state=42)
    X_kept, y_kept = parts[0], parts[2]

    return X_kept, y_kept, X_test, y_test

def fit_model(model_name, train_features, train_labels):
    """Build the classifier registered under ``model_name`` and fit it.

    Args:
        model_name: One of 'KNN', 'LR', 'RF', 'XGBoost', 'CatBoost',
            'LightGBM'.
        train_features: Feature matrix passed to ``fit``.
        train_labels: Target labels passed to ``fit``.

    Returns:
        The fitted classifier instance.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """
    # Constructors are wrapped in lambdas so only the requested model is built.
    builders = {
        'KNN': lambda: KNeighborsClassifier(n_neighbors=5, leaf_size=30),
        'LR': lambda: LogisticRegression(penalty='l2', random_state=42),
        'RF': lambda: RandomForestClassifier(n_estimators=20, random_state=42),
        'XGBoost': lambda: XGBClassifier(max_depth=6),
        'CatBoost': lambda: CatBoostClassifier(depth=2, iterations=20, silent=True),
        'LightGBM': lambda: LGBMClassifier(max_depth=2, n_estimators=50),
    }

    if model_name not in builders:
        raise ValueError(f"Invalid model name: {model_name}")

    classifier = builders[model_name]()
    classifier.fit(train_features, train_labels)
    return classifier

def evaluate_model(model, test_features, test_labels):
    """Score a fitted classifier on the test set.

    Args:
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        test_features: Feature matrix of the test set.
        test_labels: True labels of the test set.

    Returns:
        A 7-tuple ``(precision, recall, f1, kappa, roc_auc, prc_auc,
        confusion)``. Precision, recall, F1, Cohen's kappa and the confusion
        matrix are computed from the predicted labels; ROC-AUC and average
        precision are computed from the second column of ``predict_proba``.
    """
    y_pred = model.predict(test_features)
    # Second column of predict_proba, used as the ranking score.
    y_score = model.predict_proba(test_features)[:, 1]

    # zero_division=0 makes a metric 0 instead of warning when it is undefined.
    return (
        precision_score(test_labels, y_pred, zero_division=0),
        recall_score(test_labels, y_pred, zero_division=0),
        f1_score(test_labels, y_pred, zero_division=0),
        cohen_kappa_score(test_labels, y_pred),
        roc_auc_score(test_labels, y_score),
        average_precision_score(test_labels, y_score),
        confusion_matrix(test_labels, y_pred),
    )

# Single-ratio run: fit every model once and collect its test metrics.
label_column = 'Label'  # Name of the label column in the CSV files
sampling_ratio = 0.99  # Fraction of the training set kept (0.99 = 99%)

# Subsample the training set; the test set is used in full.
train_features_sample, train_labels_sample, test_features, test_labels = random_sampling(
    train_data, test_data, label_column, sampling_ratio
)

model_names = ['KNN', 'LR', 'RF', 'XGBoost', 'CatBoost', 'LightGBM']  # Model names to evaluate
results = []
for model_name in model_names:
    # Train on the subsample, then score on the test set.
    model = fit_model(model_name, train_features_sample, train_labels_sample)
    precision, recall, f1, kappa, roc_auc, prc_auc, confusion = evaluate_model(
        model, test_features, test_labels
    )

    # One row per model; the keys become the columns of results_df.
    result = {
        'Model': model_name,
        'Sampling Ratio': sampling_ratio,
        'Precision': precision,
        'Recall': recall,
        'F1-score': f1,
        "Cohen's Kappa": kappa,
        'ROC-AUC': roc_auc,
        'PRC-AUC': prc_auc,
        'Confusion Matrix': confusion,
    }
    results.append(result)

results_df = pd.DataFrame(results)

results_df

Unnamed: 0,Model,Sampling Ratio,Precision,Recall,F1-score,Cohen's Kappa,ROC-AUC,PRC-AUC,Confusion Matrix
0,LightGBM,0.99,1.0,0.999504,0.999752,0.999626,0.999864,0.999759,"[[39668, 0], [10, 20169]]"


In [64]:
# Keep only the rows of the 0.99 sampling-ratio run and the columns that go
# into the table, rounded to 4 decimals.
selected_columns = ['Model', 'Precision', 'Recall', 'F1-score', "Cohen's Kappa", 'PRC-AUC']
is_full_run = results_df['Sampling Ratio'] == 0.99
filtered_results = results_df.loc[is_full_run, selected_columns].round(4)

# Render as a LaTeX tabular without the index and without escaping
# special characters.
latex_table = filtered_results.to_latex(index=False, escape=False)

# Print the float identifier, then the table.
print(f"Float: {float_number}")

print(latex_table)


Float: 4903217
\begin{tabular}{lrrrrr}
\toprule
   Model &  Precision &  Recall &  F1-score &  Cohen's Kappa &  PRC-AUC \\
\midrule
LightGBM &        1.0 &  0.9995 &    0.9998 &         0.9996 &   0.9998 \\
\bottomrule
\end{tabular}



  latex_table = filtered_results.to_latex(index=False, escape=False)


### Classification with varying sampling ratio

In [62]:
# Settings for the sampling-ratio sweep.

# Alternative ratio grids kept for reference:
# sampling_ratios = [0.1, 0.08, 0.05, 0.04, 0.03, 0.02, 0.01]
# sampling_ratios = [0.001, 0.0008, 0.0005, 0.0004, 0.0003, 0.0002, 0.0001, 0.00005]
# sampling_ratios = [0.99, 0.8, 0.5, 0.2, 0.1, 0.05, 0.01]

# Fractions of the training set to keep, from largest to smallest.
sampling_ratios = [
    0.99, 0.8, 0.5, 0.3, 0.2, 0.1,
    0.05, 0.01, 0.005, 0.002, 0.001,
]

# Model names to evaluate in the sweep.
model_names = ['KNN', 'XGBoost', 'CatBoost', 'LightGBM']

# CSV file, inside RESULT_DIR, that will receive the sweep results.
output_file = os.path.join(RESULT_DIR, 'random_sampling.csv')



In [74]:
%%capture
# Sweep every (model, sampling ratio) pair, record the test metrics and save
# them to output_file.

# Columns of the result table, in the order they are written to the CSV.
result_columns = ['Model', 'Sampling Ratio', 'Precision', 'Recall', 'F1-score', "Cohen's Kappa", 'ROC-AUC']

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
result_rows = []

# Iterate over model names
for model_name in model_names:
    # Iterate over sampling ratios
    for sampling_ratio in sampling_ratios:
        print(sampling_ratio)
        # Perform random sampling
        train_features_sample, train_labels_sample, test_features, test_labels = random_sampling(train_data, test_data, label_column, sampling_ratio)
        # Fit a model on the sampled train set
        model = fit_model(model_name, train_features_sample, train_labels_sample)

        # Evaluate the model on the test set. evaluate_model returns more
        # metrics than are stored here (PRC-AUC and the confusion matrix);
        # the star-unpacking discards the extras instead of raising ValueError.
        precision, recall, f1, kappa, roc_auc, *_ = evaluate_model(model, test_features, test_labels)

        result_rows.append({
            'Model': model_name,
            'Sampling Ratio': sampling_ratio,
            'Precision': precision,
            'Recall': recall,
            'F1-score': f1,
            "Cohen's Kappa": kappa,
            'ROC-AUC': roc_auc,
        })

# Passing columns= keeps the header even if no row was produced.
results_df = pd.DataFrame(result_rows, columns=result_columns)

# Save the results to a CSV file
results_df.to_csv(output_file, index=False)
print("Results saved to:", output_file)

In [75]:
# Display the collected per-model, per-sampling-ratio metrics.
results_df


Unnamed: 0,Model,Sampling Ratio,Precision,Recall,F1-score,Cohen's Kappa,ROC-AUC
0,KNN,0.99,0.931034,0.835052,0.880435,0.880253,0.932908
1,KNN,0.8,0.952941,0.835052,0.89011,0.889945,0.932912
2,KNN,0.5,0.94186,0.835052,0.885246,0.885072,0.922607
3,KNN,0.3,0.929412,0.814433,0.868132,0.867934,0.927753
4,KNN,0.2,0.886364,0.804124,0.843243,0.843003,0.922589
5,KNN,0.1,0.744444,0.690722,0.716578,0.716138,0.901909
6,KNN,0.05,0.6625,0.546392,0.59887,0.598286,0.906845
7,KNN,0.01,0.52809,0.484536,0.505376,0.504614,0.916775
8,KNN,0.005,0.0,0.0,0.0,0.0,0.916063
9,KNN,0.002,0.0,0.0,0.0,0.0,0.913592
