In [None]:

import argparse
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

def detect_anomalies_iqr(data, metric, threshold=1.2):
    """Flag the values of one column that fall outside its IQR fences.

    Args:
        data: DataFrame holding the column to inspect.
        metric: Name of the column in ``data`` to test.
        threshold: Multiplier applied to the interquartile range to place
            the lower and upper fences.

    Returns:
        A boolean Series aligned with ``data``; True where the value lies
        strictly below ``Q1 - threshold * IQR`` or strictly above
        ``Q3 + threshold * IQR``.
    """
    values = data[metric]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)

    # Distance from each quartile to its fence.
    margin = threshold * (third_quartile - first_quartile)

    too_low = values.lt(first_quartile - margin)
    too_high = values.gt(third_quartile + margin)
    return too_low | too_high

def main(dataset_file, output_file='dataset_label.csv'):
    """Label anomalous rows of a metrics CSV with the IQR rule and save the result.

    Reads ``dataset_file``, prints a Shapiro-Wilk normality test for the
    ``packet_loss`` column, flags per-metric outliers with
    ``detect_anomalies_iqr``, plots each metric with its outliers highlighted,
    and writes the labelled frame to ``output_file``.

    For every metric an ``anomaly_<metric>`` column (0/1) is added. The final
    ``anomaly`` column is 1 when at least two metrics are flagged on the same
    row.

    Args:
        dataset_file: Path to the input CSV. It must contain the columns
            throughput, congestion, packet_loss, latency and jitter.
        output_file: Path of the CSV to write. Defaults to
            ``dataset_label.csv`` in the current working directory.

    Raises:
        KeyError: If any required metric column is missing from the dataset.
    """
    metric_columns = ['throughput', 'congestion', 'packet_loss', 'latency', 'jitter']

    df = pd.read_csv(dataset_file)

    # Fail early with one clear message instead of a late lookup error.
    missing = [metric for metric in metric_columns if metric not in df.columns]
    if missing:
        raise KeyError(f"Dataset is missing required metric columns: {missing}")

    shapiro_test = stats.shapiro(df['packet_loss'])
    # Print the column name, not the whole Series, as the test's label.
    print(f"Prueba de Shapiro-Wilk para packet_loss: {shapiro_test}")

    flag_columns = [f'anomaly_{metric}' for metric in metric_columns]

    for metric, flag_column in zip(metric_columns, flag_columns):
        # Compute the outlier mask once and reuse it for labelling and plotting.
        is_anomaly = detect_anomalies_iqr(df, metric)
        df[flag_column] = is_anomaly.astype(int)
        flagged = df.loc[is_anomaly, metric]

        fig, ax = plt.subplots(figsize=(12, 6))
        ax.plot(df.index, df[metric], label=metric)
        ax.scatter(flagged.index, flagged, color='red', label='Anomaly')
        ax.set_title(f'{metric} with Anomalies (IQR)')
        ax.set_ylabel(f'{metric} Values')
        # NOTE(review): the x-axis is the row index; the 'Timestamp' label
        # assumes rows are in chronological order - confirm.
        ax.set_xlabel('Timestamp')
        ax.legend()
        plt.show()
        # Release the figure so it does not linger once it has been shown.
        plt.close(fig)

    # A row is anomalous overall when two or more metrics are flagged on it.
    df['anomaly'] = (df[flag_columns].sum(axis=1) >= 2).astype(int)

    print(df[['anomaly'] + flag_columns])

    percentage_anomalies = (df['anomaly'].sum() / len(df)) * 100

    print("Anomaly percentage:", percentage_anomalies, "%")

    df.to_csv(output_file, index=False)

if __name__ == "__main__":
    # Command-line entry point: read the dataset path and hand it to main().
    parser = argparse.ArgumentParser(
        description='Label anomalies in a metrics dataset using the IQR rule.'
    )
    parser.add_argument('dataset_file', type=str, help='Path to the dataset file')
    args = parser.parse_args()

    main(args.dataset_file)


In [None]:
!python labelAnomaly.py dataset.csv
