# Test

In [None]:
import numpy as np
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import plotly.express as px
import datetime
import typing
import requests
import time
import shutil
import json
from starvers.starvers import TripleStoreEngine
import seaborn as sns
from scipy.io import arff
import json
from rdflib import Graph, Namespace, URIRef, Literal, BNode
from rdflib.namespace import RDF, RDFS, XSD, DCTERMS
from sklearn.preprocessing import MinMaxScaler

In [None]:
def now() -> str:
    """
    Return the current UTC time as an ISO 8601 string with millisecond
    precision and a trailing "Z":
    YYYY-MM-DDTHH:MM:SS.sssZ
    """
    utc_now = datetime.datetime.now(datetime.timezone.utc)
    # Integer division truncates (does not round) microseconds to milliseconds.
    millis = utc_now.microsecond // 1000

    return f"{utc_now:%Y-%m-%dT%H:%M:%S}.{millis:03d}Z"

## Data Understanding

In [None]:
# Loading the dataset into a DataFrame

# Directory containing the dataset; an empty string means the file is
# resolved relative to the current working directory.
dataset_path = ""
# Name of the dataset file in .arff format.
file_name = 'php0FyS2T.arff'


def load_arff_data() -> pd.DataFrame:
    """
    Load the ARFF file at ``dataset_path``/``file_name`` into a DataFrame.

    If a 'Class' column exists and has object dtype (the original author
    notes that the ARFF loader often yields nominal values as bytes such as
    b'1'), it is decoded as UTF-8 and cast to int. All other columns are
    returned exactly as loaded.
    """
    arff_path = os.path.join(dataset_path, file_name)

    # loadarff returns (records, metadata); only the records are needed here.
    records, _ = arff.loadarff(arff_path)
    frame = pd.DataFrame(records)

    has_object_labels = 'Class' in frame.columns and frame['Class'].dtype == object
    if has_object_labels:
        # Turn byte labels like b'1' into integer class ids.
        frame['Class'] = frame['Class'].str.decode('utf-8').astype(int)

    return frame

# Load the ARFF dataset into the notebook-wide DataFrame `df`.
df = load_arff_data()

# Show the first rows as a quick sanity check.
# NOTE(review): `display` is not imported in this file; presumably the
# IPython/Jupyter builtin - this line fails outside a notebook kernel.
display(df.head())



In [None]:
# Data Analysis

def analyze_dataset(df: pd.DataFrame):
    """
    Print an overview of a dataset and plot its class distribution and
    feature correlations.

    The target column is 'Class' when such a column exists; otherwise the
    last column of ``df`` is assumed to be the target.

    Parameters
    ----------
    df : pd.DataFrame
        Dataset including the target column.

    Returns
    -------
    stats : pd.DataFrame
        Transposed ``describe()`` output of all numeric columns (a numeric
        target column is included), extended by a 'range' column (max - min).
    class_counts : pd.Series
        Number of rows per distinct value of the target column.
    """
    print("=== Dataset Overview ===")
    n_instances, n_attributes = df.shape
    print(f"Number of instances: {n_instances}")
    print(f"Number of attributes: {n_attributes}")

    # Determine the target column; fall back to the last column when there
    # is no column named 'Class'.
    if 'Class' in df.columns:
        target_col = 'Class'
    else:
        target_col = df.columns[-1]
        print(f"Target column assumed to be: '{target_col}'")

    print("\n=== Attribute Types ===")
    print(df.dtypes.value_counts())

    print("\n=== Missing Values ===")
    missing_counts = df.isnull().sum()
    total_missing = missing_counts.sum()
    print(f"Total missing values: {total_missing}")
    if total_missing > 0:
        # Only list the columns that actually contain missing values.
        print(missing_counts[missing_counts > 0])

    print("\n=== Value Ranges & Statistics ===")
    # All numeric columns; this includes the target when it is numeric
    # (e.g. an integer-encoded 'Class').
    numeric_df = df.select_dtypes(include=[np.number])
    stats = numeric_df.describe().T
    stats['range'] = stats['max'] - stats['min']
    display(stats[['min', 'max', 'mean', 'std', 'range']].head())

    print("\n=== Sparsity ===")
    zero_counts = (numeric_df == 0).sum().sum()
    total_cells = numeric_df.size
    sparsity = zero_counts / total_cells
    print(f"Sparsity (percentage of zeros): {sparsity:.2%}")

    print("\n=== Class Distribution (Majority/Minority) ===")
    class_counts = df[target_col].value_counts()
    print(f"Number of classes: {len(class_counts)}")
    print(f"Min class size: {class_counts.min()}")
    print(f"Max class size: {class_counts.max()}")

    # Plotting: class distribution on the left, feature correlations on the right.
    plt.figure(figsize=(12, 5))

    # 1. Class Distribution
    plt.subplot(1, 2, 1)
    if len(class_counts) > 20:
        # Too many classes for one bar each: show how class sizes are distributed.
        sns.histplot(class_counts, bins=10, kde=False)
        plt.title('Histogram of Class Sizes')
        plt.xlabel('Number of Instances per Class')
    else:
        sns.barplot(x=class_counts.index, y=class_counts.values)
        plt.title('Class Distribution')

    # 2. Correlation Matrix (features only)
    plt.subplot(1, 2, 2)
    # A numeric target is part of numeric_df; drop it so the heatmap shows
    # feature-feature correlations only, as the title states.
    feature_df = numeric_df.drop(columns=[target_col], errors='ignore')
    # Limit to the first 20 features to keep the plot readable.
    corr_matrix = feature_df.iloc[:, :20].corr()
    if corr_matrix.empty:
        # No numeric feature columns remain; show a note instead of
        # passing an empty matrix to the heatmap.
        plt.text(0.5, 0.5, 'No numeric features', ha='center', va='center')
        plt.axis('off')
    else:
        sns.heatmap(corr_matrix, cmap='coolwarm', annot=False)
    plt.title('Correlation Matrix (First 20 Features)')

    plt.tight_layout()
    plt.show()

    return stats, class_counts

# Run the analysis on the loaded DataFrame and keep the returned
# statistics table and per-class instance counts.
stats, class_counts = analyze_dataset(df)



## TASK 1 (Train a regular SOM)

In [None]:
# Task 1