# Extract Messages from ANX

In [10]:
!pip install chardet scikit-learn
!pip install "accelerate>=0.26.0"



In [11]:
# Smoke test: these imports raise ImportError right away if transformers or
# torch is missing from the kernel's environment.
# NOTE(review): AutoModelForTokenClassification is not used anywhere in the
# visible cells (training uses AutoModelForSequenceClassification) - confirm
# whether this import is still needed.
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Reaching this line means both imports succeeded.
print("Hugging Face and PyTorch are ready!")

Hugging Face and PyTorch are ready!


In [12]:
import os

# List all .anx files in the current directory (extension matched case-insensitively).
# os.listdir returns names in arbitrary order, so sort them: later cells open
# anx_files[0], and that choice should not depend on the filesystem.
anx_files = sorted(f for f in os.listdir('.') if f.lower().endswith('.anx'))
print("Found .anx files:", anx_files)

Found .anx files: ['2025012700_SRRS0021.ANX']


In [13]:
import chardet

# Let chardet guess the encoding of the first .anx file.
# The file is opened in binary mode and read in full, so the guess is based on
# every byte of the file, not just a leading sample.
with open(anx_files[0], 'rb') as raw_file:
    raw_bytes = raw_file.read()

result = chardet.detect(raw_bytes)
print(result)

{'encoding': 'MacRoman', 'confidence': 0.7161419164734713, 'language': ''}


In [15]:
# chardet's guess was MacRoman. That is odd, but decoding with it succeeds;
# the file likely contains bad bytes with no single "good" encoding.
first_anx = anx_files[0]
with open(first_anx, mode='r', encoding='mac_roman') as anx_handle:
    data = anx_handle.read()

# Preview the first 500 characters of the decoded text.
preview = data[:500]
print(preview)

****0000000157****
SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT

****0000001421****
SRPA40 KWOH 262348

RRSHFO

:&&HADS SOR REPORT FOR USER HFO

.A LSFG8 20250126 DH2325/QRIRA 0.00088

.A LSFG8 20250126 DH2330/QRIRA 0.00091

.A LSFG8 20250126 DH2335/QRIRA 0.00088

.A IMOG8 20250126 DH2325/QRIRA 0.00319

.A IMOG8 20250126 DH2330/QRIRA 0.00319

.A IMOG8 20250126 DH2335/QRIRA 0.00319

.A MAUG8 2


In [17]:
import os

# Path of the ANX file to inspect; edit this to point at a different file.
file_path = './2025012700_SRRS0021.ANX'

# Decode the whole file as text with the encoding chardet reported (MacRoman).
with open(file_path, mode='r', encoding='mac_roman') as anx_handle:
    content = anx_handle.read()

# Show the first 2000 characters so the message structure can be inspected.
head = content[:2000]
print(head)

****0000000157****
SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT

****0000001421****
SRPA40 KWOH 262348

RRSHFO

:&&HADS SOR REPORT FOR USER HFO

.A LSFG8 20250126 DH2325/QRIRA 0.00088

.A LSFG8 20250126 DH2330/QRIRA 0.00091

.A LSFG8 20250126 DH2335/QRIRA 0.00088

.A IMOG8 20250126 DH2325/QRIRA 0.00319

.A IMOG8 20250126 DH2330/QRIRA 0.00319

.A IMOG8 20250126 DH2335/QRIRA 0.00319

.A MAUG8 20250126 DH2325/QRIRA 0.00094

.A MAUG8 20250126 DH2330/QRIRA 0.00116

.A MAUG8 20250126 DH2335/QRIRA 0.00094

.A UGUG8 20250126 DH2325/QRIRA 0.01140

.A UGUG8 20250126 DH2330/QRIRA 0.01140

.A UGUG8 20250126 DH2335/QRIRA 0.01140

.A PAGG8 20250126 DH2325/QRIRA 0.00617

.A PAGG8 20250126 DH2330/QRIRA 0.00617

.A PAGG8 20250126 DH2335/QRIRA 0.00617

.A LSFG8 20250126 DH2325/HGIRA 10.56

.A LSFG8 20250126 DH2330/HGIRA 10.57

.A LSFG8 20250126 DH2335/HGIRA 10.56

.A IMOG8 20250126 DH2325/HGIRA 9.14


In [18]:
import re

# A message header is four asterisks, ten digits, four asterisks,
# e.g. "****0000000157****".
header_pattern = re.compile(r'\*{4}\d{10}\*{4}')

# Cut the decoded file at every header; the pieces in between are the messages.
raw_messages = header_pattern.split(content)

# Keep only pieces that still contain something after stripping whitespace
# (str.strip returns an empty, falsy string for blank pieces).
messages = list(filter(str.strip, raw_messages))

print(f"Found {len(messages)} messages.")
print("First message preview:")
first_message = messages[0]
print(first_message[:500])

Found 3623 messages.
First message preview:

SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT




# Cluster Messages

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import collections

# Input: `messages`, the list of non-empty message strings obtained by splitting
# the decoded .anx content on its headers.
# To experiment on a subset, use a slice here instead (messages[:N]).
sample_messages = messages

# Vectorize the messages using TF-IDF (English stop words removed, at most 2000 terms).
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
X = vectorizer.fit_transform(sample_messages)

# Number of clusters is a tunable choice; random_state is fixed to make the run repeatable.
# The clustering parameters are deliberately left as they were: the label names
# assigned in the next section are tied to the cluster ids this run produces.
num_clusters = 8
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
cluster_labels = kmeans.labels_

# Convert the NumPy label scalars to plain ints once, so the printed distribution
# reads {1: 304, ...} rather than {np.int32(1): 304, ...}.
cluster_ids = [int(label) for label in cluster_labels]
cluster_counts = collections.Counter(cluster_ids)
print("Cluster distribution:", dict(cluster_counts))

# Group the messages by cluster in a single pass instead of rebuilding a
# NumPy string array of every message for each cluster.
messages_by_cluster = collections.defaultdict(list)
for msg, cluster_id in zip(sample_messages, cluster_ids):
    messages_by_cluster[cluster_id].append(msg)

# Preview settings: how many messages to show per cluster, and how many
# characters of each message.
preview_per_cluster = 1
preview_chars = 300

# For each cluster, print a few sample messages for inspection.
for cluster in range(num_clusters):
    print(f"\nCluster {cluster} (Count: {cluster_counts[cluster]}):")
    cluster_messages = messages_by_cluster[cluster]
    for i, msg in enumerate(cluster_messages[:preview_per_cluster]):
        print(f"Message {i+1}:")
        print(msg[:preview_chars])
        print("-----")

Cluster distribution: {np.int32(1): 304, np.int32(4): 1025, np.int32(7): 851, np.int32(5): 279, np.int32(3): 302, np.int32(6): 277, np.int32(2): 456, np.int32(0): 129}

Cluster 0 (Count: 129):
Message 1:

SRUS54 KAMA 262354

HMLAMA

<?xml version="1.0" standalone="yes"?>

<site xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

      generationtime="2025-01-26T23:54:04+00:00"

      timezone="CST6CDT"

      originator="AMA"

      name="Canadian River 19 N Amarillo 19N"

      id="AMAT2"

     
-----

Cluster 1 (Count: 304):
Message 1:

SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT


-----

Cluster 2 (Count: 456):
Message 1:

SRUS76 KWOH 262348

RRSAMA

:&&HADS SOR REPORT FOR USER AMA

.A DCMT2 20250126 DH2300/PPHRG 0.00

.A SPRT2 20250126 DH2300/PPHRG 0.00

:END OF REPORT


-----

Cluster 3 (Count: 302):
Message 1:

SNID01 WIIX 262300 RRA

AAXX 26234

96011 42460 52101 10240 20

# Add Sensible Labels

In [20]:
# Hand-written mapping from k-means cluster id to a descriptive label.
# The ids are only meaningful for the clustering run that produced `cluster_labels`;
# re-check these names whenever the data, the number of clusters or the random
# seed changes.
cluster_to_label = {
    0: "XML_SITE_REPORT",
    1: "SOR_REPORT_TEXT",
    2: "BASIC_PRECIP_REPORT",
    3: "NUMERIC_MATRIX_REPORT",
    4: "DETAILED_FORECAST_MATRIX",
    5: "AVIATION_METAR",
    6: "AVIATION_SPECIALIZED_REPORT",
    7: "SOR_REPORT_RHA"
}

# Fail with a clear message (still a KeyError) if clustering produced an id
# that has no name in the mapping above.
unmapped_ids = sorted({int(cl) for cl in cluster_labels} - set(cluster_to_label))
if unmapped_ids:
    raise KeyError(f"No label defined for cluster id(s): {unmapped_ids}")

# Create a list of assigned labels, one per sample message, in message order.
assigned_labels = [cluster_to_label[int(cl)] for cl in cluster_labels]

# Preview the first few messages with their labels. Slicing plus zip stops at
# the end of the data, so fewer than `preview_count` messages no longer raises
# IndexError.
preview_count = 7
preview_rows = zip(sample_messages[:preview_count], cluster_labels, assigned_labels)
for i, (msg, cl, label) in enumerate(preview_rows):
    print(f"Message {i+1} (Cluster {cl}): {label}")
    print(msg[:300])  # preview the first 300 characters
    print("-----")

Message 1 (Cluster 1): SOR_REPORT_TEXT

SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT


-----
Message 2 (Cluster 4): DETAILED_FORECAST_MATRIX

SRPA40 KWOH 262348

RRSHFO

:&&HADS SOR REPORT FOR USER HFO

.A LSFG8 20250126 DH2325/QRIRA 0.00088

.A LSFG8 20250126 DH2330/QRIRA 0.00091

.A LSFG8 20250126 DH2335/QRIRA 0.00088

.A IMOG8 20250126 DH2325/QRIRA 0.00319

.A IMOG8 20250126 DH2330/QRIRA 0.00319

.A IMOG8 20250126 DH2335/QRIRA 0.00319
-----
Message 3 (Cluster 7): SOR_REPORT_RHA

SRUS21 KWOH 262348

RRSRHA

:&&HADS SOR REPORT FOR USER RHA

.A RCJD2 20250126 DH2310/TWIRB 32.00

.A RCJD2 20250126 DH2310/QRIRA -9999

.A RCJD2 20250126 DH2310/WCIRB 1570.00

.A RCJD2 20250126 DH2310/WOIRB 15.00

.A RCJD2 20250126 DH2310/WPIRB 7.80

.A RCJD2 20250126 DH2310/WTIRB 3.50

.A RCJD2 2
-----
Message 4 (Cluster 7): SOR_REPORT_RHA

SRUS47 KWOH 262348

RRSLWX

:&&HADS SOR REPORT FOR USER LWX

.A

In [21]:
import pandas as pd

# Inputs (one entry per message, aligned by position):
#   sample_messages - the message strings
#   cluster_labels  - the cluster number of each message
#   assigned_labels - the descriptive label of each message

# Assemble the three columns, then build the frame in one step.
columns = {
    'message': sample_messages,
    'cluster': cluster_labels,
    'label': assigned_labels,
}
df = pd.DataFrame(columns)

# Write the frame to CSV without the index column, using backslash as the escape character.
output_csv = "labeled_dataset.csv"
df.to_csv(output_csv, index=False, escapechar="\\")
print("Labeled dataset saved to labeled_dataset.csv")

Labeled dataset saved to labeled_dataset.csv


# Train Model

The labeled data has been saved to `labeled_dataset.csv`. This section:
- loads the CSV
- trains a classification model on the labeled data

In [4]:
# Fine-tune a sequence classifier on the labeled messages stored in labeled_dataset.csv.
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Step 1: Load the dataset from CSV
# Load the CSV into a Hugging Face dataset; adjust the file path as needed.
dataset = load_dataset("csv", data_files={"data": "labeled_dataset.csv"}, split="data")
print(dataset)

# Train-test split: 80% for training, 20% for testing (fixed seed for repeatability).
split_dataset = dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    "train": split_dataset["train"],
    "test": split_dataset["test"]
})

# Step 2: Process Labels
# Collect labels from BOTH splits. Building the mapping from the train split alone
# would raise KeyError in encode_labels for any label that the random split
# happened to place only in the test set.
labels = [label for split in dataset.values() for label in split["label"]]
unique_labels = sorted(set(labels))
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}
print("Label Mapping:", label2id)


def encode_labels(example):
    """Add a numeric 'labels' field derived from the example's string 'label'.

    Args:
        example: one dataset row; must contain a 'label' key present in label2id.

    Returns:
        The same row with example["labels"] set to the integer class id.
    """
    example["labels"] = label2id[example["label"]]
    return example


dataset = dataset.map(encode_labels)

# Step 3: Tokenize the messages
model_name = "distilbert-base-uncased"  # You can change this to your preferred model.
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_function(examples):
    """Tokenize a batch of rows by their 'message' field.

    Long messages are truncated and every example is padded with
    padding="max_length", so all examples come out the same length.
    """
    return tokenizer(examples["message"], truncation=True, padding="max_length")


tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Remove columns that aren't used for training; the numeric target is already
# stored under the name 'labels', so no rename is needed.
tokenized_dataset = tokenized_dataset.remove_columns(["message", "cluster", "label"])
tokenized_dataset.set_format("torch")

# Step 4: Load the Model
# The label mappings are passed to the model so they are available alongside it.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),
    id2label=id2label,
    label2id=label2id
)

# Step 5: Set Up the Trainer
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` argument to `processing_class`.
# Confirm against the installed version before changing either name.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # evaluate on the test split after every epoch
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
)

# Step 6: Train the Model
trainer.train()

# Save the trained model and tokenizer side by side for later inference.
model.save_pretrained("./trained_classifier")
tokenizer.save_pretrained("./trained_classifier")

Dataset({
    features: ['message', 'cluster', 'label'],
    num_rows: 3623
})
Label Mapping: {'AVIATION_METAR': 0, 'AVIATION_SPECIALIZED_REPORT': 1, 'BASIC_PRECIP_REPORT': 2, 'DETAILED_FORECAST_MATRIX': 3, 'NUMERIC_MATRIX_REPORT': 4, 'SOR_REPORT_RHA': 5, 'SOR_REPORT_TEXT': 6, 'XML_SITE_REPORT': 7}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,0.21457
2,0.495600,0.17716
3,0.125400,0.130314


('./trained_classifier/tokenizer_config.json',
 './trained_classifier/special_tokens_map.json',
 './trained_classifier/vocab.txt',
 './trained_classifier/added_tokens.json',
 './trained_classifier/tokenizer.json')