# Label using Spacy NER

# Extract Messages from ANX

In [45]:
#!pip install chardet scikit-learn
#!pip install "accelerate>=0.26.0"
#!pip install spacy
!pip install spacy-lookups-data
!pip install --upgrade spacy-lookups-data



In [12]:
# get language model
!python -m spacy download en_core_web_sm
!python -m spacy download en_core_web_lg

Collecting en-core-web-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [1]:
# check spacy
import spacy

nlp = spacy.load("en_core_web_sm")  # Load a small English model
doc = nlp("STN: KATL TEMP: 23.4C PPHRG: 5.2mm TIME: 2025-02-17T10:00Z")

for ent in doc.ents:
    print(ent.text, ent.label_)

KATL ORG
23.4C CARDINAL
5.2 CARDINAL
2025-02-17T10:00Z DATE


In [2]:
import os

# List all .anx files in the current directory
anx_files = [f for f in os.listdir('.') if f.lower().endswith('.anx')]
print("Found .anx files:", anx_files)

Found .anx files: ['2025012700_SRRS0021.ANX']


In [10]:
import chardet

with open(anx_files[0], 'rb') as file:
    result = chardet.detect(file.read(10000000))  # Read first 10KB to guess encoding
    print(result)

{'encoding': 'ascii', 'confidence': 1.0, 'language': ''}


In [33]:
# weird but detected and worked. likely just bad data with no "good" encoding
with open(anx_files[0], 'r', encoding='mac_roman') as file:
    data = file.read()

print(data[:200])  # Display the first 500 characters

****0000000157****
SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT

****0000001421****
SRPA40 KWOH


In [12]:
import os

# Change the filename as needed
file_path = './2025012700_SRRS0021.ANX'

# Read the file using the detected encoding (MacRoman)
with open(file_path, 'r', encoding='mac_roman') as f:
    content = f.read()

# Print the first 2000 characters to inspect the structure
print(content[:200])

****0000000157****
SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT

****0000001421****
SRPA40 KWOH


In [13]:
import re

# Header pattern, e.g. for a header like "****0000000157****"
header_pattern = re.compile(r'\*{4}\d{10}\*{4}')

# Split the file content
raw_messages = header_pattern.split(content)

# Filter out empty strings (or strings that are just whitespace)
messages = [msg for msg in raw_messages if msg.strip() != ""]

print(f"Found {len(messages)} messages.")
print("First message preview:")
print(messages[0][:500])

Found 3623 messages.
First message preview:

SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT




# Cluster Messages

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import collections

# Assume 'messages' is a list of non-empty message strings obtained from your .anx files
# For example, you might have done something like:
# raw_messages = header_pattern.split(content)
# messages = [msg for msg in raw_messages if msg.strip() != ""]

# If you have a lot of messages, you might want to work on a sample:
sample_messages = messages  # or use messages[:N] for some N

# Vectorize the messages using TF-IDF
vectorizer = TfidfVectorizer(stop_words='english', max_features=2000)
X = vectorizer.fit_transform(sample_messages)

# Choose the number of clusters (you may experiment with this value)
num_clusters = 8
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X)
cluster_labels = kmeans.labels_

# Print the distribution of messages across clusters
cluster_counts = collections.Counter(cluster_labels)
print("Cluster distribution:", dict(cluster_counts))

# For each cluster, print a few sample messages for inspection
for cluster in range(num_clusters):
    print(f"\nCluster {cluster} (Count: {cluster_counts[cluster]}):")
    # Select messages for this cluster
    cluster_messages = np.array(sample_messages)[cluster_labels == cluster]
    for i, msg in enumerate(cluster_messages[:1]):  # print first 3 messages from each cluster
        print(f"Message {i+1}:")
        print(msg[:300])  # print first 300 characters for preview
        print("-----")

Cluster distribution: {np.int32(1): 304, np.int32(4): 1025, np.int32(7): 851, np.int32(5): 279, np.int32(3): 302, np.int32(6): 277, np.int32(2): 456, np.int32(0): 129}

Cluster 0 (Count: 129):
Message 1:

SRUS54 KAMA 262354

HMLAMA

<?xml version="1.0" standalone="yes"?>

<site xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"

      generationtime="2025-01-26T23:54:04+00:00"

      timezone="CST6CDT"

      originator="AMA"

      name="Canadian River 19 N Amarillo 19N"

      id="AMAT2"

     
-----

Cluster 1 (Count: 304):
Message 1:

SRUS32 KWOH 262348

RRSACR

:&&HADS SOR REPORT FOR USER ACR

.A ALRP7 20250126 DH2330/PPCRA 0.00

.A SRGP7 20250126 DH2330/PPCRA 0.01

:END OF REPORT


-----

Cluster 2 (Count: 456):
Message 1:

SRUS76 KWOH 262348

RRSAMA

:&&HADS SOR REPORT FOR USER AMA

.A DCMT2 20250126 DH2300/PPHRG 0.00

.A SPRT2 20250126 DH2300/PPHRG 0.00

:END OF REPORT


-----

Cluster 3 (Count: 302):
Message 1:

SNID01 WIIX 262300 RRA

AAXX 26234

96011 42460 52101 10240 20

# Add Sensible Labels

In [27]:
import random

cluster_to_label = {
    0: "XML_SITE_REPORT",
    1: "SOR_REPORT_TEXT",
    2: "BASIC_PRECIP_REPORT",
    3: "NUMERIC_MATRIX_REPORT",
    4: "DETAILED_FORECAST_MATRIX",
    5: "AVIATION_METAR",
    6: "AVIATION_SPECIALIZED_REPORT",
    7: "SOR_REPORT_RHA"
}

# Create a list of assigned labels for each sample message:
assigned_labels = [cluster_to_label[int(cl)] for cl in cluster_labels]

# Optionally, print the first few sample messages with their assigned labels:
for i in random.sample(range(3000), 5):
    print(f"---- Message {i+1} (Cluster {cluster_labels[i]}): {assigned_labels[i]}")
    print("\n".join(filter(str.strip, sample_messages[i][:70].splitlines())))
    #print(sample_messages[i][:100])  # preview the first 300 characters
    

---- Message 360 (Cluster 6): AVIATION_SPECIALIZED_REPORT
SRUS69 KWOH 262349
RRSLCH
:&&AFWS REPORT FOR USER LCH
.E JZVT2 250
---- Message 1961 (Cluster 1): SOR_REPORT_TEXT
SXUS77 KWOH 262356
RRSEKA
:&&HADS SOR REPORT FOR USER EKA
.A ALDC1
---- Message 1220 (Cluster 3): NUMERIC_MATRIX_REPORT
SOVD83 KWNB 262200 RRX
KKYY 26015 2245/ 728138 096829 88871 83099 20
---- Message 737 (Cluster 4): DETAILED_FORECAST_MATRIX
SRUS21 KWOH 262350
RRSRHA
:&&HADS SOR REPORT FOR USER RHA
.E BWSW2
---- Message 2795 (Cluster 3): NUMERIC_MATRIX_REPORT
SMVD01 KWBC 270000 RRA
BBXX
42058 27001 99145 70752 46/// /0710 102


In [28]:
import pandas as pd

# Assume you have:
# sample_messages: a list of message strings
# cluster_labels: a list/array of cluster numbers (one for each message)
# assigned_labels: a list of descriptive labels corresponding to each message

# Create a DataFrame from your lists
df = pd.DataFrame({
    'message': sample_messages,
    'cluster': cluster_labels,
    'label': assigned_labels
})

# Save the DataFrame to a CSV file
df.to_csv("labeled_dataset.csv", index=False, escapechar="\\")
print("Labeled dataset saved to labeled_dataset.csv")

Labeled dataset saved to labeled_dataset.csv


# Train Model

Labeled data has been saved to CSV.
- load csv
- train model on labeled data

In [15]:
# Example of saving training data
#
# TRAIN_DATA.append(
#     ("SRUS32 KDFW 272348 RRSACR "
#      ".A STN1 20250227 DH1200/PPCRA 2.50 "
#      ".A STN2 20250227 DH1300/PPCRA 1.80 "
#      ".A STN3 20250227 DH1400/PPCRA 0.00",
#      {"entities": [
#          (7, 11, "STATION_ID"),       # "KDFW"
#          (12, 18, "TIMESTAMP"),       # "272348"
#          (20, 26, "REPORT_TYPE"),     # "RRSACR"

#          (29, 31, "PREFIX"),          # ".A"
#          (32, 36, "STATION_ID"),      # "STN1"
#          (37, 45, "DATE"),            # "20250227"
#          (46, 52, "TIME"),            # "DH1200"
#          (53, 65, "PRECIPITATION"),   # "PPCRA 2.50"
#      ]}
#     )
# )
# print(json.dumps(TRAIN_DATA))
# with open("srrs_train_data2.json", "w", encoding="utf-8") as file:
#     json.dump(TRAIN_DATA, file, indent=4)

TRAIN_DATA = [
    (
        "SRUS32 KWOH 262348 RRSACR :&&HADS SOR REPORT FOR USER ACR "
        ".A ALRP7 20250126 DH2330/PPCRA 0.00 "
        ".A SRGP7 20250126 DH2330/PPCRA 0.01 :END OF REPORT",
        {
            "entities": [
                (0, 6, "PRODUCT_ID"),
                (7, 11, "STATION_ID"),
                (12, 18, "TIMESTAMP"),
                (19, 25, "REPORT_TYPE"),
                (52, 55, "USER_ID"),
                (59, 64, "STATION_ID"),
                (65, 73, "DATE"),
                (74, 80, "TIME"),
                (81, 92, "PRECIPITATION"),
                (95, 100, "STATION_ID"),
                (101, 109, "DATE"),
                (110, 116, "TIME"),
                (117, 128, "PRECIPITATION")
            ]
        }
    )
]
print(json.dumps(TRAIN_DATA))
with open("srrs_train_data2.json", "w", encoding="utf-8") as file:
    json.dump(TRAIN_DATA, file, indent=4)


[["SRUS32 KWOH 262348 RRSACR :&&HADS SOR REPORT FOR USER ACR .A ALRP7 20250126 DH2330/PPCRA 0.00 .A SRGP7 20250126 DH2330/PPCRA 0.01 :END OF REPORT", {"entities": [[0, 6, "PRODUCT_ID"], [7, 11, "STATION_ID"], [12, 18, "TIMESTAMP"], [19, 25, "REPORT_TYPE"], [52, 55, "USER_ID"], [59, 64, "STATION_ID"], [65, 73, "DATE"], [74, 80, "TIME"], [81, 92, "PRECIPITATION"], [95, 100, "STATION_ID"], [101, 109, "DATE"], [110, 116, "TIME"], [117, 128, "PRECIPITATION"]]}]]


# Train and save NER model

In [16]:


import spacy
import json
from spacy.training.example import Example

# MUST LOAD with...    python -m spacy download en_core_web_sm
# small was bad at labeling "en_core_web_sm"
nlp = spacy.load("en_core_web_lg")

with open("srrs_train_data.json", "r", encoding="utf-8") as file:
    training_data = json.load(file)
print("Training Data Loaded:")
    
# raise KeyboardInterrupt  # Stops only this cell
    
optimizer = nlp.begin_training()
for epoch in range(20):  # More epochs = better learning
    for text, annotations in training_data:
        example = Example.from_dict(nlp.make_doc(text), annotations)
        nlp.update([example], drop=0.3)

# Save the trained model
nlp.to_disk("custom_srrs_ner")

print("Training complete! Model saved as 'custom_srrs_ner'")


 

Training Data Loaded:




Training complete! Model saved as 'custom_srrs_ner'


In [17]:
import json

from datasets import load_dataset, DatasetDict

dataset = load_dataset("csv", data_files={"data": "labeled_dataset.csv"}, split="data")
print(dataset)

nlp = spacy.load("custom_srrs_ner")


for index,msg in enumerate(dataset):
    if index>10:
        break
    doc = nlp(msg["message"])
    print("\n📌 Inferred Entities:")
    for ent in doc.ents:
        print(f"Entity: {ent.text} | Label: {ent.label_}")
    
raise KeyboardInterrupt  # Stops only this cell


Dataset({
    features: ['message', 'cluster', 'label'],
    num_rows: 3623
})

📌 Inferred Entities:
Entity: SRUS32 | Label: PRODUCT_ID
Entity: KWOH | Label: STATION_ID
Entity: 262348 | Label: TIMESTAMP
Entity: RRSACR | Label: REPORT_TYPE

📌 Inferred Entities:
Entity: SRPA40 | Label: TIMESTAMP
Entity: KWOH | Label: STATION_ID
Entity: 262348 | Label: TIMESTAMP

📌 Inferred Entities:
Entity: SRUS21 | Label: PRODUCT_ID
Entity: KWOH | Label: STATION_ID
Entity: 262348 | Label: TIMESTAMP
Entity: RRSRHA | Label: REPORT_TYPE
Entity: RHA | Label: REPORT_TYPE

📌 Inferred Entities:
Entity: SRUS47 | Label: TIMESTAMP
Entity: KWOH | Label: STATION_ID
Entity: 262348 | Label: TIMESTAMP

📌 Inferred Entities:
Entity: SRPA41 | Label: TIMESTAMP
Entity: KWOH | Label: STATION_ID
Entity: 262348 | Label: TIMESTAMP

📌 Inferred Entities:
Entity: SXUS35 | Label: PRODUCT_ID
Entity: KWOH | Label: STATION_ID
Entity: 262348 | Label: TIMESTAMP

📌 Inferred Entities:
Entity: SRUS28 | Label: TIMESTAMP
Entity: KWOH | Labe

KeyboardInterrupt: 