##### 1-check_the_data - This Jupyter notebook checks the initial data for a Named Entity Recognition (NER) task. It loads and preprocesses text into words and tags, creates a pandas DataFrame, and analyzes unique tags and token lengths to ensure data quality before further processing.

In [62]:
from functions import prepare_data
import pandas as pd
from datasets import Dataset


In [63]:

def load_data(file_path):
    sentences = []
    sentence = []
    
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()  # Remove whitespace from the start and end of the line
            if not line:  # If the line is empty
                if sentence:  # If there is a current sentence
                    sentences.append(sentence)  # Add the current sentence to the list
                    sentence = []  # Reset the current sentence
            else:
                parts = line.split()  # Split the line into parts
                if len(parts) == 2:  # Ensure there are exactly two parts
                    word, tag = parts  # Unpack the word and tag
                    sentence.append((word, tag))  # Add the word and tag to the current sentence
                else:
                    pass

    # Add the last sentence if it exists
    if sentence:
        sentences.append(sentence)

    return sentences

def preprocess(sentences):

    words = []
    tags = []
    
    for sentence in sentences:
        words.append([word for word, tag in sentence])
        tags.append([tag for word, tag in sentence])
    
    return words, tags

def create_dataframe(words, tags):

    df_data = {'Word': [], 'Tag': []}
    
    for sentence_words, sentence_tags in zip(words, tags):
        df_data['Word'].extend(sentence_words)
        df_data['Tag'].extend(sentence_tags)
        df_data['Word'].append('')  
        df_data['Tag'].append('')
    
    return pd.DataFrame(df_data)

train_file_path = '/Users/guybasson/works_assigments/sleek_ml_engineer/datasets/train.txt'
validation_file_path = '/Users/guybasson/works_assigments/sleek_ml_engineer/datasets/valid.txt'
test_file_path = '/Users/guybasson/works_assigments/sleek_ml_engineer/datasets/test.txt'
train_sentences = load_data(train_file_path)
train_words, train_tags = preprocess(train_sentences)

train_df = create_dataframe(train_words, train_tags)

print(train_df.head(20))


             Word        Tag
0             The          O
1       admin@338  B-HackOrg
2             has          O
3         largely          O
4        targeted          O
5   organizations          O
6        involved          O
7              in          O
8       financial     B-Idus
9               ,          O
10       economic     B-Idus
11            and          O
12          trade     B-Idus
13         policy     I-Idus
14              ,          O
15      typically          O
16          using          O
17       publicly     B-Tool
18      available     I-Tool
19           RATs     I-Tool


In [64]:
# unique tags
train_df['Tag'].unique()

array(['O', 'B-HackOrg', 'B-Idus', 'I-Idus', 'B-Tool', 'I-Tool', '',
       'B-Area', 'I-Area', 'B-Org', 'I-Org', 'I-HackOrg', 'B-Time',
       'I-Time', 'B-Way', 'I-Way', 'B-OffAct', 'B-SamFile', 'B-Features',
       'I-Features', 'I-OffAct', 'B-SecTeam', 'B-Exp', 'I-Exp',
       'I-SecTeam', 'B-Purp', 'I-Purp', 'I-SamFile'], dtype=object)

In [65]:
# count the number of unique tags
train_df['Tag'].value_counts()

Tag
O             110432
                5251
B-HackOrg       3419
B-Tool          2449
B-Area          2171
B-OffAct        1412
I-Tool          1386
B-Idus          1349
B-Time          1328
I-Purp          1231
B-SamFile       1221
I-Features      1178
B-Org           1113
B-Exp           1068
B-SecTeam        997
I-HackOrg        977
I-Org            958
I-OffAct         851
B-Way            828
I-Area           816
B-Features       812
I-Time           794
I-Way            785
B-Purp           721
I-SamFile        556
I-Exp            538
I-Idus           482
I-SecTeam        473
Name: count, dtype: int64

In [66]:
train_df['Tag'].value_counts().index

Index(['O', '', 'B-HackOrg', 'B-Tool', 'B-Area', 'B-OffAct', 'I-Tool',
       'B-Idus', 'B-Time', 'I-Purp', 'B-SamFile', 'I-Features', 'B-Org',
       'B-Exp', 'B-SecTeam', 'I-HackOrg', 'I-Org', 'I-OffAct', 'B-Way',
       'I-Area', 'B-Features', 'I-Time', 'I-Way', 'B-Purp', 'I-SamFile',
       'I-Exp', 'I-Idus', 'I-SecTeam'],
      dtype='object', name='Tag')

In [67]:
# up[load rad data as str 
with open(train_file_path, "r", encoding="utf-8") as file:
    train_raw_data = file.read()

# do the same for the test data
with open(test_file_path, "r", encoding="utf-8") as file:
    test_raw_data = file.read()

# do the same for the validation data
with open(validation_file_path, "r", encoding="utf-8") as file:
    val_raw_data = file.read()


In [68]:
# Step 1: Filter out empty lines
def check_max_tokens(data):
    max_lines = 0
    current_count = 0

    # Step 2: Iterate through raw data
    for line in data:
        if line.strip():  # If the line is not empty
            current_count += 1
        else:  # If there's an empty line, check max and reset current count
            max_lines = max(max_lines, current_count)
            current_count = 0
    return max_lines, current_count        

# Check for the last group if it wasn't followed by an empty line
max_lines_train, current_count_train = check_max_tokens(train_raw_data)
max_tokens_train = max(max_lines_train, current_count_train)

max_lines_test, current_count_test = check_max_tokens(test_raw_data)
max_tokens_test = max(max_lines_test, current_count_test)

max_lines_val, current_count_val = check_max_tokens(val_raw_data)
max_tokens_val = max(max_lines_val, current_count_val)

In [69]:
print(f"Max tokens in train: {max_tokens_train}")
print(f"Max tokens in test: {max_tokens_test}")
print(f"Max tokens in val: {max_tokens_val}")

Max tokens in train: 65
Max tokens in test: 40
Max tokens in val: 65


In [70]:
test_file_path = '/Users/guybasson/works_assigments/sleek_ml_engineer/datasets/test.txt'
val_file_path = '/Users/guybasson/works_assigments/sleek_ml_engineer/datasets/valid.txt'

# make val_raw_data and test_raw_data
with open(test_file_path, "r", encoding="utf-8") as file:
    test_raw_data = file.read()

with open(val_file_path, "r", encoding="utf-8") as file:
    val_raw_data = file.read()

In [71]:

train_prepared_data = prepare_data(train_raw_data)[:400]
test_prepared_data = prepare_data(test_raw_data)[:80]
val_prepared_data = prepare_data(val_raw_data)

In [72]:
len(train_prepared_data), len(val_prepared_data)

(400, 662)

In [73]:

# Display the first prepared sentence
print(train_prepared_data[0])

{'tokens': ['The', 'admin@338', 'has', 'largely', 'targeted', 'organizations', 'involved', 'in', 'financial', ',', 'economic', 'and', 'trade', 'policy', ',', 'typically', 'using', 'publicly', 'available', 'RATs', 'such', 'as', 'Poison', 'Ivy', ',', 'as', 'well', 'some', 'non-public', 'backdoors', '.'], 'labels': ['O', 'B-HackOrg', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Idus', 'O', 'B-Idus', 'O', 'B-Idus', 'I-Idus', 'O', 'O', 'O', 'B-Tool', 'I-Tool', 'I-Tool', 'O', 'O', 'B-Tool', 'I-Tool', 'O', 'O', 'O', 'O', 'B-Tool', 'I-Tool', 'O']}


In [74]:

# Create a Hugging Face Dataset from the prepared data
train_dataset = Dataset.from_list(train_prepared_data)
val_dataset = Dataset.from_list(val_prepared_data)

In [75]:
# Display the first prepared sentence as a Hugging Face Dataset
print(train_dataset[0])
print(val_dataset[0])

{'tokens': ['The', 'admin@338', 'has', 'largely', 'targeted', 'organizations', 'involved', 'in', 'financial', ',', 'economic', 'and', 'trade', 'policy', ',', 'typically', 'using', 'publicly', 'available', 'RATs', 'such', 'as', 'Poison', 'Ivy', ',', 'as', 'well', 'some', 'non-public', 'backdoors', '.'], 'labels': ['O', 'B-HackOrg', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Idus', 'O', 'B-Idus', 'O', 'B-Idus', 'I-Idus', 'O', 'O', 'O', 'B-Tool', 'I-Tool', 'I-Tool', 'O', 'O', 'B-Tool', 'I-Tool', 'O', 'O', 'O', 'O', 'B-Tool', 'I-Tool', 'O']}
{'tokens': ['We', 'believe', 'that', 'these', 'industries', 'have', 'also', 'been', 'targeted', 'as', 'part', 'of', 'a', 'larger', 'supply-chain', 'attack', 'in', 'order', 'for', 'Orangeworm', 'to', 'get', 'access', 'to', 'their', 'intended', 'victims', 'related', 'to', 'healthcare', '.'], 'labels': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-OffAct', 'I-OffAct', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Id