In [None]:
import csv
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize the text
    words = word_tokenize(text)

    # Remove stopwords and lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]

    return words

def process_rule(rule):
    # Preprocess the rule's text
    preprocessed_rule = preprocess_text(rule['Rules'])

    # Combine preprocessed words into a single string
    preprocessed_words = ', '.join(preprocessed_rule)

    return rule['Rules'], preprocessed_words

def read_rules_dataset(csv_file):
    rules_dataset = []
    with open(csv_file, 'r', encoding='utf-8') as file:
        reader = csv.DictReader(file)
        for row in reader:
            rules_dataset.append(row)
    return rules_dataset

def write_to_csv(processed_rules, output_file):
    with open(output_file, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Rule', 'Preprocessed Words'])
        for rule in processed_rules:
            writer.writerow([rule[0], rule[1]])

# Example usage:
input_csv_file = 'dataset.csv'
output_csv_file = 'Before-NLP.csv'

# Read rules dataset from CSV file
rules_dataset = read_rules_dataset(input_csv_file)

# Process rules dataset
processed_rules = [process_rule(rule) for rule in rules_dataset]

# Write categorized rules to CSV file
write_to_csv(processed_rules, output_csv_file)

In [None]:
import pandas as pd
import spacy
import numpy as np
import cupy as cp

# Load the NLP model on GPU
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
nlp.enable_pipe("senter")
nlp.enable_pipe("tagger")

# Updated lists of severity keywords
high_severity_keywords = [
    "loss or exposure",
    "significant impact",
    "mission-critical information technology systems",
    "large-scale outages",
    "denial of service attacks",
    "confidential information",
    "severely impacted the performance",
    "material and adverse impact",
    "critical failure",
    "breach",
    "system compromise",
    "security incident",
    "data breach"
    # Add more relevant terms
]
medium_severity_keywords = [
    "critical information systems",
    "provision of services",
    "intended use",
    "majority of the user's customers",
    "service degradation",
    "partial outage",
    "data loss",
    "performance degradation",
    "service interruption",
    "functionality issues",
    "data inconsistency"
    # Add more relevant terms
]
low_severity_keywords = [
    "data security",
    "valuable classified data",
    "uptime of IT systems",
    "handled",
    "relevant sectors",
    "minor inconvenience",
    "service disruption",
    "temporary downtime",
    "user inconvenience",
    "software update",
    "patch installation",
    "scheduled maintenance"
    # Add more relevant terms
]

# Move severity keywords to GPU
high_severity_keywords_gpu = cp.asarray([nlp(keyword).vector for keyword in high_severity_keywords])
medium_severity_keywords_gpu = cp.asarray([nlp(keyword).vector for keyword in medium_severity_keywords])
low_severity_keywords_gpu = cp.asarray([nlp(keyword).vector for keyword in low_severity_keywords])

def classify_severity_nlp_batch(preprocessed_words):
    severity_scores = cp.zeros((len(preprocessed_words),), dtype=int)
    for i, word in enumerate(preprocessed_words):
        word_vector = nlp(word).vector

        similarity_high = cp.max(cp.dot(cp.asarray(word_vector), cp.asarray(high_severity_keywords_gpu).T) /
                                  (cp.linalg.norm(cp.asarray(word_vector)) * cp.linalg.norm(cp.asarray(high_severity_keywords_gpu), axis=1)))
        similarity_medium = cp.max(cp.dot(cp.asarray(word_vector), cp.asarray(medium_severity_keywords_gpu).T) /
                                    (cp.linalg.norm(cp.asarray(word_vector)) * cp.linalg.norm(cp.asarray(medium_severity_keywords_gpu), axis=1)))
        similarity_low = cp.max(cp.dot(cp.asarray(word_vector), cp.asarray(low_severity_keywords_gpu).T) /
                                 (cp.linalg.norm(cp.asarray(word_vector)) * cp.linalg.norm(cp.asarray(low_severity_keywords_gpu), axis=1)))

        severity_scores[i] = int((similarity_high >= 0.3) * 3 + (similarity_medium >= 0.3) * 2 + (similarity_low >= 0.3) * 1)
    return severity_scores

# Read the CSV file
df = pd.read_csv("Before-NLP.csv")

# Batch processing for classification
batch_size = 1000
num_batches = (len(df) + batch_size - 1) // batch_size
severity_list = []
for batch_index in range(num_batches):
    start_index = batch_index * batch_size
    end_index = min((batch_index + 1) * batch_size, len(df))
    preprocessed_words_batch = [word.split(', ') for word in df['Preprocessed Words'][start_index:end_index]]

    # Flatten the list of lists
    flattened_words_batch = [word for sublist in preprocessed_words_batch for word in sublist]

    # Classify severity for the flattened list
    severity_scores = classify_severity_nlp_batch(flattened_words_batch)

    # Convert severity scores to NumPy array for compatibility with np.where
    severity_scores_np = cp.asnumpy(severity_scores)

    # Assign severity labels using NumPy's where function
    severity_labels = np.where(severity_scores_np >= 6, "High", np.where(severity_scores_np >= 3, "Medium", "Low")).tolist()

    # Extend severity list while taking into account the indices
    severity_list.extend(severity_labels[start_index:end_index])

# Assigning severity to DataFrame
df['Severity'] = severity_list

# Save the updated DataFrame
df.to_csv("after_nlp.csv", index=False)

In [None]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset
df = pd.read_csv("after_nlp.csv")

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['Preprocessed Words'], df['Severity'], test_size=0.1, random_state=42)

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)  # Move model to GPU if available

# Tokenize input data
max_length = 128
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Create DataLoader for training dataset
train_dataset = TensorDataset(X_train_tokens['input_ids'].to(device), X_train_tokens['attention_mask'].to(device), torch.tensor(y_train_encoded).to(device))
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * 5  # Total number of steps (epochs * batches)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tune BERT model
model.train()
for epoch in range(5):  # Increase the number of epochs for training
    for batch in tqdm(train_dataloader):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

# Tokenize test input data
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Create DataLoader for test dataset
test_dataset = TensorDataset(X_test_tokens['input_ids'].to(device), X_test_tokens['attention_mask'].to(device), torch.tensor(y_test_encoded).to(device))
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

# Evaluate on test data
model.eval()
y_pred_bert_test = []
for batch in tqdm(test_dataloader):
    input_ids, attention_mask, labels = batch
    input_ids, attention_mask = input_ids.to(device), attention_mask.to(device)
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    _, predicted = torch.max(logits, 1)
    y_pred_bert_test.extend(predicted.tolist())

# Convert predictions back to CPU and numpy
y_pred_bert_test = torch.tensor(y_pred_bert_test).to(device).cpu().numpy()

# Evaluate test predictions
accuracy_bert_test = accuracy_score(y_test_encoded[:len(y_pred_bert_test)], y_pred_bert_test)
print()
print("BERT Model Test Accuracy:", accuracy_bert_test)
print("BERT Model Test Classification Report:")
print(classification_report(y_test_encoded[:len(y_pred_bert_test)], y_pred_bert_test))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 39/39 [00:22<00:00,  1.72it/s]
100%|██████████| 39/39 [00:23<00:00,  1.70it/s]
100%|██████████| 39/39 [00:23<00:00,  1.67it/s]
100%|██████████| 39/39 [00:23<00:00,  1.64it/s]
100%|██████████| 39/39 [00:24<00:00,  1.62it/s]
100%|██████████| 5/5 [00:00<00:00,  5.39it/s]


BERT Model Test Accuracy: 0.7518248175182481
BERT Model Test Classification Report:
              precision    recall  f1-score   support

           0       0.75      1.00      0.86       103
           1       0.00      0.00      0.00        13
           2       0.00      0.00      0.00        21

    accuracy                           0.75       137
   macro avg       0.25      0.33      0.29       137
weighted avg       0.57      0.75      0.65       137




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Checking validation accuracy and Training accuracy for each Epoch


In [1]:
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the dataset
df = pd.read_csv("after_nlp.csv")

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(df['Preprocessed Words'], df['Severity'], test_size=0.1, random_state=42)

# Encode labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))
model.to(device)  # Move model to GPU if available

# Tokenize input data
max_length = 128
X_train_tokens = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')
X_test_tokens = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=max_length, return_tensors='pt')

# Create DataLoader for training dataset
train_dataset = TensorDataset(X_train_tokens['input_ids'].to(device), X_train_tokens['attention_mask'].to(device), torch.tensor(y_train_encoded).to(device))
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=32)

# Create DataLoader for test dataset
test_dataset = TensorDataset(X_test_tokens['input_ids'].to(device), X_test_tokens['attention_mask'].to(device), torch.tensor(y_test_encoded).to(device))
test_sampler = RandomSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=32)

# Define optimizer and learning rate scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_dataloader) * 5  # Total number of steps (epochs * batches)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Fine-tune BERT model
num_epochs = 5
for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0
    correct_train = 0
    total_train = 0
    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{num_epochs}', unit='batch'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()

        # Calculate training accuracy
        _, predicted = torch.max(outputs.logits, 1)
        correct_train += (predicted == labels).sum().item()
        total_train += labels.size(0)

    # Validation
    model.eval()
    valid_loss = 0
    correct_valid = 0
    total_valid = 0
    with torch.no_grad():
        for batch in tqdm(test_dataloader, desc=f'Validation {epoch + 1}/{num_epochs}', unit='batch'):
            input_ids, attention_mask, labels = batch
            input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            valid_loss += loss.item()

            # Calculate validation accuracy
            _, predicted = torch.max(outputs.logits, 1)
            correct_valid += (predicted == labels).sum().item()
            total_valid += labels.size(0)

    # Calculate average losses
    avg_train_loss = train_loss / len(train_dataloader)
    avg_valid_loss = valid_loss / len(test_dataloader)

    # Calculate accuracy
    train_accuracy = correct_train / total_train
    valid_accuracy = correct_valid / total_valid

    # Print training and validation loss, accuracy
    print(f'Epoch {epoch + 1}/{num_epochs}')
    print(f'Training Loss: {avg_train_loss:.4f}, Training Accuracy: {train_accuracy:.4f}')
    print(f'Validation Loss: {avg_valid_loss:.4f}, Validation Accuracy: {valid_accuracy:.4f}')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/5: 100%|██████████| 39/39 [00:24<00:00,  1.60batch/s]
Validation 1/5: 100%|██████████| 5/5 [00:00<00:00,  5.75batch/s]


Epoch 1/5
Training Loss: 0.7243, Training Accuracy: 0.7539
Validation Loss: 0.7263, Validation Accuracy: 0.7518


Epoch 2/5: 100%|██████████| 39/39 [00:23<00:00,  1.63batch/s]
Validation 2/5: 100%|██████████| 5/5 [00:00<00:00,  5.55batch/s]


Epoch 2/5
Training Loss: 0.6989, Training Accuracy: 0.7685
Validation Loss: 0.6821, Validation Accuracy: 0.7518


Epoch 3/5: 100%|██████████| 39/39 [00:25<00:00,  1.55batch/s]
Validation 3/5: 100%|██████████| 5/5 [00:00<00:00,  5.27batch/s]


Epoch 3/5
Training Loss: 0.6927, Training Accuracy: 0.7685
Validation Loss: 0.6936, Validation Accuracy: 0.7518


Epoch 4/5: 100%|██████████| 39/39 [00:26<00:00,  1.47batch/s]
Validation 4/5: 100%|██████████| 5/5 [00:00<00:00,  5.06batch/s]


Epoch 4/5
Training Loss: 0.6847, Training Accuracy: 0.7685
Validation Loss: 0.7914, Validation Accuracy: 0.7518


Epoch 5/5: 100%|██████████| 39/39 [00:25<00:00,  1.50batch/s]
Validation 5/5: 100%|██████████| 5/5 [00:00<00:00,  5.23batch/s]

Epoch 5/5
Training Loss: 0.6915, Training Accuracy: 0.7685
Validation Loss: 0.7404, Validation Accuracy: 0.7518



