In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.23.1-py3-none-any.whl (5.3 MB)
[K     |████████████████████████████████| 5.3 MB 30.9 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 68.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.10.1-py3-none-any.whl (163 kB)
[K     |████████████████████████████████| 163 kB 71.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.10.1 tokenizers-0.13.1 transformers-4.23.1


In [None]:
import pandas as pd
import numpy as np
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

In [None]:
# The QA model and its tokenizer are loaded from the same hub checkpoint so that
# the vocabulary ids produced by the tokenizer match the model's embeddings.
squad_checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(squad_checkpoint)
tokenizer = BertTokenizer.from_pretrained(squad_checkpoint)

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
# Move the model's parameters to the selected device. The trailing semicolon stops
# the cell from echoing the entire nested module repr as its output.
model.to(device);

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [None]:
# Incident table that has a separate `Time` column. It gets its own name because the
# following cell binds `data` to the descriptions-only file, which previously
# discarded this frame immediately after it was displayed.
incidents_with_time = pd.read_csv('cybersecurity_incidents.csv')
incidents_with_time.head()

Unnamed: 0.1,Unnamed: 0,Description,Time
0,1,"In the July attack, Iranian actors deployed ra...",September 2022.
1,2,Montenegrin officials blamed Russia for the at...,September 2022.
2,3,Hackers targeted the state-level parliamentary...,September 2022.
3,4,Authorities claim the NSA stole user data and ...,September 2022.
4,5,The group Anonymous took responsibility for a ...,September 2022.


In [None]:
# Frame used by the rest of the notebook: one free-text `Description` per incident.
descriptions_csv = 'cybersecurity_incidents_descriptions.csv'
data = pd.read_csv(descriptions_csv)
data.head()

Unnamed: 0.1,Unnamed: 0,Description
0,0,September 2022. Iranian hackers targeted Alba...
1,1,September 2022. Hackers targeted Montenegro’s ...
2,2,September 2022. Hackers targeted the state -le...
3,3,September 2022. China accused the U.S. Nationa...
4,4,September 2022. The group Anonymous took respo...


In [None]:
# Question put to the QA model; it is encoded together with `text` below.
question = 'Which country was the attacker from?'

In [None]:
# Context passage for the single-example walkthrough: the description at row label 0.
text = data['Description'][0]

In [None]:
# Encode the pair as one id sequence; the printed tokens below show the layout
# [CLS] question [SEP] text [SEP].
input_ids = tokenizer.encode(text=question, text_pair=text)

In [None]:
# Map the ids back to WordPiece tokens and list each token beside its vocabulary id.
tokens = tokenizer.convert_ids_to_tokens(input_ids)
for token, token_id in zip(tokens, input_ids):
    # named `token_id`, not `id`, so the builtin id() is not shadowed for later cells
    print('{:8}{:8,}'.format(token, token_id))

[CLS]        101
which      2,029
country    2,406
was        2,001
the        1,996
attacker  17,346
from       2,013
?          1,029
[SEP]        102
in         1,999
the        1,996
july       2,251
attack     2,886
,          1,010
iranian    7,726
actors     5,889
deployed   7,333
ransom    16,540
##ware     8,059
on         2,006
albanian   9,408
government   2,231
networks   6,125
that       2,008
destroyed   3,908
data       2,951
and        1,998
disrupted  20,275
government   2,231
services   2,578
.          1,012
[SEP]        102


In [None]:
# Build the segment (token type) ids: 0 for every token up to and including the
# first [SEP] (the question), 1 for everything after it (the text).
sep_idx = input_ids.index(tokenizer.sep_token_id)
# indices are zero-based, so the question segment holds sep_idx + 1 tokens
num_seg_a = sep_idx + 1
# whatever remains belongs to the text segment
num_seg_b = len(input_ids) - num_seg_a

print("SEP token index: ", sep_idx)
print("Number of tokens in segment A: ", num_seg_a)
print("Number of tokens in segment B: ", num_seg_b)

segment_ids = [0 if position <= sep_idx else 1 for position in range(len(input_ids))]
# every input token must have received a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  8
Number of tokens in segment A:  9
Number of tokens in segment B:  23


In [None]:
# Feed the token ids plus the segment ids (question vs. text) to the model, each with
# a leading batch dimension of 1. Both tensors are moved to `device` because the model
# was moved there earlier; leaving them on the CPU raises a device-mismatch error
# whenever `device` is 'cuda'.
output = model(torch.tensor([input_ids]).to(device),  token_type_ids=torch.tensor([segment_ids]).to(device))

In [None]:
# Display the raw model output: start and end logits, one score per input token.
output

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[-6.0965, -5.0558, -8.4061, -7.4181, -7.2669, -7.8661, -6.1221, -9.1777,
         -6.0964, -4.6881, -4.6350, -3.0044, -5.6120, -6.2465,  3.6938, -5.7589,
         -6.1733, -5.4640, -6.4288, -6.0428,  1.6097, -6.3822, -6.4739, -8.1923,
         -6.4928, -7.2093, -8.5869, -6.7679, -6.8529, -7.0847, -6.0965, -6.0969]],
       grad_fn=<CloneBackward0>), end_logits=tensor([[-1.9659, -5.1183, -5.7751, -6.2831, -7.0675, -6.4004, -6.1657, -6.4644,
         -1.9658, -6.4674, -6.2753, -3.1410, -2.8508, -4.5260,  4.4731, -0.3414,
         -5.2253, -5.2826, -2.2257, -6.3750,  3.0577, -1.8167, -1.4636, -5.4353,
         -6.1655, -4.0968, -6.7491, -6.1244, -4.5073, -2.8784, -1.9658, -1.9665]],
       grad_fn=<CloneBackward0>), hidden_states=None, attentions=None)

In [None]:
# Positions with the highest start and end scores, as plain ints for list slicing.
answer_start = torch.argmax(output.start_logits).item()
answer_end = torch.argmax(output.end_logits).item()

if answer_end >= answer_start:
    # Join the span and glue WordPiece continuations ("ransom ##ware") back together.
    answer = " ".join(tokens[answer_start:answer_end+1]).replace(" ##", "")
else:
    # No valid span. Reset `answer` so the final print neither raises NameError nor
    # shows an answer left over from an earlier question.
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
Which country was the attacker from?

Answer:
Iranian.


In [None]:
# Second question, asked against the same `text`.
question = 'Which country was the victim from?'

In [None]:
# Re-encode for the new question and refresh `tokens` with it. The answer cell slices
# `tokens`, so leaving the list from the previous question in place only works while
# both questions happen to tokenize to the same length.
input_ids = tokenizer.encode(question, text)
tokens = tokenizer.convert_ids_to_tokens(input_ids)

In [None]:
# `segment_ids` was last built for the previous question, so rebuild it from the
# current `input_ids`: 0 up to and including the first [SEP], 1 afterwards.
sep_idx = input_ids.index(tokenizer.sep_token_id)
segment_ids = [0]*(sep_idx+1) + [1]*(len(input_ids) - sep_idx - 1)
assert len(segment_ids) == len(input_ids)

# Run the model on tensors that live on the same device as the model; CPU tensors
# raise a device-mismatch error whenever `device` is 'cuda'.
output = model(torch.tensor([input_ids]).to(device),  token_type_ids=torch.tensor([segment_ids]).to(device))

In [None]:
# Positions with the highest start and end scores, as plain ints for list slicing.
answer_start = torch.argmax(output.start_logits).item()
answer_end = torch.argmax(output.end_logits).item()

if answer_end >= answer_start:
    # Join the span and glue WordPiece continuations ("ransom ##ware") back together.
    answer = " ".join(tokens[answer_start:answer_end+1]).replace(" ##", "")
else:
    # No valid span. Reset `answer` so the answer to the previous question is not
    # printed as if it belonged to this one.
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
Which country was the victim from?

Answer:
Albanian.


In [None]:
# Third question, asked against the same `text`.
question = 'What type of cyber attack occurred?'

In [None]:
# Encode the current question first. This cell sits before the cell that re-encodes,
# so without this line the segment ids would be derived from the previous question's
# `input_ids`.
input_ids = tokenizer.encode(question, text)
#first occurrence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)
#number of tokens in segment A (question) - one more than sep_idx because Python indices start at 0
num_seg_a = sep_idx+1
print("Number of tokens in segment A: ", num_seg_a)
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  8
Number of tokens in segment A:  9
Number of tokens in segment B:  23


In [None]:
# Re-encode for the new question and refresh `tokens` with it. The answer cell slices
# `tokens`, so a list left over from an earlier question could point at the wrong words.
input_ids = tokenizer.encode(question, text)
tokens = tokenizer.convert_ids_to_tokens(input_ids)

In [None]:
# Feed the token ids plus the segment ids (question vs. text) to the model, each with
# a leading batch dimension of 1. Both tensors are moved to `device` to match the
# model; CPU tensors raise a device-mismatch error whenever `device` is 'cuda'.
output = model(torch.tensor([input_ids]).to(device),  token_type_ids=torch.tensor([segment_ids]).to(device))

In [None]:
# Positions with the highest start and end scores, as plain ints for list slicing.
answer_start = torch.argmax(output.start_logits).item()
answer_end = torch.argmax(output.end_logits).item()

if answer_end >= answer_start:
    # Join the span and glue WordPiece continuations back together, so that
    # "ransom ##ware" is reported as "ransomware".
    answer = " ".join(tokens[answer_start:answer_end+1]).replace(" ##", "")
else:
    # No valid span. Reset `answer` so the answer to the previous question is not
    # printed as if it belonged to this one.
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
What type of cyber attack occurred?

Answer:
Ransom ##ware.


In [None]:
# Fourth question, asked against the same `text`.
question = 'Which industry was the victim from?'

In [None]:
# Re-encode for the new question and refresh `tokens` with it. The answer cell slices
# `tokens`, so a list left over from an earlier question could point at the wrong words.
input_ids = tokenizer.encode(question, text)
tokens = tokenizer.convert_ids_to_tokens(input_ids)

In [None]:
#first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)
#number of tokens in segment A (question) - this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx+1
print("Number of tokens in segment A: ", num_seg_a)
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  8
Number of tokens in segment A:  9
Number of tokens in segment B:  23


In [None]:
# Feed the token ids plus the segment ids (question vs. text) to the model, each with
# a leading batch dimension of 1. Both tensors are moved to `device` to match the
# model; CPU tensors raise a device-mismatch error whenever `device` is 'cuda'.
output = model(torch.tensor([input_ids]).to(device),  token_type_ids=torch.tensor([segment_ids]).to(device))

In [None]:
# Positions with the highest start and end scores, as plain ints for list slicing.
answer_start = torch.argmax(output.start_logits).item()
answer_end = torch.argmax(output.end_logits).item()

if answer_end >= answer_start:
    # Join the span and glue WordPiece continuations ("ransom ##ware") back together.
    answer = " ".join(tokens[answer_start:answer_end+1]).replace(" ##", "")
else:
    # No valid span. Reset `answer` so the answer to the previous question is not
    # printed as if it belonged to this one.
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
Which industry was the victim from?

Answer:
Ransom.


In [None]:
# Fifth question, asked against the same `text`.
question = 'When did the attack occur?'

In [None]:
# Encode the (question, text) pair and keep the matching token strings, so the
# answer span can later be read back out of `tokens`.
input_ids = tokenizer.encode(text=question, text_pair=text)
tokens = tokenizer.convert_ids_to_tokens(ids=input_ids)

In [None]:
#first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print("SEP token index: ", sep_idx)
#number of tokens in segment A (question) - this will be one more than the sep_idx as the index in Python starts from 0
num_seg_a = sep_idx+1
print("Number of tokens in segment A: ", num_seg_a)
#number of tokens in segment B (text)
num_seg_b = len(input_ids) - num_seg_a
print("Number of tokens in segment B: ", num_seg_b)
#creating the segment ids
segment_ids = [0]*num_seg_a + [1]*num_seg_b
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  7
Number of tokens in segment A:  8
Number of tokens in segment B:  99


In [None]:
# Run the QA model on one (question, text) pair. Each list is wrapped to give a
# batch dimension of 1 and moved to `device` before the forward pass.
input_tensor = torch.tensor([input_ids]).to(device)
segment_tensor = torch.tensor([segment_ids]).to(device)
output = model(input_tensor, token_type_ids=segment_tensor)

In [None]:
# Positions with the highest start and end scores, as plain ints for list slicing.
answer_start = torch.argmax(output.start_logits).item()
answer_end = torch.argmax(output.end_logits).item()

if answer_end >= answer_start:
    # Join the span and glue WordPiece continuations back together, so that
    # "202 ##2" is reported as "2022".
    answer = " ".join(tokens[answer_start:answer_end+1]).replace(" ##", "")
else:
    # No valid span. Reset `answer` so the answer to the previous question is not
    # printed as if it belonged to this one.
    answer = None
    print("I am unable to find the answer to this question. Can you please ask another question?")

print("\nQuestion:\n{}".format(question.capitalize()))
if answer is not None:
    print("\nAnswer:\n{}.".format(answer.capitalize()))


Question:
When did the attack occur?

Answer:
September 202 ##2.


## Extract Features from Descriptions

In [None]:
# Every question put to the QA model, paired with the DataFrame column that will
# hold its answers. `questions` and `headers` are derived from the same table so
# they stay index-aligned.
_question_columns = (
    ('When did the attack occur?', 'time'),
    ('Which country was the attacker from?', 'attacker_origin'),
    ('Which country was the victim from?', 'victim_origin'),
    ('What type of cyber attack occurred?', 'type_of_attack'),
    ('Which industry was the victim from?', 'industry'),
)

questions = [question_text for question_text, _ in _question_columns]
headers = [column_name for _, column_name in _question_columns]

In [None]:
def answer_question(question, context):
    """Return the answer span the QA model extracts from `context` for `question`.

    Uses the notebook-level `tokenizer`, `model` and `device`.

    Args:
        question: Question string, encoded as the first segment.
        context: Passage to search for the answer, encoded as the second segment.

    Returns:
        The predicted span as a string, with WordPiece "##" continuations glued onto
        the preceding token. Returns "" when the best end position lies before the
        best start position (no valid span).
    """
    # Truncate only the context, so that a long description cannot push the pair
    # past the number of positions the model has embeddings for.
    input_ids = tokenizer.encode(
        question,
        context,
        truncation='only_second',
        max_length=model.config.max_position_embeddings,
    )
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # Segment ids: 0 through the first [SEP] (the question), 1 for the rest (the context).
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    num_seg_a = sep_idx + 1
    num_seg_b = len(input_ids) - num_seg_a
    segment_ids = [0]*num_seg_a + [1]*num_seg_b
    # every input token must have a segment id
    assert len(segment_ids) == len(input_ids)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with torch.no_grad():
        output = model(
            torch.tensor([input_ids]).to(device),
            token_type_ids=torch.tensor([segment_ids]).to(device),
        )

    # positions with the highest start and end scores, as plain ints
    answer_start = torch.argmax(output.start_logits).item()
    answer_end = torch.argmax(output.end_logits).item()
    if answer_end < answer_start:
        return ""

    # rebuild the text of the span, undoing the WordPiece split
    answer = tokens[answer_start]
    for piece in tokens[answer_start+1:answer_end+1]:
        if piece.startswith("##"):
            answer += piece[2:]
        else:
            answer += " " + piece
    return answer


# Ask every question of every description; each question fills one new column of `data`.
for i, q in enumerate(questions):
    print(i, q)
    data[headers[i]] = [answer_question(q, description) for description in data.Description.values]

0 When did the attack occur?
1 Which country was the attacker from?
2 Which country was the victim from?
3 What type of cyber attack occurred?
4 Which industry was the victim from?


In [None]:
# Preview the frame, which now includes one answer column per question.
data.head()

Unnamed: 0.1,Unnamed: 0,Description,time,attacker_origin,victim_origin,type_of_attack,industry
0,0,September 2022. Iranian hackers targeted Alba...,september 2022,iranian,albania,iranian cyberattack,computer systems
1,1,September 2022. Hackers targeted Montenegro’s ...,september 2022,russia,russia,hackers targeted montenegro ’ s government net...,government networks
2,2,September 2022. Hackers targeted the state -le...,september 2022,bosnia and herzegovina,bosnia and herzegovina,hackers targeted the state - level parliamenta...,parliamentary website
3,3,September 2022. China accused the U.S. Nationa...,september 2022,china,china,cyberattacks,digital communications networks
4,4,September 2022. The group Anonymous took respo...,september 2022,iranian,iranian,cyberattacks,media


In [None]:
# index=False: the row index carries no information here, and writing it adds one more
# "Unnamed: 0"-style column each time the file is read back with a plain read_csv
# (the loaded frame already shows two such columns).
data.to_csv('cybersecurity_incidents_with_extras.csv', index=False)