In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
import torch
from torch_geometric.data import Data
from data_related.get_epss_score import *


In [2]:
severity_map = {"LOW": 1, "MEDIUM": 2, "HIGH": 3, "CRITICAL": 4}


    
df = pd.read_csv('output.csv')

'''epss_scores = []
for each_cve in df['cve_id']:
    epss_scores.append(get_epss_score(each_cve))'''


df["baseseverity"] = df["baseseverity"].map(severity_map)
df["confidentialityimpact"] = df["confidentialityimpact"].map(severity_map)
df["integrity"] = df["integrity"].map(severity_map)



encoder = OneHotEncoder()
vendor_encoded = encoder.fit_transform(df[["vendor"]]).toarray()
cwe_encoded = encoder.fit_transform(df[['CWE']]).toarray()


scaler = MinMaxScaler()
basescore_normalised = scaler.fit_transform(df[["basescore"]])
baseseverity_normalised = scaler.fit_transform(df[["baseseverity"]])
confidentialityimpact_normalised = scaler.fit_transform(df[["confidentialityimpact"]])
integrity_normalised = scaler.fit_transform(df[["integrity"]])



X = torch.tensor(
    torch.cat([
        torch.tensor(vendor_encoded),
        torch.tensor(cwe_encoded),
        torch.tensor(basescore_normalised), 
        torch.tensor(baseseverity_normalised),
        torch.tensor(confidentialityimpact_normalised),
        torch.tensor(integrity_normalised)
        #torch.tensor(df["EPSS_score"].values).unsqueeze(1)
    ], dim=1),
    dtype=torch.float
)

print(f"matrix shape: {X.shape}")


matrix shape: torch.Size([8953, 3446])


  X = torch.tensor(


In [3]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer as wnl
import statistics


In [4]:
def tokenize_sentence(sentence):
    if isinstance(sentence, str):
        
        tokens = word_tokenize(sentence)
        filtered_words = [word for word in tokens if word not in stopwords.words('english')]
        lemmatized_word = [wnl().lemmatize(word) for word in filtered_words]
        without_punctuation = [i for i in lemmatized_word if i not in '.,!?']
        
        return without_punctuation

def check_sentence_similarity(sentence1, sentence2):
    if sentence1 is not None and sentence2 is not None:
        avg_len = (len(sentence1) + len(sentence2))/2

        similar_word_count = 0

        for each_word_1 in sentence1:
            for each_word_2 in sentence2:
                if each_word_1 == each_word_2:
                    similar_word_count += 1

        return similar_word_count/avg_len

In [5]:
edges = []
edge_weights = []
threshold = 0.5

for i in range(100):
    weight = 0
    for j in range(i + 1, 100):

        severity_diff = abs(df.loc[i, "baseseverity"] - df.loc[j, "baseseverity"])
        score_diff = abs(df.loc[i, "basescore"] - df.loc[j, "basescore"])
        confidentiality_diff = abs(df.loc[i, "confidentialityimpact"] - df.loc[j, "confidentialityimpact"])
        integrity_diff = abs(df.loc[i, "integrity"] - df.loc[j, "integrity"])
        if df.loc[i, "CWE"] == df.loc[j, "CWE"]:
            weight += 0.2
        if df.loc[i, "vendor"] == df.loc[j, "vendor"]:
            weight += 0.2
        if severity_diff == 0:
            weight += 0.05
        if score_diff == 0:
            weight += 0.05
        if confidentiality_diff == 0:
            weight += 0.05
        if integrity_diff == 0:
            weight += 0.05
        


        i_sentence = tokenize_sentence(df.loc[i, 'description'])
        j_sentence = tokenize_sentence(df.loc[j, 'description'])
        similarity_score = check_sentence_similarity(i_sentence, j_sentence)
        if similarity_score is not None:
            weight += similarity_score
        else:
            similarity_score = 0


        

        if df.loc[i, "CWE"] == df.loc[j, "CWE"] or \
           df.loc[i, "vendor"] == df.loc[j, "vendor"] or \
           (score_diff == 0 or severity_diff == 0 or confidentiality_diff == 0 or integrity_diff == 0) or \
            similarity_score > 0 or \
            score_diff <= threshold:
            edges.append((i, j))
            edges.append((j, i)) 
            edge_weights.append(weight)
            edge_weights.append(weight)



edge_index = torch.tensor(edges, dtype=torch.long).t().contiguous()
edge_weight = torch.tensor(edge_weights, dtype=torch.float)

data = Data(x=X, edge_index=edge_index, edge_attr=edge_weight)
print(data)


Data(x=[8953, 3446], edge_index=[2, 9360], edge_attr=[9360])


In [6]:
import torch.nn as nn
import torch.nn.functional as func
from torch_geometric.nn import GCNConv


class WeightedGNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(WeightedGNN, self).__init__()
        self.gc1 = GCNConv(input_dim, hidden_dim)
        self.gc2 = GCNConv(hidden_dim, output_dim)

    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr
        x = self.gc1(x, edge_index, edge_weight)
        x = func.relu(x)
        x = self.gc2(x, edge_index, edge_weight)
        return x


input_dim = X.shape[1]
hidden_dim = 16
output_dim = 2  

model = WeightedGNN(input_dim, hidden_dim, output_dim)
print(model)


WeightedGNN(
  (gc1): GCNConv(3446, 16)
  (gc2): GCNConv(16, 2)
)


In [7]:
def get_epss_scores(cve_ids):
    epss_scores_list = []
    for id in cve_ids:
        epss_scores_list.append(get_epss_score(id))

    return epss_scores_list

epss_scores = get_epss_scores(df["cve_id"])



In [9]:
print(epss_scores)

import torch.optim as optim


optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()



y = torch.tensor(epss_scores, dtype=torch.long)

for epoch in range(200):
    model.train()
    optimizer.zero_grad()
    
    out = model(data)
    loss = criterion(out, y)
    
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f'Epoch {epoch}, Loss: {loss.item():.4f}')


[0.00013, 0.0002, 0.00048, 0.00053, 0.00033, 0.00017, 0.00106, 0.00038, 0.00059, 0.00018, 0.00063, 0.00113, 0.0018, 0.00055, 0.00029, 9e-05, 0.00029, None, 0.00052, 0.00036, 0.0003, 0.00038, 0.00012, 0.00019, 0.00051, 7e-05, 0.00033, 0.00034, 0.00153, 0.00014, 0.00024, 0.00029, 0.00034, 0.0004, 0.11253, 0.00029, 0.0019, 0.00013, 0.00047, 0.00132, 0.00119, 0.00034, 0.00029, 0.00019, 0.00056, 0.00191, 0.00044, 0.00037, 0.00042, 0.00031, 0.00014, 0.20522, 0.00012, 0.00036, 0.00024, 0.05362, 0.0003, 0.00034, 0.00038, 0.00029, 0.00069, 0.00036, 0.00079, 0.00346, 0.00036, 0.00017, 0.00052, 0.00098, 0.00035, 0.00047, 0.00065, 0.0003, 0.01526, 0.00119, 0.00046, 0.00036, 0.0004, 0.00034, 0.00034, 0.00035, 0.00039, 0.00038, 0.00101, 0.01526, 0.00038, 0.00034, 0.00077, 0.00047, 0.00097, 0.00095, 0.00059, 0.00199, 0.19854, 0.00034, 0.00045, 0.00051, 0.00067, 0.00041, 0.0005, 0.00031, 0.00074, 0.00031, 0.00029, 0.00026, 0.00054, 0.00052, 0.00295, 0.00042, 0.00034, 0.00015, 0.00084, 0.00044, 0.00046

TypeError: 'NoneType' object cannot be interpreted as an integer

## Test the data here