In [4]:
import json
import pandas as pd

# Load the JSON file
with open('/content/nvdcve-1.1-modified.json') as f:
    nvd_data = json.load(f)

# Prepare data for the DataFrame
data = []
for item in nvd_data["CVE_Items"]:
    cve_id = item["cve"]["CVE_data_meta"]["ID"]
    description = item["cve"]["description"]["description_data"][0]["value"]
    cwe_id = item["cve"]["problemtype"]["problemtype_data"][0]["description"][0]["value"] if item["cve"]["problemtype"]["problemtype_data"][0]["description"] else "N/A"

    # Extract CVSS v3 metrics if available
    impact = item.get("impact", {}).get("baseMetricV3", {}).get("cvssV3", {})
    if impact:
        # Format the CVSS vector string
        cvss_vector = (
            f"CVSS:{impact.get('version', '3.1')}"
            f"/AV:{impact.get('attackVector', 'N/A')[0]}"
            f"/AC:{impact.get('attackComplexity', 'N/A')[0]}"
            f"/PR:{impact.get('privilegesRequired', 'N/A')[0]}"
            f"/UI:{impact.get('userInteraction', 'N/A')[0]}"
            f"/S:{impact.get('scope', 'N/A')[0]}"
            f"/C:{impact.get('confidentialityImpact', 'N/A')[0]}"
            f"/I:{impact.get('integrityImpact', 'N/A')[0]}"
            f"/A:{impact.get('availabilityImpact', 'N/A')[0]}"
        )

        # Add data to the list
        data.append({
            "CVE ID": cve_id,
            "Description": description,
            "CVSS Vector": cvss_vector
        })

# Create a DataFrame with only entries that contain CVSS data
df = pd.DataFrame(data)

# Save to TSV
output_path = '/content/cve_dataset_with_cvss_vector.tsv'
df.to_csv(output_path, sep='\t', index=False)
print(f"Filtered dataset saved to {output_path}")


Filtered dataset saved to /content/cve_dataset_with_cvss_vector.tsv


In [1]:
import google.cloud.aiplatform as aiplatform

In [2]:

import google.generativeai as genai

# Directly set the API key instead of using 'userdata'
gemini_key = ""  # Replace this with your actual API key

# Configure the library with the API key
genai.configure(api_key=gemini_key)


In [3]:
generation_config = {
  "temperature": 0.5,
  "top_p": 0.9,
  "top_k": 3,
  "max_output_tokens": 1500,
}


In [4]:
import pandas as pd
df=pd.read_csv("/content/cve_dataset_with_cvss_vector.tsv",sep="\t")

In [5]:
df.head(10)

Unnamed: 0,CVE ID,Description,CVSS Vector
0,CVE-2024-38424,Memory corruption during GNSS HAL process init...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
1,CVE-2024-38423,Memory corruption while processing GPU page ta...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
2,CVE-2024-38422,Memory corruption while processing voice packe...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
3,CVE-2024-38421,Memory corruption while processing GPU commands.,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
4,CVE-2024-38419,Memory corruption while invoking IOCTL calls f...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
5,CVE-2024-38415,Memory corruption while handling session error...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
6,CVE-2024-38409,Memory corruption while station LL statistic h...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
7,CVE-2024-38408,Cryptographic issue when a controller receives...,CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:L/A:N
8,CVE-2024-38407,Memory corruption while processing input param...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H
9,CVE-2024-38406,Memory corruption while handling IOCTL calls i...,CVSS:3.1/AV:L/AC:L/PR:L/UI:N/S:U/C:H/I:H/A:H


In [6]:
model = genai.GenerativeModel(model_name= "gemini-1.0-pro",
                              generation_config = generation_config)

In [13]:
import time
import pandas as pd

# Assuming 'df' is your DataFrame and 'model' is already defined and available
df1 = df.head(200)
output_data = []

def get_cvss_score(description, model):
    prompt = (
        "You are a cybersecurity expert specializing in cyber threat intelligence. Analyze the following CVE "
        "description and calculate the CVSS v3.1 Base Score. Determine the values for each base metric: AV, AC, PR, UI, "
        "S, C, I, and A. Summarize each metric’s value and provide the final CVSS v3.1 vector string in the format:\n"
        "CVSS:3.1/AV:N/AC:L/PR:N/UI:N/S:U/C:H/I:H/A:H\n\n"
        f"CVEDescription: {description}\nCVSS Vector:"
    )

    max_retries = 7
    backoff_time = 10  # initial backoff time in seconds

    for attempt in range(max_retries):
        try:
            # Generate the response
            response = model.generate_content(prompt)  # Replace with actual API call

            # Log the full response for debugging
            print(f"Response object: {response}")

            # Ensure response has 'parts' and is not empty
            if not hasattr(response, 'parts') or not response.parts:
                raise ValueError("No valid response parts found.")

            # Extract the text from the response object
            text = response.parts[0].text
            answer = text.strip().split()[-1].upper()  # Extract the last word and convert to uppercase
            return answer

        except Exception as e:
            error_message = str(e)
            if "rate limit" in error_message.lower() or "too many requests" in error_message.lower():
                if attempt < max_retries - 1:
                    sleep_time = backoff_time * (2 ** attempt)  # Exponential backoff
                    print(f"Rate limit hit. Retrying in {sleep_time} seconds...")
                    time.sleep(sleep_time)
                else:
                    print("Max retries reached. Skipping this entry.")
                    return None
            else:
                print(f"An error occurred: {e}")
                return None  # Ensuring a return even on failure

# Process entries and ensure 200 outputs
index = 0
while len(output_data) < 200 and index < len(df1):
    row = df1.iloc[index]
    description = row['Description']

    # Get the model response for the current description
    response_text = get_cvss_score(description, model)
    if response_text:
        output_data.append({
            'Description': description,
            'Geminai': response_text
        })
        print(f"Processed Description {index}: {response_text}")
    else:
        print(f"Skipping Description {index} due to an error.")

    index += 1

# Create a new DataFrame from the output_data
output_df = pd.DataFrame(output_data)

# Save the new DataFrame to a CSV file
output_df.to_csv('geminaiCVSSmydataresp.tsv', sep='\t', index=False)
print("Processing completed with 200 entries saved to 'geminaiCVSSmydataresp.tsv'.")


[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
        }
      ],
      "usage_metadata": {
        "prompt_token_count": 196,
        "total_token_count": 196
      }
    }),
)
An error occurred: No valid response parts found.
Skipping Description 70 due to an error.
Response object: response:
GenerateContentResponse(
    done=True,
    iterator=None,
    result=protos.GenerateContentResponse({
      "candidates": [
        {
          "finish_reason": "SAFETY",
          "index": 0,
          "safety_ratings": [
            {
              "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HATE_SPEECH",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGORY_HARASSMENT",
              "probability": "NEGLIGIBLE"
            },
            {
              "category": "HARM_CATEGO

In [27]:
import pandas as pd

# Load the predictions file and ground truth file, with bad lines skipped
predictions_path = '/content/geminaiCVSSmydataresp.tsv'  # Replace with actual path to your predictions file
ground_truth_path = '/content/cve_dataset_with_cvss_vector.tsv'  # Replace with actual path to the ground truth file

# Read the TSV files and skip lines with issues
gt_df = pd.read_csv(ground_truth_path, sep='\t', on_bad_lines='skip')
gemini_df = pd.read_csv(predictions_path, sep='\t', on_bad_lines='skip')

# Clean and standardize the predictions by removing asterisks if present
gemini_df['Geminai'] = gemini_df['Geminai'].str.replace(r'[*]', '', regex=True).str.strip()

# Merge dataframes on the 'Description' column
merged_df = pd.merge(gt_df[['Description', 'CVSS Vector']], gemini_df[['Description', 'Geminai']], on='Description', how='inner')

# Calculate accuracy
accuracy = (merged_df['CVSS Vector'] == merged_df['Geminai']).mean() * 100
print(f"Accuracy: {accuracy:.2f}%")



Accuracy: 73.53%
