In [None]:
# Mount Google Drive so files stored under MyDrive are reachable from this
# Colab runtime.

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
cd drive/MyDrive/llamaconfig/

[Errno 2] No such file or directory: 'drive/MyDrive/llamaconfig/'
/content/drive/MyDrive/llamaconfig


In [None]:
pip install datasets transformers torch


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [None]:
import os
import pandas as pd
import torch
from datasets import Dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification

# Step 1: Set Up Environment
print("Setting up environment...")

# The sequence-classification head loaded in Step 4 is newly (randomly)
# initialised, so fix the torch RNG to make repeated runs give the same output.
SEED = 42
torch.manual_seed(SEED)

# Step 2: Set up Hugging Face Authentication
print("Setting up Hugging Face authentication...")
# Read the token from the environment instead of pasting it into the notebook;
# falls back to an empty string, which matches the previous placeholder value.
huggingface_token = os.environ.get("HF_TOKEN", "")

# Step 3: Prepare the Dataset
print("Loading dataset...")
data = pd.read_csv("Dataset500.csv")  # Path is relative to the current working directory
print("Dataset loaded successfully.")
print("Available columns:", data.columns)  # Display the available column names

# Pick whichever spelling of the setup-script column the CSV actually uses.
column_name = next(
    (candidate for candidate in ("setup.py", "setup_py") if candidate in data.columns),
    None,
)
if column_name is None:
    raise KeyError("The dataset does not contain a 'setup.py' or 'setup_py' column. Please check your dataset.")

print(f"Using column: {column_name}")

# Convert to HuggingFace Dataset
print("Converting dataset to HuggingFace format...")
dataset = Dataset.from_pandas(data)
print("Conversion successful.")

# Step 4: Load CodeBERT Model for Classification
print("Loading CodeBERT model for classification...")
# NOTE(review): "microsoft/codebert-base" ships only the encoder; the 2-class
# head created here has untrained weights (transformers warns about this on
# load). Predictions are not meaningful until the model is fine-tuned or a
# fine-tuned classification checkpoint is substituted here.
model_name = "microsoft/codebert-base"  # Change to a classification model
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Enable GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()  # inference only: make sure dropout is disabled
print("CodeBERT classification model loaded successfully.")

# Step 5: Function to Classify Code as Malicious or Benign
def classify_code_snippet(code_snippet: object, max_length: int = 512) -> int:
    """
    Classify a code snippet with the module-level CodeBERT model.

    Args:
        code_snippet: Source text to classify. Non-string values are converted
            with ``str()``; missing values (``None``/NaN) are not sent to the
            model.
        max_length: Maximum number of tokens passed to the model; longer
            inputs are truncated. Defaults to 512, the limit this function
            has always applied.

    Returns:
        The index of the highest-scoring class as a Python int, interpreted by
        the callers as 1 = malicious and 0 = benign. Missing input returns 0.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.
    NOTE(review): the result is only meaningful if ``model`` carries a trained
    classification head - confirm which checkpoint is loaded.
    """
    # Missing values default to benign rather than being classified as the
    # literal text "nan".
    if pd.isna(code_snippet):
        return 0

    text = str(code_snippet)

    # Tokenize (truncating to max_length) and move the tensors to the model's device.
    encoded = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    ).to(device)

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        logits = model(**encoded).logits

    # A single snippet yields a batch of one, so .item() gives the class index.
    return torch.argmax(logits, dim=-1).item()

# Step 6: Apply CodeBERT Classification to Dataset
# Every entry of the selected column is classified one at a time, in row order.
print("Applying CodeBERT classification to dataset...")
predictions = list(map(classify_code_snippet, data[column_name]))

data["predictions"] = predictions
print("CodeBERT classification completed.")

# Step 7: Save Results to CSV
# The output contains every original column plus the new "predictions" column.
print("Saving results to CSV file...")
output_csv = "codebert_classification_results.csv"
results_df = pd.DataFrame(data)
results_df.to_csv(output_csv, index=False)
print(f"Results saved to {output_csv}")

# Step 8: Example Classification on a New Code Snippet
new_code_snippet = "if os.system('rm -rf /'):\n    print('Malicious Code')"
new_data_result = classify_code_snippet(new_code_snippet)
if new_data_result == 1:
    example_label = "Malicious"
else:
    example_label = "Benign"
print("Example Code Classification:", example_label)

Setting up environment...
Setting up Hugging Face authentication...
Loading dataset...
Dataset loaded successfully.
Available columns: Index(['package_name', 'has_susp_url', 'Popular', 'is_license',
       'is_valid_author_emails', 'is_valid_homepage', 'PostInstallCommand',
       'min_cfg', 'has_verylong_string', 'file_list', 'setup.py'],
      dtype='object')
Using column: setup.py
Converting dataset to HuggingFace format...
Conversion successful.
Loading CodeBERT model for classification...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at microsoft/codebert-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CodeBERT classification model loaded successfully.
Applying CodeBERT classification to dataset...
CodeBERT classification completed.
Saving results to CSV file...
Results saved to codebert_classification_results.csv
Example Code Classification: Benign
