In [None]:
# Attach Google Drive to the Colab runtime so that files stored there are
# reachable under /content/drive.
from google.colab import drive

drive.mount("/content/drive")


Mounted at /content/drive


In [None]:
# Switch to the project folder on the mounted Drive. The explicit %cd magic with
# an absolute path keeps this cell re-runnable: a relative path only resolves on
# the first run, before the working directory has moved.
%cd /content/drive/MyDrive/llamaconfig

/content/drive/MyDrive/llamaconfig


In [None]:
# Install the third-party packages imported below. %pip (instead of bare `pip`,
# which only works through IPython's automagic) installs into the environment
# of the running kernel; -q keeps the progress-bar noise out of the notebook.
# TODO: pin exact versions for reproducibility (the recorded run resolved
# datasets to 3.0.2).
%pip install -q datasets transformers torch


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [None]:
import os
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Step 1: Set Up Environment
print("Setting up environment...")

# Step 2: Set up Hugging Face Authentication
print("Setting up Hugging Face authentication...")
# Never store an access token in the notebook itself: read it from the HF_TOKEN
# environment variable. An unset or empty variable becomes None so that the
# Hugging Face libraries can fall back to any credentials they already have
# cached (e.g. from `huggingface-cli login`) instead of sending an empty token.
huggingface_token = os.environ.get("HF_TOKEN") or None

# Step 3: Prepare the Dataset
print("Loading dataset...")
data = pd.read_csv("ds-balanced.csv")  # Replace with the actual CSV file path
print("Dataset loaded successfully.")
print("Available columns:", data.columns)  # Display the available column names

# Pick whichever spelling of the setup.py column this CSV uses. Fail fast with a
# clear error when neither exists, because every later step reads this column.
column_name = next(
    (candidate for candidate in ("setup.py", "setup_py") if candidate in data.columns),
    None,
)
if column_name is None:
    raise KeyError("The dataset does not contain a 'setup.py' or 'setup_py' column. Please check your dataset.")

print(f"Using column: {column_name}")

# Convert to HuggingFace Dataset
print("Converting dataset to HuggingFace format...")
dataset = Dataset.from_pandas(data)
print("Conversion successful.")

# Step 4: Preprocess the Dataset
print("Tokenizing the dataset...")
# Tokenize the Data
model_name = "meta-llama/Meta-Llama-3-8B"  # Correct the model name if necessary
tokenizer = AutoTokenizer.from_pretrained(model_name, token=huggingface_token)

# If the checkpoint's tokenizer defines no padding token, reuse its
# end-of-sequence token. Registering a brand-new '[PAD]' string would assign an
# id outside the model's embedding table, so any padded batch fed to the model
# would fail unless the embeddings were resized to match.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_function(examples):
    """Tokenize one batch of rows taken from the ``column_name`` column.

    Every entry of the batch is coerced to ``str`` first; the tokenizer output
    is padded and truncated to a fixed length of 128 tokens.
    """
    column_values = examples[column_name]
    texts = list(map(str, column_values))
    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=128,  # Adjust max_length as needed
    )

 tokenized_datasets = dataset.map(tokenize_function, batched=True)
print("Tokenization completed.")

# Step 5: Adding Zero-Shot Context
def prepare_zero_shot_template(file_content):
    """
    Build the zero-shot instruction prompt around a single code snippet.

    The fixed instruction text asks for a bare '1' (malicious) or '0' (not
    malicious) answer; ``file_content`` is formatted in after the final
    "Code snippet: " label.
    """
    instruction_parts = [
        "You are an AI assistant specialized in detecting malicious code in PyPI packages. ",
        "Your task is to analyze the code snippet and indicate your analysis result with one of the two options:\n",
        "(1) Yes: Code is malicious\n",
        "(0) No: Code is not malicious\n",
        "Only reply with a single number: '1' for Yes (Code is malicious), or '0' for No (Code is not malicious). ",
        "Do not include any additional text.\n",
    ]
    prompt_header = "".join(instruction_parts)
    return prompt_header + "Code snippet: {}".format(file_content)

# Step 6: Initialize the Model
# Load the checkpoint named by model_name with a two-label sequence
# classification head.
# NOTE(review): the recorded run warns that 'score.weight' is newly initialized,
# i.e. the classification head is untrained. Predictions from this model should
# not be trusted until the head has been trained on a downstream task.
print("Initializing the model...")
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    token=huggingface_token,
    num_labels=2,
)
print("Model initialized successfully.")

# Step 7: Zero-Shot Classification Function with Context
def zero_shot_classification(text, max_length=1024):
    """
    Classify a given text (code snippet from setup.py) as malicious (1) or not malicious (0).

    The snippet is wrapped in the instruction prompt built by
    ``prepare_zero_shot_template``, tokenized, and passed through ``model``; the
    index of the largest output logit is returned.

    Args:
        text: Contents of the setup.py file to classify.
        max_length: Token budget for the whole prompt (instructions plus code).
            The fixed instruction text alone uses up most of a 128-token
            window, so the default is large enough for the code itself to
            reach the model. Longer prompts are cut by the tokenizer's
            truncation.

    Returns:
        int: 1 if the model predicts "malicious", otherwise 0.
    """
    # Prepare the zero-shot template for the input
    prompt = prepare_zero_shot_template(text)

    # Tokenize the context (which includes the instruction and the code snippet)
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True, max_length=max_length)

    # Inference only: disable autograd so no computation graph is recorded for
    # the forward pass.
    with torch.no_grad():
        outputs = model(**inputs)

    # Get prediction (0 or 1)
    predictions = torch.argmax(outputs.logits, dim=-1)

    return predictions.item()

# Step 8: Apply Zero-Shot Classification to the Dataset with Context
print("Applying zero-shot classification with context...")

# One 0/1 label per row, in row order, computed from the setup.py column.
predictions = [zero_shot_classification(code_snippet) for code_snippet in data[column_name]]
data["predictions"] = predictions
print("Zero-shot classification completed.")

# Step 9: Save Results to CSV
# The input columns and the new "predictions" column are written together.
# NOTE(review): the output file name says "llama2" while model_name points at a
# Llama 3 checkpoint; the name is kept as-is so existing consumers still find it.
print("Saving results to CSV file...")
results_df = pd.DataFrame(data)
results_df.to_csv("zse_llama2_setup_py_with_context.csv", index=False)
print("Zero-shot evaluation results saved to zse_llama2_setup_py_with_context.csv")

# Step 10: Example Zero-Shot Classification on a new code snippet
new_data_result = zero_shot_classification("new setup.py code snippet")
if new_data_result == 1:
    print("Classification of new data point:", "Malicious")
else:
    print("Classification of new data point:", "Not Malicious")


Setting up environment...
Setting up Hugging Face authentication...
Loading dataset...
Dataset loaded successfully.
Available columns: Index(['package_name', 'has_susp_url', 'Popular', 'is_license',
       'is_valid_author_emails', 'is_valid_homepage', 'PostInstallCommand',
       'min_cfg', 'has_verylong_string', 'file_list', 'setup.py'],
      dtype='object')
Using column: setup.py
Converting dataset to HuggingFace format...
Conversion successful.
Tokenizing the dataset...


tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

Map:   0%|          | 0/1499 [00:00<?, ? examples/s]

Tokenization completed.
Initializing the model...


config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at meta-llama/Meta-Llama-3-8B and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


Model initialized successfully.
Applying zero-shot classification with context...
Zero-shot classification completed.
Saving results to CSV file...
Zero-shot evaluation results saved to zse_llama2_setup_py_with_context.csv
Classification of new data point: Not Malicious
