# Prediction using pretrained models

### Install libraries

In [None]:
%pip install openai python-dotenv seaborn matplotlib sqlalchemy

Note: you may need to restart the kernel to use updated packages.


### Configuring OpenAI's client with OpenRouter

In [None]:
import os

from dotenv import load_dotenv
from openai import OpenAI

# Bring the variables defined in the local .env file into the process environment
load_dotenv()

OPENROUTER_API_KEY = os.environ.get("OPENROUTER_API_KEY")

# Point the OpenAI client at OpenRouter's endpoint, authenticating with the OpenRouter key
client = OpenAI(api_key=OPENROUTER_API_KEY, base_url="https://openrouter.ai/api/v1")

### Functions

In [None]:
def is_phishing(guess: bool):
    """Return the phishing verdict unchanged.

    This is the local counterpart of the ``is_phishing`` tool described to the
    model: the model supplies ``guess`` and the function simply hands it back.
    """
    verdict = guess
    return verdict


# "function definitions" for the OpenAI API. This helps the model understand what the functions are supposed to do.
# JSON-schema description of the single boolean argument the tool accepts
_guess_property = {
    "type": "boolean",
    "description": "The guess of whether the website is a phishing website.",
}

# Name, purpose and parameter schema of the one function the model may call
_is_phishing_spec = {
    "name": "is_phishing",
    "description": "Determines if this URL is a phishing website.",
    "parameters": {
        "type": "object",
        "properties": {"guess": _guess_property},
        "required": ["guess"],
        "additionalProperties": False,
    },
}

tools = [{"type": "function", "function": _is_phishing_spec}]

In [None]:
import json
import re


# Marker that some non-OpenAI models put in front of a JSON list of tool calls in a plain-text reply
_TOOL_CALLS_MARKER = re.compile(r"\[TOOL_CALLS\]\s*(?=\[)")


def _guess_from_content(content: str):
    """Extract the ``guess`` argument from tool calls embedded in a text reply.

    Raises:
        ValueError: if the text has no ``[TOOL_CALLS]`` marker followed by a JSON
            list, if that JSON is malformed (``json.JSONDecodeError`` is a
            ``ValueError``), or if the list is empty.
    """
    marker = _TOOL_CALLS_MARKER.search(content)
    if marker is None:
        raise ValueError(f"No [TOOL_CALLS] payload found in model response: {content!r}")

    # raw_decode parses exactly one JSON value starting at the given index and
    # ignores whatever follows it, so brackets inside the payload cannot cut it
    # short the way a non-greedy regex capture would.
    tool_calls, _ = json.JSONDecoder().raw_decode(content, marker.end())
    if not tool_calls:
        raise ValueError(f"Empty [TOOL_CALLS] payload in model response: {content!r}")

    arguments = tool_calls[0]["arguments"]
    # The structured API delivers arguments as a JSON string; accept that
    # encoding here too in case a model mirrors it in its text output.
    if isinstance(arguments, str):
        arguments = json.loads(arguments)
    return arguments["guess"]


def label(url: str, model: str = "openai/gpt-3.5-turbo"):
    """Ask a chat model whether ``url`` is a phishing website.

    The model is forced to call the ``is_phishing`` tool; its ``guess``
    argument is the answer. Only the first tool call of the first choice
    that carries one is used.

    Args:
        url: The URL to classify.
        model: Model identifier passed to the chat completions endpoint.

    Returns:
        The model's guess, or ``None`` when no choice in the response
        carries either tool calls or text content.

    Raises:
        ValueError: if a text reply does not contain a parsable
            ``[TOOL_CALLS]`` payload.
        KeyError: if the tool call arguments have no ``guess`` entry.
    """
    # Prompt the model
    messages = [
        {
            "role": "system",
            "content": "You are a website security tool. You have been asked to determine if the following URL is a phishing website.",
        },
        {
            "role": "user",
            "content": f"This is the URL of the website: {url}",
        },
    ]

    # Make a request to the API
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        tools=tools,
        stream=False,  # Return all messages at once
        parallel_tool_calls=True,  # Functions can be called in parallel - no need to be executed sequentially
        tool_choice="required",  # Forces the assistant to use the tools
        temperature=0.0,  # No randomness/creativity in the responses
    )

    for choice in response.choices:
        message = choice.message
        if message.tool_calls:
            # Structured tool call: the arguments arrive as a JSON string
            arguments = json.loads(message.tool_calls[0].function.arguments)
            return arguments["guess"]
        if message.content:
            # Some non-OpenAI models return the tool calls inside the message text instead
            return _guess_from_content(message.content)

    # No choice carried a tool call or any text
    return None


# Smoke test: label a single URL hosted under ipfs.cf-ipfs.com with the default model
label("https://bafybeidszm5hjkodoz44tw2fpdiaomoanrtpcqwf6ym236mjsxvop75paa.ipfs.cf-ipfs.com/ddoonncn.html")

False

### Labeling

In [None]:
# NOTE(review): the AttributeError recorded below mentions `Choice.content`, but `label` reads
# `choice.message.content`, so this output looks stale — re-run the cell to refresh it.
label("https://www.google.com")

AttributeError: 'Choice' object has no attribute 'content'

In [None]:
# Spot-check on a single URL.
# NOTE(review): the two list lines in the recorded output are not printed by the current `label`
# (it contains no print calls); they look like leftover debug output — re-run to confirm.
label("https://societegenerale.ci/fr/")

[{"name": "is_phishing", "arguments": {"guess": false}}]
[{'name': 'is_phishing', 'arguments': {'guess': False}}]


False

In [None]:
# Spot-check on a single checkout-page URL, using the default model
label("https://nuxt.lemonsqueezy.com/checkout")

[{"name": "is_phishing", "arguments": {"guess": false}}]
[{'name': 'is_phishing', 'arguments': {'guess': False}}]


False

In [None]:
# Spot-check on a single URL, using the default model
label("https://www.binance.com/fr")

[{"name": "is_phishing", "arguments": {"guess": false}}]
[{'name': 'is_phishing', 'arguments': {'guess': False}}]


False

In [None]:
from sqlalchemy import create_engine

# The connection string is read from the environment
DATABASE_URL = os.environ.get("DATABASE_URL")

# NOTE(review): "-c timezone=utc" is presumably a PostgreSQL startup option that sets the
# session time zone to UTC — confirm against the driver in use.
engine = create_engine(
    DATABASE_URL,
    connect_args=dict(options="-c timezone=utc"),
)

In [None]:
import pandas as pd

# Read the whole "url" table into a DataFrame, parsing the two timestamp columns as datetimes
df = pd.read_sql_table(
    table_name="url",
    con=engine,
    index_col=None,
    parse_dates=["created_at", "updated_at"],
)

In [None]:
from datetime import datetime


def save_results(model_name: str, X_train, y_pred, y_train, output_dir: str = "../data"):
    """Write the inputs, true labels and predictions of one run to a timestamped CSV file.

    The file is named ``<model_name with '/' replaced by '-'>-<timestamp>.csv``
    and contains the columns ``X``, ``y`` and ``y_pred``.

    Args:
        model_name: Model identifier; used only to build the file name.
        X_train: The inputs that were labelled (stored as column ``X``).
        y_pred: The predictions (stored as column ``y_pred``).
        y_train: The true labels (stored as column ``y``).
        output_dir: Folder the CSV is written to; created if it does not exist.
    """
    results_df = pd.DataFrame({
        'X': X_train,
        'y': y_train,
        'y_pred': y_pred,
    })

    # Build the file name from the model name and the current local date-time.
    # NOTE(review): the timestamp contains ':' characters, which Windows does not allow
    # in file names; the format is kept unchanged so file names stay consistent.
    current_datetime = datetime.now().strftime("%Y-%m-%d_%H:%M:%S")
    filename = f"{model_name.replace('/', '-')}-{current_datetime}.csv"

    # Create the folder first so a finished labeling run is not lost to a missing directory
    os.makedirs(output_dir, exist_ok=True)

    # Save the DataFrame to a CSV file
    results_df.to_csv(os.path.join(output_dir, filename), index=False)

In [None]:
from sklearn.model_selection import train_test_split

# Inputs are the raw URLs; the target is the stored phishing flag
X, y = df["url"], df["is_phishing"]

# Keep a reproducible 10% sample as the "train" part; the remaining rows go to the test part
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    train_size=0.1,
)

In [None]:
# Model identifier used for the labeling run below
model_name = "openai/gpt-3.5-turbo"


# Alternative models; uncomment one of these lines to evaluate it instead
#model_name="google/gemini-pro"
#model_name="mistralai/mistral-7b-instruct"

def apply_label(url: str):
    """Label one URL with the model selected in ``model_name``, tolerating failures.

    Any exception raised by ``label`` is reported by printing the URL followed
    by the error, and ``None`` is returned in place of a prediction.
    """
    try:
        prediction = label(url, model=model_name)
    except Exception as error:
        print(url)
        print(error)
        prediction = None
    return prediction


# Label every sampled URL one by one; failed requests show up as None in the result
y_pred = X_train.apply(apply_label)

# Persist inputs, predictions and true labels of this run
save_results(
    model_name=model_name,
    X_train=X_train,
    y_pred=y_pred,
    y_train=y_train,
)

In [None]:
from sklearn.metrics import accuracy_score

# apply_label returns None for URLs whose request failed, which leaves y_pred as a mix of
# booleans and None that scikit-learn cannot score. Measure accuracy only on the URLs that
# were actually labelled; with no failures this is the same as scoring every row.
labelled = y_pred.notna()
accuracy_score(y_train[labelled], y_pred[labelled].astype(bool))

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Generate the confusion matrix. Failed requests leave None in y_pred, which scikit-learn
# cannot handle, so only the URLs that were actually labelled are counted.
labelled = y_pred.notna()
cm = confusion_matrix(y_train[labelled], y_pred[labelled].astype(bool))

# Plot the matrix as an annotated heatmap (rows: true labels, columns: predicted labels)
plt.figure(figsize=(8, 8))
sns.heatmap(cm, annot=True, fmt='d', cmap='Greens')
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()