In [1]:
import datetime
import os
import urllib.parse
from typing import Optional

import numpy as np
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import SQLAlchemyError

# PostgreSQL connection settings.
# Every value can be overridden through the standard libpq environment
# variables (PGHOST, PGDATABASE, PGUSER, PGPASSWORD, PGPORT) so that real
# credentials do not have to be stored in the notebook. The fallbacks are the
# values this notebook originally hard-coded.
DB_CONFIG = {
    'host': os.environ.get('PGHOST', 'localhost'),
    'database': os.environ.get('PGDATABASE', 'processed'),
    'user': os.environ.get('PGUSER', 'airflow'),
    'password': os.environ.get('PGPASSWORD', 'airflow'),
    'port': os.environ.get('PGPORT', '5432')
}

def create_connection():
    """Create a SQLAlchemy engine for PostgreSQL and verify that it can connect.

    Host, port, database, user and password are read from the module-level
    ``DB_CONFIG`` dict.

    Returns:
        The engine when a trivial test query succeeds; ``None`` otherwise.
        Failures are printed rather than raised.
    """
    try:
        # Percent-encode both credentials so characters such as '#', '@' or ':'
        # cannot be mistaken for URL delimiters.
        quoted_user = urllib.parse.quote_plus(DB_CONFIG['user'])
        quoted_password = urllib.parse.quote_plus(DB_CONFIG['password'])
        connection_string = (
            f"postgresql+psycopg2://{quoted_user}:{quoted_password}"
            f"@{DB_CONFIG['host']}:{DB_CONFIG['port']}/{DB_CONFIG['database']}"
        )
        engine = create_engine(connection_string)
        # create_engine() is lazy and opens no connection by itself, so run a
        # trivial query here. Without it an unreachable server or wrong
        # credentials would only surface at the first real query, and the
        # caller would report success for a database it never reached.
        with engine.connect() as connection:
            connection.execute(text("SELECT 1"))
        return engine
    except Exception as e:
        print(f"Lỗi kết nối: {e}")
        return None

# Build the shared engine once and report whether one was obtained.
engine = create_connection()
print(
    "Kết nối thành công đến cơ sở dữ liệu PostgreSQL."
    if engine is not None
    else "Không thể kết nối đến cơ sở dữ liệu."
)

Kết nối thành công đến cơ sở dữ liệu PostgreSQL.


In [5]:
import pandas as pd
# Switch pandas to copy-on-write mode via its global option.
pd.options.mode.copy_on_write = True
from pydantic_settings import BaseSettings
from dotenv import load_dotenv
import os

# Load environment variables from the .env file
load_dotenv()

class MlflowSettings(BaseSettings):
    """MLflow connection settings resolved from the process environment.

    ``BaseSettings`` looks each field up by name in the environment when the
    class is instantiated, so no ``os.getenv`` default is needed. Evaluating
    ``os.getenv`` in the class body would freeze the value at class-definition
    time and could place ``None`` in a field annotated as ``str``.
    """

    # Tracking server URI; stays None when MLFLOW_TRACKING_URI is not set.
    MLFLOW_TRACKING_URI: Optional[str] = None

In [7]:
import mlflow
from mlflow.transformers import log_model
from transformers import DistilBertTokenizer, DistilBertModel
import torch

class DistilBERTFeatureExtractor:
    """Turn text into DistilBERT embeddings taken at the first token position.

    The tokenizer and model are loaded once in the constructor. The model is
    moved to CUDA when available (CPU otherwise) and switched to eval mode.
    """

    def __init__(self, model_name="distilbert-base-uncased"):
        """Load the pretrained tokenizer and model identified by ``model_name``."""
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.tokenizer = DistilBertTokenizer.from_pretrained(model_name)
        self.model = DistilBertModel.from_pretrained(model_name).to(self.device)
        self.model.eval()

    def extract_features(self, text, max_length=512):
        """Extract features from DistilBERT.

        Args:
            text: A single string, or a list of strings processed as one batch.
            max_length: Truncation limit forwarded to the tokenizer.

        Returns:
            A NumPy array. For a single string it is 1-D ``(hidden,)``. For a
            list it is always 2-D ``(len(text), hidden)``, including a list of
            one element, so the rank no longer depends on the batch size.
        """
        inputs = self.tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=max_length,
        )
        inputs = {key: value.to(self.device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = self.model(**inputs)

        # Position 0 of the sequence axis -- presumably the [CLS] token for
        # this tokenizer; confirm if a different tokenizer is swapped in.
        embeddings = outputs.last_hidden_state[:, 0, :].detach().cpu().numpy()

        # Drop the batch axis only for a lone string. A bare squeeze() would
        # also flatten a one-element list, giving callers 1-D or 2-D output
        # depending on how many texts they happened to pass.
        if isinstance(text, str):
            return embeddings[0]
        return embeddings

# Point the MLflow client at the tracking server configured in the environment.
tracking = MlflowSettings()
mlflow.set_tracking_uri(tracking.MLFLOW_TRACKING_URI)

# Build the model and tokenizer
model_name = "distilbert-base-uncased"
model = DistilBertModel.from_pretrained(model_name)
tokenizer = DistilBertTokenizer.from_pretrained(model_name)

# Save the model to MLflow
with mlflow.start_run():
    # Log the run parameters first. If the model upload below raises (for
    # example because the tracking server rejects an artifact), the run still
    # carries its parameters instead of being left empty.
    mlflow.log_params({
        "model_name": model_name,
        "model_type": "distilbert",
        "task": "feature-extraction",
        "framework": "pytorch",
        "max_length": "512",
        "device": str(torch.device('cuda' if torch.cuda.is_available() else 'cpu')),
    })

    # Log and register the model with the transformers flavor
    mlflow.transformers.log_model(
        transformers_model={
            "model": model,
            "tokenizer": tokenizer
        },
        name="distilbert_feature_extractor",
        registered_model_name="DistilBERT-FeatureExtractor",
        input_example=["This is a sample text for feature extraction"],
        task="feature-extraction"
    )

    print("Model logged successfully to MLflow!")

Device set to use cpu
Downloading artifacts: 100%|██████████| 14/14 [00:00<00:00, 121.20it/s] 
Device set to use cpu


🏃 View run able-shrike-715 at: http://127.0.0.1:5000/#/experiments/0/runs/38585b4868594524a8705f950381bc7e
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/0


MlflowException: API request to http://127.0.0.1:5000/api/2.0/mlflow-artifacts/artifacts/0/models/m-fc4de28e81294a4d8b5b9d93cdee998c/artifacts/conda.yaml failed with exception HTTPConnectionPool(host='127.0.0.1', port=5000): Max retries exceeded with url: /api/2.0/mlflow-artifacts/artifacts/0/models/m-fc4de28e81294a4d8b5b9d93cdee998c/artifacts/conda.yaml (Caused by ResponseError('too many 500 error responses'))

In [None]:
# Maps each label name to its integer id; the id is the label's position in
# the ordered sequence below, starting at 0.
label_to_number = {
    label: index
    for index, label in enumerate((
        "benign",
        "dirb",
        "wpscan",
        "service_scans",
        "network_scans",
        "privilege_escalation",
        "dnsteal",
        "reverse_shell",
        "service_stop",
        "webshell",
        "cracking",
    ))
}