In [1]:
import os

import pandas as pd
from sklearn.ensemble import IsolationForest
from sqlalchemy import create_engine

# PostgreSQL connection settings.
# The URL is taken from the DATABASE_URL environment variable when it is set,
# so real credentials do not have to live in the source; the literal below is
# only the fallback used when the variable is absent.
DATABASE_URL = os.environ.get(
    "DATABASE_URL", "postgresql://admin:admin@localhost:5432/logdb"
)
engine = create_engine(DATABASE_URL)

# Data loading
def load_data():
    """Read every row of the ``document_logs`` table into a DataFrame.

    The query is executed through the module-level ``engine``.

    Returns:
        The result of ``pd.read_sql`` for ``SELECT * FROM document_logs;``.
    """
    return pd.read_sql("SELECT * FROM document_logs;", engine)

# Data preprocessing
def preprocess_data(data, all_features=None):
    """Build the encoded feature matrix from raw log rows.

    The input frame is left untouched: the derived ``hour`` and ``weekday``
    columns are added to a separate feature frame rather than written back
    into ``data``.

    Args:
        data: DataFrame containing the ``user_id``, ``document_id``,
            ``activity_type`` and ``timestamp`` columns. ``timestamp`` must
            be parseable by ``pd.to_datetime``.
        all_features: Optional sequence of column names. When given, the
            encoded frame is reindexed to exactly these columns, in this
            order; columns missing from the encoding are filled with 0 and
            columns not listed are dropped.

    Returns:
        A new DataFrame produced by ``pd.get_dummies`` over the selected
        columns plus the derived ``hour`` and ``weekday``.

    Raises:
        KeyError: If one of the required columns is missing from ``data``.
    """
    # Parse into a local Series instead of overwriting data['timestamp'].
    timestamps = pd.to_datetime(data['timestamp'])

    # Column order matches the original selection:
    # user_id, document_id, activity_type, hour, weekday.
    features = data[['user_id', 'document_id', 'activity_type']].assign(
        hour=timestamps.dt.hour,
        weekday=timestamps.dt.weekday,
    )
    encoded_features = pd.get_dummies(features)

    # Pin the column set so every call yields the same layout.
    if all_features is not None:
        encoded_features = encoded_features.reindex(columns=all_features, fill_value=0)

    return encoded_features

# Model training
def train_model(data, *, contamination=0.05, random_state=42):
    """Fit an ``IsolationForest`` on the given feature matrix.

    Args:
        data: Feature matrix passed unchanged to ``IsolationForest.fit``.
        contamination: Forwarded to ``IsolationForest``; defaults to the
            previously hard-coded 0.05.
        random_state: Forwarded to ``IsolationForest``; defaults to the
            previously hard-coded 42.

    Returns:
        The fitted ``IsolationForest`` instance.
    """
    model = IsolationForest(contamination=contamination, random_state=random_state)
    model.fit(data)
    return model

# Train the model and save it
import pickle

data = load_data()
processed_data = preprocess_data(data)

# Keep the training-time feature names alongside the model
all_features = processed_data.columns
model = train_model(processed_data)

# Store the trained model and its feature names together in one pickle file
artifact = (model, all_features)
with open("isolation_forest_model.pkl", "wb") as model_file:
    pickle.dump(artifact, model_file)

print("Model and features saved successfully!")


Model and features saved successfully!
