In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 파일 불러옴
data = pd.read_csv('downloads/dataset_caseid.csv', index_col=0)

# Feature와 Label 분리
X = data[['caseid']] 
y = data['label']

# 데이터 분리: 80% 훈련, 20% 테스트
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# ridge사용, GridSearchCV를 이용한 최적의 파라미터 찾기
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],              # 정규화 강도를 제어
    'solver': ['liblinear', 'lbfgs'],          # 최적화 알고리즘 선택
    'max_iter': [100, 200, 500, 1000]          # 최대 반복 횟수
}

ridge_clf = LogisticRegression(penalty='l2', random_state=42)
grid_search = GridSearchCV(ridge_clf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 출력
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# 테스트 세트 평가
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# 결과 출력
print("Best Hyperparameters:", best_params)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(report)


Best Hyperparameters: {'C': 0.01, 'max_iter': 100, 'solver': 'liblinear'}
Accuracy: 96.95%
Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       191
           1       0.00      0.00      0.00         6

    accuracy                           0.97       197
   macro avg       0.48      0.50      0.49       197
weighted avg       0.94      0.97      0.95       197



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
