In [4]:
# Cell 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import json
import pickle
import time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix, roc_curve
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import requests
import joblib
import shap
from sklearn.linear_model import LogisticRegression
from transformers import GPT2Tokenizer, GPT2Model, AutoTokenizer, AutoModel
import torch
from tqdm.notebook import tqdm

  from pandas.core import (
  torch.utils._pytree._register_pytree_node(


In [5]:
# Set random seed
np.random.seed(42)
torch.manual_seed(42)

# Set plot style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('Set2')
%matplotlib inline

In [6]:
# Cell 2: Load and prepare data
# Load the processed dataset; the path is relative to the notebook's working
# directory.
df = pd.read_csv('../data/processed/heart_processed.csv')
print(f"Dataset shape: {df.shape}")

# Feature names and descriptions for prompt engineering
# Maps each feature column name to a human-readable description, including the
# meaning of its coded values.
feature_descriptions = {
    'age': 'age in years',
    'sex': 'sex (1 = male; 0 = female)',
    'cp': 'chest pain type (0: typical angina, 1: atypical angina, 2: non-anginal pain, 3: asymptomatic)',
    'trestbps': 'resting blood pressure in mm Hg on admission to the hospital',
    'chol': 'serum cholesterol in mg/dl',
    'fbs': 'fasting blood sugar > 120 mg/dl (1 = true; 0 = false)',
    'restecg': 'resting electrocardiographic results (0: normal, 1: having ST-T wave abnormality, 2: showing probable or definite left ventricular hypertrophy)',
    'thalach': 'maximum heart rate achieved',
    'exang': 'exercise induced angina (1 = yes; 0 = no)',
    'oldpeak': 'ST depression induced by exercise relative to rest',
    'slope': 'the slope of the peak exercise ST segment (0: upsloping, 1: flat, 2: downsloping)',
    'ca': 'number of major vessels (0-3) colored by fluoroscopy',
    'thal': 'thalassemia (1: normal, 2: fixed defect, 3: reversible defect)'
}

# Preview the first rows. This must be the LAST expression in the cell: Jupyter
# only renders the value of a cell's final expression, so a bare `df.head()`
# placed earlier in the cell is evaluated and silently discarded.
df.head()

Dataset shape: (1025, 14)


In [7]:
# Cell 3: Split data
# Separate the label column from the predictor columns.
y = df['target']
X = df.drop(columns='target')

# Two-stage stratified split: hold out 30% of the rows first, then cut that
# hold-out in half to obtain the validation and test sets. Both stages are
# stratified on the label and use the same fixed random_state.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    stratify=y_temp,
    random_state=42,
)

# Report the (rows, columns) shape of each partition.
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

Training set: (717, 13)
Validation set: (154, 13)
Test set: (154, 13)


In [11]:
!pip install openai==0.28

Collecting openai==0.28
  Obtaining dependency information for openai==0.28 from https://files.pythonhosted.org/packages/ae/59/911d6e5f1d7514d79c527067643376cddcf4cb8d1728e599b3b03ab51c69/openai-0.28.0-py3-none-any.whl.metadata
  Downloading openai-0.28.0-py3-none-any.whl.metadata (13 kB)
Downloading openai-0.28.0-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 1.42.0
    Uninstalling openai-1.42.0:
      Successfully uninstalled openai-1.42.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain-openai 0.0.5 requires openai<2.0.0,>=1.10.0, but you have openai 0.28.0 which is incompatible.[0m[31m
[0mSuccessfully installed openai-0.28.0


In [None]:
# NOTE: this entire cell is commented out (disabled). If re-enabled, it would
# load environment variables from a `.env.local` file in the parent of the
# current working directory and report whether OPENAI_API_KEY is set.
# NOTE(review): `from openai import OpenAI` looks like the openai>=1.0 client
# import, which would not match the openai==0.28 pin in the cell above -
# confirm before re-enabling.
# import os
# from dotenv import load_dotenv
# from pathlib import Path
# from openai import OpenAI

# # Get the current working directory
# current_dir = Path(os.getcwd())

# # Get the parent directory path
# parent_dir = current_dir.parent

# # Build the full path to .env.local
# env_path = parent_dir / '.env.local'
# print(f"尝试加载环境变量文件: {env_path}")

# # Load the environment variables
# load_dotenv(dotenv_path=env_path)

# # Check whether the environment variable was loaded
# api_key = os.environ.get("OPENAI_API_KEY")
# print(f"API密钥是否存在: {api_key is not None}")

尝试加载环境变量文件: /Users/quanhongjin/Documents/Cornell Tech/ML4Health/FP/.env.local
API密钥是否存在: True
发送提示到gpt-4 API:
提示长度: 1173字符
示例响应(CVD的概率): As an AI, I don't have the ability to diagnose or predict medical conditions. However, I can tell you that several factors in this patient's information, such as the presence of exercise-induced angina, ST depression induced by exercise, and the
