<a href="https://colab.research.google.com/github/nisha1365/precisionFDA/blob/main/model_finding_in_precision.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import torch
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification

# Load the data into a Pandas DataFrame
df = pd.read_csv('your_data.csv')  # Replace 'your_data.csv' with the actual file path

# Separate the model inputs from the label: the patient identifier and the
# label column itself are removed from the feature frame.
X = df.drop(columns=['internalpatientid', 'target'])
y = df['target']

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs. `train_test_split` comes from
# sklearn.model_selection and must be imported at the top of the file.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load the pre-trained BERT tokenizer and model.
# Both are created from the same checkpoint name; the classifier is
# configured with two output labels.
PRETRAINED_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=2,
)

# Tokenize and encode the textual data
def _rows_to_text(frame):
    """Return one space-separated string per row of *frame*.

    ``frame.values.tolist()`` produces a list of raw column values for each
    row, but the tokenizer needs a single string per example. Every value is
    converted with ``str`` and the columns are joined in their existing order.
    """
    return [
        ' '.join(map(str, row))
        for row in frame.itertuples(index=False, name=None)
    ]


def _encode(frame):
    """Tokenize every row of *frame* into padded, truncated PyTorch tensors."""
    return tokenizer.batch_encode_plus(
        _rows_to_text(frame),
        truncation=True,
        padding=True,
        return_tensors='pt'
    )


# NOTE(review): if the feature frame is meant to hold exactly two text columns
# encoded as a sentence pair, pass (text, pair) tuples instead -- confirm the
# intended input layout against the real data.
X_train_encodings = _encode(X_train)
X_test_encodings = _encode(X_test)

# Create PyTorch DataLoader objects
def _build_dataset(encodings, targets):
    """Bundle token ids, attention masks and labels into a TensorDataset.

    Labels are cast to int64 explicitly: the classification loss expects
    integer class indices of that dtype, while the label column may have been
    read as int32, bool or float.
    """
    return TensorDataset(
        encodings['input_ids'],
        encodings['attention_mask'],
        torch.tensor(targets.values, dtype=torch.long)
    )


train_dataset = _build_dataset(X_train_encodings, y_train)
test_dataset = _build_dataset(X_test_encodings, y_test)

# Shuffle only the training batches; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

# Set up the device, then the optimizer.
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Built after the move so the optimizer is constructed over parameters that
# already live on the training device.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Fine-tune for a fixed number of epochs. The held-out split is evaluated
# after every epoch and the result is printed so progress is visible.
epochs = 5
for epoch in range(epochs):
    # Training pass: one optimizer step per mini-batch.
    model.train()
    running_loss = 0.0
    for batch in train_loader:
        input_ids, attention_mask, labels = (tensor.to(device) for tensor in batch)

        optimizer.zero_grad()
        # Passing `labels` makes the model compute the loss itself.
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    # Evaluation pass on the held-out split, without gradient tracking.
    model.eval()
    # Reset every epoch: once the loop ends these lists hold the final
    # epoch's labels and predictions for any code that runs afterwards.
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids, attention_mask, labels = batch
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            preds = torch.argmax(outputs.logits, dim=1)

            # Labels are only compared against predictions, so they are
            # collected directly instead of being moved to the device first.
            all_labels.extend(labels.tolist())
            all_preds.extend(preds.tolist())

    # max(..., 1) avoids a division by zero when the training loader is empty.
    mean_loss = running_loss / max(len(train_loader), 1)
    epoch_accuracy = accuracy_score(all_labels, all_preds)
    print(
        f"Epoch {epoch + 1}/{epochs} - "
        f"train loss: {mean_loss:.4f} - test accuracy: {epoch_accuracy:.4f}"
    )

# Report the fraction of test examples whose predicted class matches the label
accuracy = accuracy_score(y_true=all_labels, y_pred=all_preds)
print(f"Accuracy: {accuracy}")

# Report the label-versus-prediction count table for the same test examples
confusion_mat = confusion_matrix(y_true=all_labels, y_pred=all_preds)
print("Confusion Matrix:", confusion_mat, sep="\n")
