In [None]:
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# --- Model and tokenizer ------------------------------------------------------
# Pretrained BERT with a single-logit head (num_labels=1): the one output is
# trained further down with a binary cross-entropy-with-logits criterion.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=1)

# Data source name -> JSON file name (passed to read_data).
data_jasons = {"reddit": "reddit_data_set.json", "wikipedia": "wikipedia_summaries.json", "news": "newspapers_data.json"}

# --- Run switches and checkpoint locations ------------------------------------
is_train = True          # True: fine-tune; False: only evaluate on the validation split
load_model = False       # restore model weights from model_path before running
load_optimizer = False   # restore optimizer state / epoch counter before training
model_path = "/content/bert_classifier.pth"
optimizer_path = "/content/optimizer.pth"
epoch_path = "/content/epoch.pth"
if load_model:
    # map_location lets a checkpoint written on a GPU runtime be restored on a
    # CPU-only runtime (and vice versa) instead of failing during deserialization.
    model.load_state_dict(torch.load(model_path, map_location=device))

model.to(device)
def tokenize_data(texts, labels, max_length=128):
    """Tokenize ``texts`` with the module-level ``tokenizer`` and tensorize ``labels``.

    Every text is truncated / padded to exactly ``max_length`` tokens
    (special tokens included).

    Args:
        texts: Iterable of strings to encode.
        labels: Sequence of numeric labels; converted to a float tensor.
        max_length: Fixed token length of every encoded row.

    Returns:
        Tuple ``(input_ids, attention_masks, labels)``. The first two have
        shape ``(len(texts), max_length)``; ``labels`` is a float tensor.
        An empty ``texts`` yields two empty ``(0, max_length)`` tensors
        instead of raising.
    """
    texts = list(texts)

    if not texts:
        # Nothing to encode: return correctly shaped empty tensors rather than
        # failing on an empty concatenation / empty tokenizer batch.
        empty_ids = torch.empty((0, max_length), dtype=torch.long)
        return empty_ids, empty_ids.clone(), torch.tensor(labels, dtype=torch.float)

    # One batched call replaces the per-text encode_plus loop followed by
    # torch.cat; the encoding options are the same as before.
    encoded = tokenizer(
        texts,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']
    labels = torch.tensor(labels, dtype=torch.float)

    return input_ids, attention_masks, labels

def trainer(epochs_num, model, criterion, device, optimizer, train_dataloader, val_dataloader, threshold=0.5):
    """Fine-tune ``model`` and track train / validation accuracy per epoch.

    Each batch is expected to be ``(input_ids, attention_mask, labels)``.
    The model must return an object with a ``.logits`` attribute; the logits
    are flattened and passed to ``criterion`` together with the flattened labels.

    Side effects: after every epoch the model weights, the optimizer state and
    the number of epochs completed *in this call* are written to
    ``bert_classifier.pth``, ``optimizer.pth`` and ``epoch.pth`` in the current
    working directory.

    Args:
        epochs_num: Number of epochs to run.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Loss taking ``(logits, labels)``, both 1-D.
        device: Device the batches are moved to.
        optimizer: Optimizer already bound to ``model``'s parameters.
        train_dataloader: Loader used for the optimisation steps.
        val_dataloader: Loader used for the per-epoch validation pass.
        threshold: Probability cut-off above which a sample is predicted as
            class 1. It is applied to ``sigmoid(logit)``, not the raw logit.

    Returns:
        ``(train_accuracies, val_accuracies)``: two lists with one float per epoch.
    """
    print(epochs_num)
    train_accuracies = []
    val_accuracies = []
    train_total = len(train_dataloader.dataset)
    val_total = len(val_dataloader.dataset)

    for epoch in tqdm(range(epochs_num)):
        # ---- Training pass ----
        model.train()
        train_correct = 0
        for batch in train_dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch[0], batch[1], batch[2].view(-1)

            optimizer.zero_grad()
            # Labels are deliberately not passed to the model: the loss comes
            # from `criterion`, so the model's built-in loss would be wasted work.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.view(-1)

            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            # Vectorised accuracy. The logits are raw scores, so they go
            # through a sigmoid before being compared with the threshold.
            with torch.no_grad():
                predicted = (torch.sigmoid(logits) > threshold).to(labels.dtype)
                train_correct += (predicted == labels).sum().item()
        train_accuracies.append(train_correct / train_total if train_total else 0.0)

        # ---- Validation pass ----
        model.eval()
        val_correct = 0
        with torch.no_grad():
            for batch in val_dataloader:
                batch = tuple(t.to(device) for t in batch)
                input_ids, attention_mask, labels = batch[0], batch[1], batch[2].view(-1)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits.view(-1)

                predicted = (torch.sigmoid(logits) > threshold).to(labels.dtype)
                val_correct += (predicted == labels).sum().item()
        val_accuracies.append(val_correct / val_total if val_total else 0.0)

        print(f"Epoch {epoch+1}/{epochs_num} - Validation Accuracy: {val_accuracies[-1]:.4f}, Train Accuracy: {train_accuracies[-1]:.4f}")

        # Checkpoint after every epoch so an interrupted run can be resumed.
        torch.save(model.state_dict(), 'bert_classifier.pth')
        torch.save(optimizer.state_dict(), 'optimizer.pth')
        # Stored as a plain int (epochs finished in this call) so the resume
        # code can subtract it directly from the planned epoch count.
        torch.save(epoch + 1, 'epoch.pth')
    return train_accuracies, val_accuracies

def evaluate(model, criterion, device, dataloader, threshold=0.5):
    """Run ``model`` over ``dataloader``, print the accuracy, return the predictions.

    Each batch is expected to be ``(input_ids, attention_mask, labels)`` and
    the model must return an object with a ``.logits`` attribute.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        criterion: Unused; kept so existing call sites keep working.
        device: Device the batches are moved to.
        dataloader: Loader over the samples to score.
        threshold: Probability cut-off above which a sample is predicted as
            class 1. It is applied to ``sigmoid(logit)``, not the raw logit.

    Returns:
        List of ``0`` / ``1`` ints, one per sample, in dataloader order.
    """
    model.eval()
    correct = 0
    predictions = []
    with torch.no_grad():
        for batch in dataloader:
            batch = tuple(t.to(device) for t in batch)
            input_ids, attention_mask, labels = batch[0], batch[1], batch[2].view(-1)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits.view(-1)

            # The logits are raw scores: convert to probabilities before
            # thresholding, and compare the whole batch at once.
            batch_predictions = torch.sigmoid(logits) > threshold
            predictions.extend(batch_predictions.long().tolist())
            correct += (batch_predictions.to(labels.dtype) == labels).sum().item()

    total = len(dataloader.dataset)
    # An empty dataset reports 0.0 instead of raising ZeroDivisionError.
    accuracy = correct / total if total else 0.0
    print(f"Accuracy: {accuracy:.4f}")

    return predictions

# NOTE: read_data is defined in a later cell of this notebook; that cell has to
# be executed before this one, otherwise the call below raises NameError.
data = read_data(data_jasons)

print("data size:", len(data), "\n")

# Fixed seed so the train / validation split is the same on every run.
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
train_texts = [item[0] for item in train_data]
train_labels = [item[1] for item in train_data]
val_texts = [item[0] for item in test_data]
val_labels = [item[1] for item in test_data]

train_inputs, train_masks, train_labels = tokenize_data(train_texts, train_labels)
val_inputs, val_masks, val_labels = tokenize_data(val_texts, val_labels)

batch_size = 32

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

val_data = TensorDataset(val_inputs, val_masks, val_labels)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=False)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
epochs_num = 20
criterion = nn.BCEWithLogitsLoss()

if load_optimizer:
    # load_state_dict needs the deserialized state, not the path to it.
    optimizer.load_state_dict(torch.load(optimizer_path, map_location=device))

    # The epoch checkpoint may be a plain count of completed epochs or a dict
    # keyed by the zero-based index of the last finished epoch.
    saved_epoch = torch.load(epoch_path)
    if isinstance(saved_epoch, dict):
        completed_epochs = max(saved_epoch) + 1
    else:
        completed_epochs = int(saved_epoch)
    # Never go below zero if more epochs were completed than are planned now.
    epochs_num = max(epochs_num - completed_epochs, 0)

if is_train:
    train_accuracies, val_accuraccies = trainer(epochs_num, model, criterion, device, optimizer, train_dataloader, val_dataloader)
    plt.plot(train_accuracies, label='Training Accuracy')
    plt.plot(val_accuraccies, label='Validation Accuracy')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend()
    plt.title('Training and Validation Accuracy')
    plt.savefig('/content/accuracy.png')
    plt.show()

else:
    # evaluate() prints the accuracy itself and returns only the predictions.
    predictions = evaluate(model, criterion, device, val_dataloader)



In [None]:
# prompt: I want to get access in my code to a local file
#
# Colab-only cell: calls google.colab's upload helper so files from the local
# machine can be sent to the runtime. Presumably this is how the JSON data
# files get onto the runtime (confirm they end up where read_data looks).

from google.colab import files

files.upload()



In [None]:
import json

def read_data(data_key_path, base_dir='/content/'):
    """Load ``(text, label)`` pairs from the JSON file of every data source.

    Three layouts are handled, selected by the source name:

    * ``"reddit"``: mapping whose values have ``'human_comment'`` (label 0)
      and ``'AI_comment'`` (label 1).
    * ``"wikipedia"``: mapping whose values have ``'Summary'`` and
      ``'Is AI generated'``; the latter is used as the label unchanged.
    * any other name: mapping from an author key to a list of texts; texts
      under ``"human"`` get label 0, texts under every other key get label 1.

    Args:
        data_key_path: Mapping of source name to JSON file name.
        base_dir: Directory the file names are resolved against.

    Returns:
        List of ``(text, label)`` tuples, in source order.
    """
    import os  # local import so this cell does not depend on another cell's imports

    data = []
    for source, file_name in data_key_path.items():
        # Always read as UTF-8 so every source decodes the same way,
        # independent of the platform's default encoding.
        with open(os.path.join(base_dir, file_name), 'r', encoding='utf-8') as file:
            json_data = json.load(file)

        if source == "reddit":
            for item in json_data.values():
                data.append((item['human_comment'], 0))
                data.append((item['AI_comment'], 1))
        elif source == "wikipedia":
            for item in json_data.values():
                # NOTE(review): the label is taken as stored; presumably it is
                # already 0/1 or a bool. Confirm against the data file.
                data.append((item['Summary'], item['Is AI generated']))
        else:
            for author, texts in json_data.items():
                label = 0 if author == "human" else 1
                data.extend((text, label) for text in texts)
    return data

In [None]:
# Colab-only cell: `files` is the google.colab module imported in an earlier
# cell. Only the optimizer checkpoint is downloaded here; uncomment the other
# lines to also fetch the model weights or the epoch counter.
# files.download('bert_classifier.pth')
files.download('optimizer.pth')
# files.download('epoch.pth')