<a href="https://colab.research.google.com/github/moosemorse/AI_Text_Detector/blob/main/TATG_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [None]:
#gets rid of installation dialogue
%%capture
!pip install transformers
!pip install pytorch
!pip install datasets

In [None]:
import os
import matplotlib.pyplot as plt
from google.colab import files, drive
from datasets import load_dataset
import pandas as pd
import seaborn as sns
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer
import numpy as np
import copy

In [None]:
# Mount Google Drive so files stored there are readable from the notebook.
MOUNT_POINT = '/content/drive'
drive.mount(MOUNT_POINT, force_remount=False)

# Build the CSV path from the mount point so the read does not depend on the
# kernel's current working directory, then load it into the dataframe 'df'.
train_path = os.path.join(MOUNT_POINT, "MyDrive", "GPT-wiki-intro.csv")
df = pd.read_csv(train_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Inspect data

In [None]:
# Summary statistics for the columns of the CSV that describe() covers by default.
# Left as the last expression so the notebook renders it as a rich table
# instead of the plain-text dump that print() produces.
df.describe()

In [None]:
# Preview the first five rows of the raw dataframe.
df.head(n=5)

In [None]:
# Show only the two text columns: the human-written intro and the generated one.
# .loc[rows, columns] selects by label; ':' keeps every row. The former
# .iloc[:] prefix selected all rows positionally and was redundant.
# Left as the last expression so the frame is rendered as a rich (truncated) table.
df.loc[:, ['wiki_intro', 'generated_intro']]

In [None]:
# Compare the length distributions of human-written and AI-written intros.
# One count plot per length column; these may also be useful for evaluation later.
for length_column in ('wiki_intro_len', 'generated_intro_len'):
  sns.countplot(x = length_column, data = df)
  plt.show()

In [None]:
# Maximum value of each column (reduction along the row axis).
df.max(axis=0)

# Dataset

In [None]:
#dataset class inherits dataset module imported from torch
class ChatGPT_Dataset(Dataset):
  """Text-classification dataset built from a CSV of paired introductions.

  The CSV must contain the columns 'generated_intro' and 'wiki_intro'. Every
  row contributes two samples: the generated intro with label 0 and the
  wiki intro with label 1 (1 represents human written, 0 represents generated).

  Args:
    data_path: path of the CSV file to read.
    tokenizer: callable tokenizer accepting the text plus the keyword
      arguments used in __getitem__ and returning a mapping with
      'input_ids' and 'attention_mask' tensors.
    max_token_len: length every sample is truncated / padded to.
  """

  def __init__(self, data_path, tokenizer, max_token_len = 512):
    self.data_path = data_path
    self.tokenizer = tokenizer
    self.max_token_len = max_token_len
    self._prepare_data()

  def _prepare_data(self):
    """Read the CSV and stack both intro columns into one text/label frame."""
    raw = pd.read_csv(self.data_path)
    generated = pd.DataFrame({'text': raw['generated_intro'], 'label': 0})
    wiki = pd.DataFrame({'text': raw['wiki_intro'], 'label': 1})
    # ignore_index gives the stacked frame a unique 0..n-1 index instead of
    # repeating the CSV's row index twice.
    # Labels stay numeric: casting the whole column with str() would replace
    # every label with the printed representation of the Series.
    self.data = pd.concat([generated, wiki], ignore_index=True)

  def __len__(self):
    """Number of samples (two per CSV row)."""
    return len(self.data)

  def __getitem__(self, index):
    """Tokenise one sample.

    Returns:
      dict with 'input_ids' and 'attention_mask' (each flattened to 1-D) and
      'labels', a float tensor of shape (1,) holding 0.0 or 1.0.
    """
    item = self.data.iloc[index]
    text = str(item['text'])
    # Wrap the scalar in a list: torch.FloatTensor(scalar) would interpret the
    # number as a size rather than as the value to store.
    label = torch.tensor([float(item['label'])], dtype=torch.float)
    # Tokenise the text itself, not the whole dataframe row.
    tokens = self.tokenizer(text,
                            add_special_tokens = True,
                            return_tensors ='pt',
                            truncation = True,
                            max_length = self.max_token_len,
                            padding = 'max_length',
                            return_attention_mask = True)

    # return_tensors='pt' yields a leading batch dimension of 1; flatten removes it.
    return {'input_ids': tokens['input_ids'].flatten(),
            'attention_mask': tokens['attention_mask'].flatten(),
            'labels': label }

In [None]:
# Load the tokenizer that was pretrained together with RoBERTa.
model_name = 'roberta-base'
tokenizer = RobertaTokenizer.from_pretrained(model_name)

# Wrap the training CSV in the dataset class and fetch the first sample
# as a quick check that tokenisation works.
ChatGPT_ds = ChatGPT_Dataset(train_path, tokenizer)
ChatGPT_ds[0]

TypeError: ignored

# Creating the model

# Evaluating the model