In [32]:
# Import libraries

import pandas as pd
import numpy as np
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset, RandomSampler, SequentialSampler


In [2]:
# Load data from csv files

df_gpt4 = pd.read_csv("/Users/maxschaffelder/Desktop/Thesis/Data/GPT4/data_GPT4_para1.csv")
df_human = pd.read_csv("/Users/maxschaffelder/Desktop/Thesis/Data/Human/data_human_para2.csv")

# Remove unnecessary columns
df_gpt4 = df_gpt4.drop(["prompt"], axis=1)

# Add labels for classification
df_gpt4["label"] = 1
df_human["label"] = 0

# Combine human and GPT datasets into two datasets, one for gpt3.5 and one for gpt4
df_4 = pd.concat([df_gpt4, df_human])

df_4.head()


Unnamed: 0.1,Unnamed: 0,content,label
0,0,Title: Feminist Standpoint Theory: Women's Rol...,1
1,1,Title: Mitigating Gentrification: How Creative...,1
2,2,"Title: Dreamers, Drugs, and Duties: How Racist...",1
3,3,Title: Bharti Airtel Business Methods\n\nIntro...,1
4,4,Title: Did Popper Solve the Problem of Inducti...,1


In [3]:
# Shuffle datasets

#df_4 = df_4.sample(frac=1, random_state=42)
df_4 = df_4.reset_index(drop=True)

df_4.head()

Unnamed: 0.1,Unnamed: 0,content,label
0,0,Title: Feminist Standpoint Theory: Women's Rol...,1
1,1,Title: Mitigating Gentrification: How Creative...,1
2,2,"Title: Dreamers, Drugs, and Duties: How Racist...",1
3,3,Title: Bharti Airtel Business Methods\n\nIntro...,1
4,4,Title: Did Popper Solve the Problem of Inducti...,1


In [4]:
# Separate data into paragraphs

def separate_paragraphs(df, essay_column_name):
    essays_by_paragraph = []

     # Create a list to store the parsed paragraphs and essay indices
    parsed_data = []

    # Iterate over the essays in the specified column
    for essay_index, essay in enumerate(df[essay_column_name]):
        
        # Separate the essay into paragraphs
        paragraphs = essay.split('\n')

        # Create a list of tuples with the essay index and paragraph text
        for paragraph in paragraphs:
            parsed_data.append((essay_index, paragraph))

    # Create a new DataFrame with the parsed paragraphs and essay indices
    parsed_df = pd.DataFrame(parsed_data, columns=['essay_index', 'paragraph'])

    # Merge the original DataFrame with the parsed DataFrame
    merged_df = df.merge(parsed_df, left_index=True, right_on='essay_index')

    # Drop the essay_index column as it is not needed anymore
    merged_df.drop('essay_index', axis=1, inplace=True)

    return merged_df


df_4 = separate_paragraphs(df_4, "content")
df_4.head()


Unnamed: 0.1,Unnamed: 0,content,label,paragraph
0,0,Title: Feminist Standpoint Theory: Women's Rol...,1,Title: Feminist Standpoint Theory: Women's Rol...
1,0,Title: Feminist Standpoint Theory: Women's Rol...,1,
2,0,Title: Feminist Standpoint Theory: Women's Rol...,1,Introduction
3,0,Title: Feminist Standpoint Theory: Women's Rol...,1,
4,0,Title: Feminist Standpoint Theory: Women's Rol...,1,Feminist standpoint theory is an interdiscipli...


In [5]:
# Function to tokenize all data in the dataframe, by paragraph, making each entry a maximum of 512 tokens, 
# and adding padding if it's shorter

def tokenize_df(df):
    
    # import tokenizer
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 
    
    # initialize columns of new df
    tokenized_df_columns = ["tokenized text", "attention mask", "label"]
    
    # initialize new df for tokenized data
    tokenized_df = pd.DataFrame({col: [] for col in tokenized_df_columns})
    
    # temporary variable storing tokenized text
    combined_paragraph_tokens = []
    combined_paragraph_attention_masks = []
    
    # variable storing the current essay number in order to not combine different essays
    current_essay_nr = df["Unnamed: 0"][0] # initial value is first essay in list
    
    max_len = 512 # roberta model has a maximum input size of 512
    
    # Looping through original df (non-tokenized), getting paragraphs, labels, and essay number
    for paragraph, label, essay_nr in zip(df["paragraph"], df["label"], df["Unnamed: 0"]):

        # encoding the current paragraph
        paragraph_tokens = tokenizer.encode(paragraph) # tokens
        paragraph_attention_mask = tokenizer(paragraph)["attention_mask"] # attention mask
        
        # checking that 512 token length is not surpassed
        if len(combined_paragraph_tokens) + len(paragraph_tokens) <= max_len and essay_nr == current_essay_nr:
            #add new tokens
            combined_paragraph_tokens = combined_paragraph_tokens + paragraph_tokens 
            # add new attention maskss
            combined_paragraph_attention_masks = combined_paragraph_attention_masks + paragraph_attention_mask 
            
        # else if it would be too long, add entry and start new one
        elif len(combined_paragraph_tokens) + len(paragraph_tokens) > max_len:
            
            # add padding
            padding_amount = max_len - len(combined_paragraph_tokens)
            padding = [1 for i in range(padding_amount)]
            padding_attention_mask = [0 for i in range(padding_amount)]
            
            combined_paragraph_tokens = combined_paragraph_tokens + padding
            combined_paragraph_attention_masks = combined_paragraph_attention_masks + padding_attention_mask
            
            new_entry = {'tokenized text': combined_paragraph_tokens, "attention mask": combined_paragraph_attention_masks, 'label': label}
            tokenized_df.loc[len(tokenized_df)] = new_entry
            combined_paragraph_tokens = []
            combined_paragraph_attention_masks = []
            
        # else if a new essay has started
        elif essay_nr != current_essay_nr:
            
            # add padding
            padding_amount = max_len - len(combined_paragraph_tokens)
            padding = [1 for i in range(padding_amount)]
            padding_attention_mask = [0 for i in range(padding_amount)]
            combined_paragraph_tokens = combined_paragraph_tokens + padding
            combined_paragraph_attention_masks = combined_paragraph_attention_masks + padding_attention_mask
            
            new_entry = {'tokenized text': combined_paragraph_tokens, "attention mask": combined_paragraph_attention_masks, 'label': label}
            tokenized_df.loc[len(tokenized_df)] = new_entry
            combined_paragraph_tokens = []
            combined_paragraph_attention_masks = []
            current_essay_nr = essay_nr
           
    return tokenized_df


In [6]:
tokenized_df4 = tokenize_df(df_4)
tokenized_df4.head()

  element = np.asarray(element)
Token indices sequence length is longer than the specified maximum sequence length for this model (520 > 512). Running this sequence through the model will result in indexing errors


Unnamed: 0,tokenized text,attention mask,label
0,"[0, 46525, 35, 43588, 13371, 2300, 26305, 35, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[0, 2, 0, 597, 20554, 661, 14922, 6680, 34, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[0, 2, 0, 9167, 1517, 2577, 17799, 14922, 6680...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,"[0, 2, 0, 46576, 2, 0, 2, 0, 534, 1342, 34136,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,"[0, 2, 0, 246, 4, 31392, 5206, 11176, 3109, 44...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [9]:
# Remove rows where the list only contains 1s or starts with [0, 2] and contains only 1s afterwards
tokenized_df4 = tokenized_df4.loc[~tokenized_df4['tokenized text'].apply(lambda x: all(e == 1 for e in x) or (len(x) >= 2 and x[:2] == [0, 2] and all(e == 1 for e in x[2:]))), :]

# Print the resulting DataFrame
len(tokenized_df4["tokenized text"])


596

In [10]:
# Split data into training and test sets
train_data, test_data = train_test_split(tokenized_df4, test_size=0.2, random_state=42)

# Split the test set further into validation and test sets
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)


In [14]:
len(train_data["tokenized text"])

476

In [15]:
lengths = []

print(len(tokenized_df4))
#for i in tokenized_df["tokenized_text"]:
 #   print(i)
    
#print(tokenized_df["tokenized text"][60])    
#print(tokenized_df["attention mask"][60])
tokenized_df4.head(5)

596


Unnamed: 0,tokenized text,attention mask,label
0,"[0, 46525, 35, 43588, 13371, 2300, 26305, 35, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
1,"[0, 2, 0, 597, 20554, 661, 14922, 6680, 34, 11...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
2,"[0, 2, 0, 9167, 1517, 2577, 17799, 14922, 6680...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
3,"[0, 2, 0, 46576, 2, 0, 2, 0, 534, 1342, 34136,...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1
4,"[0, 2, 0, 246, 4, 31392, 5206, 11176, 3109, 44...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1


In [23]:
len(train_data)

476

In [30]:
# Create model (ADD TO OTHER CODE)

model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)




Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [20]:
# Convert pd dataframes into pytorch tensors

train_inputs = torch.tensor(train_data["tokenized text"].tolist())
train_masks = torch.tensor(train_data["attention mask"].tolist())
train_labels = torch.tensor(train_data["label"].tolist())


validation_inputs = torch.tensor(val_data["tokenized text"].tolist())
validation_masks = torch.tensor(val_data["attention mask"].tolist())
validation_labels = torch.tensor(val_data["label"].tolist())


In [26]:
# Create dataloader

batch_size = 16 # try different values

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)



In [None]:
epochs = 3
total_steps = len(train_dataloader) * epochs
optimizer = optim.AdamW(model.parameters(), lr=2e-5, eps=1e-8)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print_every = 100

# Training loop
for epoch in range(epochs):

    model.train()
    
    for step, batch in enumerate(train_dataloader):
        
        # Unpack the batch into input tensors, attention masks, and labels
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Clear any previously calculated gradients
        model.zero_grad()

        # Perform a forward pass through the model and compute the loss
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Perform a backward pass to compute gradients
        loss.backward()

        # Clip the gradients to prevent exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the model's weights
        optimizer.step()

        # Update the learning rate
        scheduler.step()
        
    model.eval()
    eval_loss = 0.0
    for batch in validation_dataloader:
        # Unpack the batch into input tensors, attention masks, and labels
        input_ids = batch[0].to(device)
        attention_mask = batch[1].to(device)
        labels = batch[2].to(device)

        # Disable gradient calculations, as they are not needed for evaluation
        with torch.no_grad():
            # Perform a forward pass through the model and compute the loss
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Accumulate the evaluation loss
            eval_loss += loss.item()

    # Compute and print the average validation loss
    avg_eval_loss = eval_loss / len(validation_dataloader)
    print(f"Epoch {epoch+1}, Validation Loss: {avg_eval_loss}")

            

In [50]:
input_text = "This is an example sentence."

tokenizer = RobertaTokenizer.from_pretrained('roberta-base') 

# Using encode()
token_ids = tokenizer.encode(input_text)

# Using __call__()
encoded_input = tokenizer(input_text)

tokenizer.decode([0, 2, 0, 10462, 6, 8281, 3649, 14, 190, 114, 84, 5717, 32, 11359, 30, 26739, 5588, 8, 9186, 2433, 6, 51, 32, 45, 37771, 2368, 3030, 30, 106, 4, 91, 10648, 14, 2172, 64, 202, 33, 7017, 2640, 13, 49, 5717, 114, 49, 5717, 32, 9473, 39938, 877, 8, 1403, 12, 11847, 4, 2, 0, 2, 0, 33837, 8841, 2, 0, 2, 0, 133, 1050, 17874, 6, 215, 25, 16797, 8, 35638, 6, 67, 1693, 1142, 59, 5, 29988, 9, 36471, 1809, 19, 99, 52, 216, 59, 1050, 14766, 4, 1216, 17874, 3608, 14, 84, 5717, 8, 2163, 32, 11359, 30, 10, 3143, 9, 592, 6, 4106, 6, 8, 3039, 2433, 4, 2, 0, 2, 0, 10462, 6, 8281, 10648, 14, 190, 114, 84, 5717, 32, 11359, 30, 6731, 2433, 6, 51, 32, 45, 37771, 2368, 3030, 30, 106, 4, 91, 3649, 14, 2172, 33, 5, 476, 7, 4227, 15, 49, 3266, 8, 2807, 7, 1760, 11, 10753, 19, 106, 6, 61, 2386, 106, 7, 146, 5717, 14, 32, 45, 37771, 2368, 3030, 30, 6731, 2433, 4, 2, 0, 2, 0, 48984, 2, 0, 2, 0, 1121, 6427, 6, 5, 4286, 9, 481, 40, 34, 57, 19639, 13, 11505, 6, 8, 5, 36471, 1217, 9, 481, 40, 34, 57, 6835, 30, 26948, 1809, 4, 6903, 6, 5, 7404, 9, 7017, 2640, 1302, 7, 25696, 5, 240, 13, 23732, 50, 12754, 4620, 9, 1218, 50, 45832, 4, 635, 6, 1738, 8281, 34, 3751, 7, 27389, 36471, 1809, 19, 7017, 2640, 396, 9364, 7, 23732, 50, 12754, 4620, 9, 1218, 50, 45832, 4, 2, 0, 2, 0, 530, 1728, 10648, 14, 9473, 39938, 5073, 8, 1403, 12, 11847, 2163, 32, 2139, 13, 2172, 7, 33, 7017, 2640, 13, 49, 5717, 8, 2163, 4, 91, 3649, 14, 2172, 33, 5, 476, 7, 146, 5717, 14, 32, 45, 37771, 2368, 3030, 30, 6731, 2433, 6, 61, 2386, 106, 7, 33, 7017, 2640, 13, 49, 2163, 4, 2, 0, 2, 0, 5771, 8281, 17, 27, 29, 2120, 7, 27389, 36471, 1809, 19, 7017, 2640, 7700, 1142, 59, 5, 29988, 9, 42, 1217, 19, 99, 52, 216, 59, 1050, 14766, 11, 5, 2297, 2166, 6, 12243, 6, 8, 1050, 17874, 6, 39, 7576, 3608, 14, 2172, 33, 5, 476, 7, 146, 5717, 14, 32, 45, 37771, 2368, 3030, 30, 6731, 2433, 4, 2, 0, 2, 0, 28965, 6, 5, 864, 9, 549, 10, 36471, 1217, 9, 481, 40, 7980, 7017, 2640, 64, 28, 156, 37806, 4748, 396, 9364, 7, 23732, 50, 12754, 4620, 9, 1218, 50, 45832, 1189, 10, 5674, 9, 2625, 11, 10561, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

#encoded_input["attention_mask"]



'<s></s><s>However, Kane suggests that even if our choices are influenced by neural processes and genetic factors, they are not causally determined by them. He argues that individuals can still have ultimate responsibility for their choices if their choices are indeterminate and self-forming.</s><s></s><s>Human Sciences</s><s></s><s>The human sciences, such as psychology and sociology, also raise questions about the compatibility of libertarianism with what we know about human beings. These sciences suggest that our choices and actions are influenced by a variety of social, cultural, and environmental factors.</s><s></s><s>However, Kane argues that even if our choices are influenced by external factors, they are not causally determined by them. He suggests that individuals have the power to reflect on their values and choose to act in accordance with them, which allows them to make choices that are not causally determined by external factors.</s><s></s><s>Conclusion</s><s></s><s>In con

In [103]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('roberta-base')

text = "This is an example."

# Tokenize the text and add padding
encoded = tokenizer.encode(text, padding='max_length', max_length=10, truncation=True)

print(encoded)


[0, 713, 16, 41, 1246, 4, 2, 1, 1, 1]
