In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, RobertaModel
from torch.optim import AdamW
import matplotlib.pyplot as plt

In [16]:
df_arg = pd.read_csv("Data/argquality_data.csv") 
df_arg = df_arg.drop(["annotator", "#id", "issue", "stance"], axis=1)

# Define mapping from string to float
mapping = {
    "1 (Low)": 1.0,
    "2 (Average)": 2.0,
    "3 (High)": 3.0
}

# List of columns to convert
columns_to_convert = [
    'overall quality', 'local acceptability', 'appropriateness', 'arrangement', 
    'clarity', 'cogency', 'effectiveness', 'global acceptability', 'global relevance', 
    'global sufficiency', 'reasonableness', 'local relevance', 'credibility', 
    'emotional appeal', 'sufficiency'
]

# Replace string values with corresponding float values
df_arg[columns_to_convert] = df_arg[columns_to_convert].replace(mapping)
df_arg_filtered = df_arg.dropna(subset=columns_to_convert)

# Get averaged values
numeric_columns = df_arg_filtered[columns_to_convert].select_dtypes(include='number').columns.tolist()
averaged_df_arg = df_arg_filtered.groupby('argument')[numeric_columns].mean().reset_index()

X = averaged_df_arg["argument"] # Feature
y = averaged_df_arg["cogency"] # Label (adjust if needed) --> overall quality, cogency, effectiveness, reasonableness

  df_arg[columns_to_convert] = df_arg[columns_to_convert].replace(mapping)


# Linear regression model

In [17]:
# Linear regression

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert text data to numerical features using TfidfVectorizer
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train a regression model
model = LinearRegression()
model.fit(X_train_tfidf, y_train)

# Make predictions
y_pred = model.predict(X_test_tfidf)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Example of predicting on new data
#new_data = ["new text to predict"]
#new_data_tfidf = vectorizer.transform(new_data)
#predicted_value = model.predict(new_data_tfidf)
#print(f"Predicted Value: {predicted_value[0]}")

Mean Squared Error: 0.26768348155514116


# Roberta Model

In [18]:
# Hyperparameters
max_len_tokenizer = 256
num_epochs = 4
batch_size = 8
learning_rate = 3e-6

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
val_loss_list = []
train_loss_list = []

# Define dataset class
class TextRegressionDataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        target = self.targets[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.float)
        }

# Initialize tokenizer and model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
base_model = RobertaModel.from_pretrained('roberta-base')


# Custom regression head on top of RoBERTa
class RobertaRegression(torch.nn.Module):
    def __init__(self, base_model):
        super(RobertaRegression, self).__init__()
        self.base_model = base_model
        self.regressor = torch.nn.Linear(base_model.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state
        pooled_output = last_hidden_state[:, 0, :]  # CLS token
        return self.regressor(pooled_output)

model = RobertaRegression(base_model)

# Create dataset objects
full_train_dataset = TextRegressionDataset(X_train.tolist(), y_train.tolist(), tokenizer, max_len=max_len_tokenizer)
test_dataset = TextRegressionDataset(X_test.tolist(), y_test.tolist(), tokenizer, max_len=max_len_tokenizer)

# Split the training dataset into training and validation sets
train_size = int(0.8 * len(full_train_dataset))
val_size = len(full_train_dataset) - train_size
train_dataset, val_dataset = random_split(full_train_dataset, [train_size, val_size])

# DataLoader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Training setup
device = torch.device('mps' if torch.backends.mps.is_available() else 'cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.MSELoss()

# Training loop

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs.squeeze(), targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_loss_list.append(avg_loss)
    
    # Validation
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.squeeze(), targets)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    val_loss_list.append(avg_val_loss)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {avg_loss}, Val Loss: {avg_val_loss}")

# Evaluation on the test set
model.eval()
predictions = []
actuals = []
with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.extend(outputs.squeeze().tolist())
        actuals.extend(targets.tolist())

mse = mean_squared_error(actuals, predictions)
print(f"Mean Squared Error: {mse}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/4, Loss: 1.9338572071148799, Val Loss: 0.9922287804739816
Epoch 2/4, Loss: 0.7213513662035649, Val Loss: 0.33086504680769785
Epoch 3/4, Loss: 0.4641992674710659, Val Loss: 0.2591967007943562
Epoch 4/4, Loss: 0.36394136857527953, Val Loss: 0.2683177888393402
Mean Squared Error: 0.2241039622414447


In [None]:
# Example of predicting on new data
new_text = ["new text to predict"]
encoding = tokenizer(new_text, return_tensors='pt', padding=True, truncation=True, max_length=max_len_tokenizer)
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
predicted_value = outputs.squeeze().item()
print(f"Predicted Value: {predicted_value}")

In [None]:
# Assuming train_loss_list and val_loss_list are your lists of loss values
# and num_epochs is the number of epochs
num_epochs = len(train_loss_list)  # Assuming train_loss_list and val_loss_list have the same length
epochs = list(range(1, num_epochs + 1))

# Plotting the training and validation loss
plt.plot(epochs, train_loss_list, label='Training Loss', marker='o')
plt.plot(epochs, val_loss_list, label='Validation Loss', marker='o')

# Adding labels and title
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# Display the plot
plt.show()


In [None]:
# Calculate the average length in words of entries in the "argument" column of both dataframes
df_avg_length = df['argument'].apply(lambda x: len(x.split())).mean()
averaged_df_arg_avg_length = averaged_df_arg['argument'].apply(lambda x: len(x.split())).mean()

# Print the average lengths
print("Average length in df dataframe:", df_avg_length)
print("Average length in averaged_df_arg dataframe:", averaged_df_arg_avg_length)

# Plot the distributions
plt.figure(figsize=(12, 6))

# Distribution for df dataframe
plt.subplot(1, 2, 1)
df['argument'].apply(lambda x: len(x.split())).hist(bins=30, edgecolor='black')
plt.title('Word Count Distribution in df Dataframe')
plt.xlabel('Word Count')
plt.ylabel('Frequency')

# Distribution for averaged_df_arg dataframe
plt.subplot(1, 2, 2)
averaged_df_arg['argument'].apply(lambda x: len(x.split())).hist(bins=30, edgecolor='black')
plt.title('Word Count Distribution in averaged_df_arg Dataframe')
plt.xlabel('Word Count')
plt.ylabel('Frequency')

plt.tight_layout()
plt.show()
