**Import libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import accelerate
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, DataLoader


  from .autonotebook import tqdm as notebook_tqdm


**Load Dataset and Pre-process**

In [3]:
# Load the raw Reddit comment dumps, one CSV per party.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs elsewhere.
df_republicans = pd.read_csv(r'D:\dev\work\dataset\uspolit\reddit_opinion_republican.csv')
df_democrats = pd.read_csv(r'D:\dev\work\dataset\uspolit\reddit_opinion_democrats.csv')

# Tag every row with the class label of the file it came from.
df_republicans['label'] = 0  # Label for Republicans
df_democrats['label'] = 1    # Label for Democrats

# Combine both parties into one frame, keeping only the comment text and label.
df = pd.concat([df_republicans[['self_text', 'label']], df_democrats[['self_text', 'label']]], ignore_index=True)

# Replace missing comments with '' BEFORE casting to str. Casting first turns
# NaN into the literal text 'nan', after which fillna('') has nothing left to
# fill and the model would be trained on bogus 'nan' comments.
df['self_text'] = df['self_text'].fillna('').astype(str)

# Split into training and testing sets (80/20, fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(df['self_text'], df['label'], test_size=0.2, random_state=42)

# Initialize RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

  df_republicans = pd.read_csv(r'D:\dev\work\dataset\uspolit\reddit_opinion_republican.csv')


In [None]:
#print(df['self_text'].head())
#print(df['self_text'].dtype)

0    He's a preening city boy, a germaphobe, a narc...
1    Bob’s citation is the rest of the basement dwe...
2    Thread 69 was dumb. It was just a few thousand...
3    To be fair Israel does got a lot more, but act...
4    I don’t suspect many low-info voters read past...
Name: self_text, dtype: object
object


**Create custom Dataset class and load Data**

In [6]:
# Custom Dataset class for loading data
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenized texts with integer class labels.

    Every text is tokenized once, eagerly, in ``__init__``; ``__getitem__``
    only converts the stored per-sample values to tensors.
    """

    def __init__(self, texts, labels, text_tokenizer=None, max_length=256):
        """Tokenize ``texts`` and keep ``labels`` alongside them.

        Args:
            texts: Sequence of strings to encode.
            labels: Sequence of class ids, one per text, indexable by position.
            text_tokenizer: Callable invoked as
                ``text_tokenizer(texts, truncation=True, padding=True,
                max_length=max_length)`` and returning a mapping from feature
                name to per-sample sequences. Defaults to the module-level
                ``tokenizer``, which preserves the original two-argument usage.
            max_length: Truncation length forwarded to the tokenizer.

        Raises:
            ValueError: If ``texts`` and ``labels`` differ in length.
        """
        # Fail fast: a mismatch would otherwise surface only as an IndexError
        # part-way through an epoch, or as samples that are silently ignored.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
            )
        if text_tokenizer is None:
            # Fall back to the notebook-level tokenizer.
            text_tokenizer = tokenizer
        self.encodings = text_tokenizer(texts, truncation=True, padding=True, max_length=max_length)
        self.labels = labels

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors, including a ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

# Sanity check on the training split: container type, size, and first five rows.
print(type(X_train), len(X_train), X_train.head(5))

# Build the train and test datasets the same way: plain Python lists of texts
# and labels are handed to SentimentDataset, training split first.
train_dataset, test_dataset = (
    SentimentDataset(split_texts.tolist(), split_labels.tolist())
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)


<class 'pandas.core.series.Series'> 5985544 541133              I still prefer Jesus is a friend of Mine
5119008    I get that hypocrisy is a cornerstone of Trump...
113171     It's sad you gotta be against comedy to be a D...
431053                                    “I live in DC now”
6858565    If laws did not exist I would 200% shoot out m...
Name: self_text, dtype: object


**Model initialisation and training**

In [7]:
# Load the pretrained roberta-base checkpoint with a sequence-classification head.
model = RobertaForSequenceClassification.from_pretrained('roberta-base')

# Hyperparameters and bookkeeping for the Hugging Face Trainer.
training_args = TrainingArguments(
    # Where artefacts and logs are written.
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    # Schedule: three passes over the data, evaluating after each one.
    num_train_epochs=3,
    evaluation_strategy="epoch",
    # Batch sizes per device for training and evaluation.
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    # Optimiser settings.
    warmup_steps=500,
    weight_decay=0.01,
)

# Wire the model, arguments and the two datasets together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Kick off fine-tuning.
trainer.train()

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  0%|          | 10/140289 [05:27<1188:38:30, 30.50s/it]

{'loss': 0.6921, 'grad_norm': 0.46177029609680176, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.0}


  0%|          | 20/140289 [10:26<1160:37:33, 29.79s/it]

{'loss': 0.6932, 'grad_norm': 1.1846706867218018, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.0}


  0%|          | 30/140289 [15:25<1163:20:09, 29.86s/it]

{'loss': 0.6893, 'grad_norm': 0.48050183057785034, 'learning_rate': 3e-06, 'epoch': 0.0}


  0%|          | 40/140289 [20:25<1172:21:33, 30.09s/it]

{'loss': 0.6861, 'grad_norm': 0.4246169328689575, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.0}


  0%|          | 50/140289 [25:25<1168:11:10, 29.99s/it]

{'loss': 0.6825, 'grad_norm': 0.6301615238189697, 'learning_rate': 5e-06, 'epoch': 0.0}


  0%|          | 60/140289 [30:23<1171:42:23, 30.08s/it]

{'loss': 0.6816, 'grad_norm': 0.36588525772094727, 'learning_rate': 6e-06, 'epoch': 0.0}


  0%|          | 70/140289 [35:24<1169:29:17, 30.03s/it]

{'loss': 0.6801, 'grad_norm': 0.8127037882804871, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.0}


  0%|          | 80/140289 [40:24<1172:59:59, 30.12s/it]

{'loss': 0.681, 'grad_norm': 0.8103258609771729, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.0}


  0%|          | 90/140289 [45:28<1183:15:45, 30.38s/it]

{'loss': 0.6828, 'grad_norm': 0.6521996259689331, 'learning_rate': 9e-06, 'epoch': 0.0}


  0%|          | 100/140289 [50:39<1220:36:48, 31.34s/it]

{'loss': 0.6803, 'grad_norm': 0.8225775957107544, 'learning_rate': 1e-05, 'epoch': 0.0}


  0%|          | 110/140289 [55:45<1186:19:42, 30.47s/it]

{'loss': 0.6833, 'grad_norm': 1.6965949535369873, 'learning_rate': 1.1000000000000001e-05, 'epoch': 0.0}


  0%|          | 120/140289 [1:00:49<1174:05:54, 30.15s/it]

{'loss': 0.679, 'grad_norm': 0.8235664367675781, 'learning_rate': 1.2e-05, 'epoch': 0.0}


  0%|          | 130/140289 [1:05:53<1185:20:11, 30.45s/it]

{'loss': 0.6691, 'grad_norm': 2.1199426651000977, 'learning_rate': 1.3000000000000001e-05, 'epoch': 0.0}


  0%|          | 140/140289 [1:10:59<1189:25:33, 30.55s/it]

{'loss': 0.6713, 'grad_norm': 1.9448823928833008, 'learning_rate': 1.4000000000000001e-05, 'epoch': 0.0}


  0%|          | 142/140289 [1:11:59<1182:49:53, 30.38s/it]

KeyboardInterrupt: 

**Model Evaluation & Confusion Matrix**

In [None]:
# Run the trained model on the held-out split and take the highest-scoring
# class per sample as the predicted label.
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Confusion matrix: rows are the actual party, columns the predicted one.
class_names = ['Republican', 'Democrat']
c_matrix = confusion_matrix(y_test, pred_labels)
ax = sns.heatmap(
    c_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    cbar=False,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set(xlabel="Prediction", ylabel="Actual", title="Confusion Matrix")
plt.show()

# Accuracy: fraction of test samples whose predicted label equals the true one.
n_correct = (pred_labels == y_test).sum()
accuracy = n_correct / len(y_test)
print(f"Accuracy: {accuracy:.4f}")