In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers.trainer_utils import IntervalStrategy
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score


In [None]:

# ================================================================
# 2. LOAD + LIGHT PREPROCESSING
# ================================================================
df = pd.read_csv("/content/finaldataset_posts (1).csv")
print(f"Shape after initial load: {df.shape}")

# Rename 'body' column to 'text' to match expected column name in subsequent steps
df = df.rename(columns={'body': 'text'})
print(f"Shape after renaming column: {df.shape}")

# ---- Light Text Cleaning ----
def clean_text(t):
    if isinstance(t, str):
        t = t.strip()                 # remove leading/trailing spaces
        t = " ".join(t.split())       # fix excessive spaces
    return t

df["text"] = df["text"].apply(clean_text)
print(f"Shape after text cleaning: {df.shape}")

# Remove empty or NaN texts
df = df.dropna(subset=["text"])
print(f"Shape after dropping NaN texts: {df.shape}")
df = df[df["text"].str.len() > 0]
print(f"Shape after removing empty texts: {df.shape}")

# Remove duplicate rows
df = df.drop_duplicates(subset=["text"]).reset_index(drop=True)
print(f"Shape after dropping duplicate texts: {df.shape}")

# ---- Clean 'label' column and Convert labels to numbers ----
df["label"] = df["label"].apply(clean_text) # Clean labels as well
print(f"Unique labels after cleaning: {df['label'].unique()}")

# Filter rows to only include valid numerical labels (1 or 0)
# Based on the output `Unique labels after cleaning: [1 0]`, the labels are already numerical.
valid_labels = [1, 0] # Changed to numerical labels
df = df[df["label"].isin(valid_labels)]
print(f"Shape after filtering for valid labels: {df.shape}")

# The mapping step is removed as labels are already numerical (1 and 0)
# The dropna(subset=["label"]) is no longer needed here as we pre-filtered for valid labels
print(f"Shape after processing labels: {df.shape}")
print(f"Unique labels after processing: {df['label'].unique()}")




















Shape after initial load: (87045, 2)
Shape after renaming column: (87045, 2)
Shape after text cleaning: (87045, 2)
Shape after dropping NaN texts: (87045, 2)
Shape after removing empty texts: (87045, 2)
Shape after dropping duplicate texts: (86508, 2)
Unique labels after cleaning: [1 0]
Shape after filtering for valid labels: (86508, 2)
Shape after processing labels: (86508, 2)
Unique labels after processing: [1 0]


In [None]:
# ================================================================
# 3. BUILD HUGGINGFACE DATASET + TRAIN/TEST SPLIT
# ================================================================
dataset = Dataset.from_pandas(df)

# Check if the dataset is empty before splitting
if len(dataset) == 0:
    raise ValueError("The dataset is empty after preprocessing. Cannot perform train/test split.")

dataset = dataset.train_test_split(test_size=0.2, seed=42)

dataset = DatasetDict({
    "train": dataset["train"],
    "test": dataset["test"],
})


In [None]:
# ================================================================
# 4. TOKENIZATION
# ================================================================
model_name = "roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=True,
        max_length=256
    )

dataset = dataset.map(tokenize, batched=True)

Map:   0%|          | 0/69206 [00:00<?, ? examples/s]

Map:   0%|          | 0/17302 [00:00<?, ? examples/s]

In [None]:
# ================================================================
# 5. LOAD MODEL
# ================================================================
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2
)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# ================================================================
# 6. TRAINING HYPERPARAMETERS (Based on your BALANCE RATE sheet)
# ================================================================
training_args = TrainingArguments(
    output_dir="./roberta_anxiety_classifier",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=3e-5,
    weight_decay=0.05,
    warmup_steps=1000,
    eval_strategy=IntervalStrategy.EPOCH,
    save_strategy=IntervalStrategy.EPOCH,
    logging_steps=50,
    load_best_model_at_end=True,
)

In [31]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from transformers.trainer_utils import IntervalStrategy
import torch
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# ================================================================
# 7. METRICS
# ================================================================
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    acc = accuracy_score(p.label_ids, preds)
    f1 = f1_score(p.label_ids, preds)
    return {
        "accuracy": acc,
        "f1": f1
    }

# ================================================================
# Sample Data for Testing
# ================================================================
print("\nCreating sample data for testing...")

sample_texts = [
    "I feel really overwhelmed and anxious about my upcoming exams.",
    "This is a a beautiful day, and I'm feeling great!",
    "I'm struggling to sleep and constantly worried about everything.",
    "Just enjoying a quiet evening at home.",
    "My heart is racing, and I can't stop thinking about potential problems."
]
sample_labels = [1, 0, 1, 0, 1] # 1 for anxiety-related, 0 for not

sample_df = pd.DataFrame({"text": sample_texts, "labels": sample_labels}) # Renamed 'label' to 'labels' for Hugging Face Trainer compatibility
print(f"Sample DataFrame created:\n{sample_df}")

# Convert to Hugging Face Dataset
sample_dataset = Dataset.from_pandas(sample_df)

# Tokenize the sample dataset using the tokenizer defined earlier
def tokenize(batch):
    return tokenizer(
        batch["text"],
        truncation=True,
        padding=True,
        max_length=256
    )

tokenized_sample_dataset = sample_dataset.map(tokenize, batched=True)
# Remove original 'text' and pandas index column. Keep 'labels' for potential evaluation.
tokenized_sample_dataset = tokenized_sample_dataset.remove_columns(["text"])
tokenized_sample_dataset.set_format("torch")

print("\nTokenized sample dataset created:")
print(tokenized_sample_dataset)
print("Sample input_ids for the first item:", tokenized_sample_dataset[0]["input_ids"])
print("Sample attention_mask for the first item:", tokenized_sample_dataset[0]["attention_mask"])
print("Sample labels for the first item:", tokenized_sample_dataset[0]["labels"])



Creating sample data for testing...
Sample DataFrame created:
                                                text  labels
0  I feel really overwhelmed and anxious about my...       1
1  This is a a beautiful day, and I'm feeling great!       0
2  I'm struggling to sleep and constantly worried...       1
3             Just enjoying a quiet evening at home.       0
4  My heart is racing, and I can't stop thinking ...       1


Map:   0%|          | 0/5 [00:00<?, ? examples/s]


Tokenized sample dataset created:
Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 5
})
Sample input_ids for the first item: tensor([    0,   100,   619,   269, 13203,     8, 13473,    59,   127,  2568,
        15734,     4,     2,     1,     1,     1,     1])
Sample attention_mask for the first item: tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0])
Sample labels for the first item: tensor(1)


In [None]:
# ================================================================
# 8. TRAINER
# ================================================================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


  trainer = Trainer(


In [None]:
# ================================================================
# 9. TRAIN MODEL
# ================================================================
trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.1253,0.174086,0.948272,0.946957
2,0.1017,0.136447,0.955265,0.955446
3,0.0974,0.124117,0.955554,0.955787


TrainOutput(global_step=12978, training_loss=0.14876108750813472, metrics={'train_runtime': 9678.6362, 'train_samples_per_second': 21.451, 'train_steps_per_second': 1.341, 'total_flos': 2.731329554586624e+16, 'train_loss': 0.14876108750813472, 'epoch': 3.0})

In [None]:
# ================================================================
# 10. SAVE MODEL + TOKENIZER
# ================================================================
trainer.save_model("./roberta_anxiety_model_final")
tokenizer.save_pretrained("./roberta_anxiety_model_final")

print("Training complete! Model saved to ./roberta_anxiety_model_final")

Training complete! Model saved to ./roberta_anxiety_model_final


In [33]:
loaded_tokenizer = AutoTokenizer.from_pretrained("./roberta_anxiety_model_final")
loaded_model = AutoModelForSequenceClassification.from_pretrained("./roberta_anxiety_model_final")

print("Tokenizer and Model loaded successfully!")

Tokenizer and Model loaded successfully!


In [37]:
import torch

# Reuse sample_texts from earlier in the notebook
# sample_texts = [
#     "I feel really overwhelmed and anxious about my upcoming exams.",
#     "This is a a beautiful day, and I'm feeling great!",
#     "I'm struggling to sleep and constantly worried about everything.",
#     "Just enjoying a quiet evening at home.",
#     "My heart is racing, and I can't stop thinking about potential problems."
# ]

print("Sample texts for phrase-by-phrase prediction:")
for i, text in enumerate(sample_texts):
    print(f"{i+1}. {text}")

# Tokenize the sample texts
inputs = loaded_tokenizer(sample_texts, return_tensors="pt", padding=True, truncation=True, max_length=256)

# Move inputs to the same device as the model (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)
inputs = {k: v.to(device) for k, v in inputs.items()}

# Make predictions
loaded_model.eval() # Set the model to evaluation mode
with torch.no_grad(): # Disable gradient calculation for inference
    outputs = loaded_model(**inputs)

# Get predicted logits and probabilities
logits = outputs.logits
probabilities = torch.softmax(logits, dim=1)

# Get the predicted labels (0 or 1)
predictions = torch.argmax(logits, dim=1).cpu().numpy()

# Map numerical labels back to meaningful names if needed (based on previous steps, 1 = anxiety, 0 = non-anxiety)
label_map = {1: "Anxiety-Related", 0: "Not Anxiety-Related"}

print("\n--- Prediction Results ---")
for i, text in enumerate(sample_texts):
    predicted_label = predictions[i]
    label_name = label_map[predicted_label]
    confidence_anxiety = probabilities[i][1].item() # Probability for label 1 (Anxiety)
    confidence_non_anxiety = probabilities[i][0].item() # Probability for label 0 (Non-Anxiety)

    print(f"\nText: '{text}'")
    print(f"Predicted Label: {label_name}")
    print(f"Confidence (Anxiety-Related): {confidence_anxiety:.4f}")
    print(f"Confidence (Not Anxiety-Related): {confidence_non_anxiety:.4f}")


Sample texts for phrase-by-phrase prediction:
1. I feel really overwhelmed and anxious about my upcoming exams.
2. This is a a beautiful day, and I'm feeling great!
3. I'm struggling to sleep and constantly worried about everything.
4. Just enjoying a quiet evening at home.
5. My heart is racing, and I can't stop thinking about potential problems.

--- Prediction Results ---

Text: 'I feel really overwhelmed and anxious about my upcoming exams.'
Predicted Label: Anxiety-Related
Confidence (Anxiety-Related): 0.9998
Confidence (Not Anxiety-Related): 0.0002

Text: 'This is a a beautiful day, and I'm feeling great!'
Predicted Label: Not Anxiety-Related
Confidence (Anxiety-Related): 0.0025
Confidence (Not Anxiety-Related): 0.9975

Text: 'I'm struggling to sleep and constantly worried about everything.'
Predicted Label: Anxiety-Related
Confidence (Anxiety-Related): 0.9998
Confidence (Not Anxiety-Related): 0.0002

Text: 'Just enjoying a quiet evening at home.'
Predicted Label: Not Anxiety-Rel

In [42]:
from ipywidgets import Textarea, Button, Output
from IPython.display import display
import torch

# Create a Textarea widget for input
text_input_widget = Textarea(
    value='',
    placeholder='Type your text here...',
    description='Text:',
    disabled=False,
    layout={'width': '80%', 'height': '100px'}
)

# Create a Button widget to trigger prediction
predict_button = Button(description='Get Prediction')

# Create an Output widget to display results
output_widget = Output()

# Define the prediction function
def on_button_click(b):
    with output_widget:
        output_widget.clear_output()
        custom_text = text_input_widget.value

        if not custom_text.strip():
            print("Please enter some text in the text box to get a prediction.")
        else:
            print(f"\n--- Prediction for Your Custom Text ---")
            print(f"Input Text: '{custom_text}'")

            # Tokenize the custom text
            custom_inputs = loaded_tokenizer(custom_text, return_tensors="pt", padding=True, truncation=True, max_length=256)

            # Move inputs to the same device as the model (GPU if available, else CPU)
            device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            loaded_model.to(device)
            custom_inputs = {k: v.to(device) for k, v in custom_inputs.items()}

            # Make prediction
            loaded_model.eval() # Set the model to evaluation mode
            with torch.no_grad(): # Disable gradient calculation for inference
                custom_outputs = loaded_model(**custom_inputs)

            # Get predicted logits and probabilities
            custom_logits = custom_outputs.logits
            custom_probabilities = torch.softmax(custom_logits, dim=1)

            # Get the predicted label (0 or 1)
            custom_prediction = torch.argmax(custom_logits, dim=1).cpu().numpy()[0]

            # Map numerical label back to meaningful name
            label_map = {1: "Anxiety-Related", 0: "Not Anxiety-Related"}
            predicted_label_name = label_map[custom_prediction]

            confidence_anxiety = custom_probabilities[0][1].item() # Probability for label 1 (Anxiety)
            confidence_non_anxiety = custom_probabilities[0][0].item() # Probability for label 0 (Non-Anxiety)

            print(f"Predicted Label: {predicted_label_name}")
            print(f"Confidence (Anxiety-Related): {confidence_anxiety:.4f}")
            print(f"Confidence (Not Anxiety-Related): {confidence_non_anxiety:.4f}")

# Attach the prediction function to the button's click event
predict_button.on_click(on_button_click)

# Display the widgets
display(text_input_widget, predict_button, output_widget)


Textarea(value='', description='Text:', layout=Layout(height='100px', width='80%'), placeholder='Type your tex…

Button(description='Get Prediction', style=ButtonStyle())

Output()

## Testing Model on a Sample from Original Dataset

This section will load the original `finaldataset_posts.csv` file, preprocess it, randomly select 100 rows, and then use the fine-tuned model to make predictions on these selected rows.

In [44]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load the original dataset
test_df_original = pd.read_csv("/content/finaldataset_posts (1).csv")

# Rename 'body' column to 'text' to match expected column name
test_df_original = test_df_original.rename(columns={'body': 'text'})

# ---- Light Text Cleaning (same as training preprocessing) ----
def clean_text(t):
    if isinstance(t, str):
        t = t.strip()                 # remove leading/trailing spaces
        t = " ".join(t.split())       # fix excessive spaces
    return t

test_df_original["text"] = test_df_original["text"].apply(clean_text)

# Remove empty or NaN texts
test_df_original = test_df_original.dropna(subset=["text"])
test_df_original = test_df_original[test_df_original["text"].str.len() > 0]

# Remove duplicate rows (based on text content)
test_df_original = test_df_original.drop_duplicates(subset=["text"]).reset_index(drop=True)

print(f"Shape of preprocessed original dataset: {test_df_original.shape}")

# Select 100 random rows for testing
sample_for_testing = test_df_original.sample(n=100, random_state=42).reset_index(drop=True)

print(f"Selected 100 rows for testing:\n{sample_for_testing.head()}")

# --- Load the saved model and tokenizer ---
# Ensure these are already loaded, if not, uncomment the lines below
# loaded_tokenizer = AutoTokenizer.from_pretrained("./roberta_anxiety_model_final")
# loaded_model = AutoModelForSequenceClassification.from_pretrained("./roberta_anxiety_model_final")

# Tokenize the selected sample texts
# Assuming 'loaded_tokenizer' is available from previous steps
inputs_sample = loaded_tokenizer(
    sample_for_testing["text"].tolist(),
    return_tensors="pt",
    padding=True,
    truncation=True,
    max_length=256
)

# Move inputs to the same device as the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
loaded_model.to(device)
inputs_sample = {k: v.to(device) for k, v in inputs_sample.items()}

# Make predictions
loaded_model.eval()
with torch.no_grad():
    outputs_sample = loaded_model(**inputs_sample)

# Get predicted logits and probabilities
logits_sample = outputs_sample.logits
probabilities_sample = torch.softmax(logits_sample, dim=1)

# Get the predicted labels
predictions_sample = torch.argmax(logits_sample, dim=1).cpu().numpy()

# Map numerical labels back to meaningful names
label_map = {1: "Anxiety-Related", 0: "Not Anxiety-Related"}

print("\n--- Predictions for 100 Sample Rows ---")
for i, text in enumerate(sample_for_testing["text"]):
    predicted_label_val = predictions_sample[i]
    predicted_label_name = label_map[predicted_label_val]
    confidence_anxiety = probabilities_sample[i][1].item()
    confidence_non_anxiety = probabilities_sample[i][0].item()

    print(f"\nText: '{text}'")
    print(f"Predicted Label: {predicted_label_name}")
    print(f"Confidence (Anxiety-Related): {confidence_anxiety:.4f}")
    print(f"Confidence (Not Anxiety-Related): {confidence_non_anxiety:.4f}")


Shape of preprocessed original dataset: (86508, 2)
Selected 100 rows for testing:
                                                text  label
0  Tw: mild non detailed mention of abuse I felt ...      0
1  I was just diagnosed with ADHD in my late twen...      1
2  My favorite juice was finished so I took orang...      1
3  My mom's relationship isn't good and hasn't be...      1
4  My body is always on high alert still. I jump ...      1

--- Predictions for 100 Sample Rows ---

Text: 'Tw: mild non detailed mention of abuse I felt the need to tell my story. CPS wants me to go back to my abusive household though, because of my age they can't force me. Their end goal is me repairing the relationship as if it's just a quick fix. They tried to gaslight me once they dug up my diagnosis of DID, saying implying I was too mentally ill, and that the things I remembered weren't real. They have implied my story is inconsistent (it isn't). They insist on top of this they never said I was lying but

## Save Predictions to Excel File

This section will compile the predictions for the 100 sample rows into a Pandas DataFrame and then save it as an Excel file.

In [45]:
import pandas as pd

# Prepare data for DataFrame
results = []
label_map = {1: "Anxiety-Related", 0: "Not Anxiety-Related"}

for i, text in enumerate(sample_for_testing["text"]):
    predicted_label_val = predictions_sample[i]
    predicted_label_name = label_map[predicted_label_val]
    confidence_anxiety = probabilities_sample[i][1].item()
    confidence_non_anxiety = probabilities_sample[i][0].item()

    results.append({
        "Original Text": text,
        "Predicted Label": predicted_label_name,
        "Confidence (Anxiety-Related)": f"{confidence_anxiety:.4f}",
        "Confidence (Not Anxiety-Related)": f"{confidence_non_anxiety:.4f}"
    })

# Create DataFrame
predictions_df = pd.DataFrame(results)

# Define the output Excel file path
output_excel_path = "./sample_predictions.xlsx"

# Save DataFrame to Excel
predictions_df.to_excel(output_excel_path, index=False)

print(f"Predictions for 100 sample rows saved to '{output_excel_path}'")
print("First 5 rows of the saved predictions:")
display(predictions_df.head())


Predictions for 100 sample rows saved to './sample_predictions.xlsx'
First 5 rows of the saved predictions:


Unnamed: 0,Original Text,Predicted Label,Confidence (Anxiety-Related),Confidence (Not Anxiety-Related)
0,Tw: mild non detailed mention of abuse I felt ...,Not Anxiety-Related,0.0819,0.9181
1,I was just diagnosed with ADHD in my late twen...,Anxiety-Related,0.9998,0.0002
2,My favorite juice was finished so I took orang...,Anxiety-Related,0.9998,0.0002
3,My mom's relationship isn't good and hasn't be...,Anxiety-Related,0.6902,0.3098
4,My body is always on high alert still. I jump ...,Anxiety-Related,0.9998,0.0002
