In [1]:
import pandas as pd
import re
import string
import demoji
from sklearn.model_selection import train_test_split
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:


import pandas as pd

# 1. Load your CSV file. Make sure to use the correct filename.
df = pd.read_csv('data/Suicide_Detection.csv') # <--- IMPORTANT: Use your actual filename here!

# Let's see the columns to confirm they loaded correctly
print("Columns loaded from CSV:", df.columns)

# 2. Rename the 'class' column to 'label' so the rest of the code works.
# The 'text' column is already named correctly.
df = df.rename(columns={'class': 'label'})

# 3. Keep only the columns we need.
df = df[['text', 'label']]

# 4. Create a mapping for YOUR specific labels ('suicide' and 'non-suicide').
label_map = {'suicide': 1, 'non-suicide': 0}
df['label'] = df['label'].map(label_map)

# Remove any rows where the text is missing (just in case)
df.dropna(subset=['text'], inplace=True)

# Let's check the result
print("\nData after processing:")
print(df.head())
print("\nLabel counts:")
print(df['label'].value_counts())

Columns loaded from CSV: Index(['Unnamed: 0', 'text', 'class'], dtype='object')

Data after processing:
                                                text  label
0  Ex Wife Threatening SuicideRecently I left my ...      1
1  Am I weird I don't get affected by compliments...      0
2  Finally 2020 is almost over... So I can never ...      0
3          i need helpjust help me im crying so hard      1
4  I’m so lostHello, my name is Adam (16) and I’v...      1

Label counts:
label
1    116037
0    116037
Name: count, dtype: int64


In [3]:
# --- Final Cell 3: Emoji-to-Text Conversion with Progress Bar ---

import re
import string
import demoji
from tqdm import tqdm

tqdm.pandas()

# --- Part 1: The One-Time Download ---
# This will run first. It won't have a progress bar, so be patient.
# It might take a few minutes, but only happens once.
print("Step 1: Downloading emoji database (one-time process)...")
demoji.download_codes()
print("✅ Download complete.")


def preprocess_text(text):
    # Ensure text is a string, then lowercase it
    text = str(text).lower()
    
    # This is the better method: converting emojis like '😊' to 'smiling face'
    text = demoji.replace_with_desc(text, sep=" ") 
    
    # The rest of the cleaning steps
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'\@\w+|\#','', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text


# --- Part 2: The Text Processing ---
# This is where you will see the progress bar as it processes each row.
# This will likely take several minutes to complete.
print("\nStep 2: Preprocessing text (this will show a progress bar)...")
df['text'] = df['text'].progress_apply(preprocess_text)


print("\n✅ Preprocessing complete.")
df.head()

  demoji.download_codes()


Step 1: Downloading emoji database (one-time process)...
✅ Download complete.

Step 2: Preprocessing text (this will show a progress bar)...


100%|██████████| 232074/232074 [13:44<00:00, 281.43it/s] 


✅ Preprocessing complete.





Unnamed: 0,text,label
0,ex wife threatening suiciderecently i left my ...,1
1,am i weird i dont get affected by compliments ...,0
2,finally 2020 is almost over so i can never hea...,0
3,i need helpjust help me im crying so hard,1
4,i’m so losthello my name is adam 16 and i’ve b...,1


In [4]:
# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42)

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Tokenize the texts
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=128)

In [5]:
class DepressionDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DepressionDataset(train_encodings, train_labels)
val_dataset = DepressionDataset(val_encodings, val_labels)

In [6]:
# Load the model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

trainer.train()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
10,0.6976
20,0.6898
30,0.69
40,0.6842
50,0.6494
60,0.6027
70,0.5719
80,0.5366
90,0.4519
100,0.4562


KeyboardInterrupt: 

In [None]:
model.save_pretrained("./my_model")
tokenizer.save_pretrained("./my_model")
print("Model saved successfully!")