# Data

In [None]:
import kagglehub
import pandas as pd
import os

In [None]:
# Download the latest version of the dataset and report where it was stored
path = kagglehub.dataset_download("nikhileswarkomati/suicide-watch")

print("Path to dataset files:", path)

# Locate a CSV file inside the downloaded directory. Sorting makes the choice
# deterministic, and failing here gives a clear message instead of a NameError
# on `csv_file_path` further down when the directory holds no CSV.
csv_files = sorted(name for name in os.listdir(path) if name.endswith(".csv"))
if not csv_files:
    raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
csv_file_path = os.path.join(path, csv_files[0])

# Read the CSV file into a pandas DataFrame
df = pd.read_csv(csv_file_path)

# Drop the unnamed index column if the CSV carries one
if 'Unnamed: 0' in df.columns:
    df = df.drop('Unnamed: 0', axis=1)

print(df.head())

# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
df.info()
print(df.describe())
print(df.shape)
print(df.columns)
# A bare expression in the middle of a cell is not displayed; print it explicitly.
print(df.dtypes)

# Convert 'class' column to numerical representation (suicide -> 1, non-suicide -> 0).
# Series.map() turns any value missing from the mapping into NaN, so check the
# labels first rather than letting bad rows reach the models silently.
label_map = {'suicide': 1, 'non-suicide': 0}
unexpected_labels = set(df['class'].unique()) - set(label_map)
if unexpected_labels:
    raise ValueError(f"Unexpected values in 'class' column: {unexpected_labels}")
df['class'] = df['class'].map(label_map)

# Display the updated DataFrame to verify the changes
print(df.head())

Path to dataset files: /kaggle/input/suicide-watch
                                                text        class
0  Ex Wife Threatening SuicideRecently I left my ...      suicide
1  Am I weird I don't get affected by compliments...  non-suicide
2  Finally 2020 is almost over... So I can never ...  non-suicide
3          i need helpjust help me im crying so hard      suicide
4  I’m so lostHello, my name is Adam (16) and I’v...      suicide
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232074 entries, 0 to 232073
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   text    232074 non-null  object
 1   class   232074 non-null  object
dtypes: object(2)
memory usage: 3.5+ MB
None
                                                     text    class
count                                              232074   232074
unique                                             232074        2
top     I still haven't beaten the first boss in 

# Logistic Regression

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [None]:
# Baseline classifier: TF-IDF features fed into logistic regression.

# Features are the post text; the target is the 'class' column
X, y = df['text'], df['class']

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Cap the TF-IDF vocabulary at 5000 features for performance
vectorizer = TfidfVectorizer(max_features=5000)

# The vectorizer is fitted on the training split only and then reused as-is
# on the test split
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Fit logistic regression with default settings (fit() returns the estimator)
model = LogisticRegression().fit(X_train_vec, y_train)

# Score the held-out split
y_pred = model.predict(X_test_vec)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred))

Accuracy: 0.93635678121297
              precision    recall  f1-score   support

           0       0.93      0.94      0.94     23287
           1       0.94      0.93      0.94     23128

    accuracy                           0.94     46415
   macro avg       0.94      0.94      0.94     46415
weighted avg       0.94      0.94      0.94     46415



In [None]:
# Example prediction for new text using the TF-IDF vectorizer and logistic
# regression model fitted above. The printed value is the encoded class
# (1 = suicide, 0 = non-suicide, per the label mapping in the Data section).
# NOTE: `model` is rebound to the RoBERTa classifier later in this notebook, so
# this cell only gives the baseline's answer when run before that section.
new_text = ["I am feeling okay."]
new_text_vec = vectorizer.transform(new_text)  # reuse the fitted vectorizer; no refit
prediction = model.predict(new_text_vec)
print(f"Prediction for '{new_text[0]}': {prediction[0]}")

Prediction for 'I am feeling okay.': 0


# RoBERTa

## Installs and Imports

### Installs

In [None]:
!pip install transformers
!pip install tensorflow



### Imports

In [None]:
import tensorflow as tf
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

## Model

### Load Tokenizer and Model

In [None]:
# Tokenizer and classifier are both loaded from the same pretrained checkpoint
roberta_checkpoint = 'roberta-base'

tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)

# Sequence classifier with 2 labels: suicide/non-suicide
model = TFRobertaForSequenceClassification.from_pretrained(
    roberta_checkpoint,
    num_labels=2,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFRobertaForSequenceClassification: ['roberta.embeddings.position_ids']
- This IS expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFRobertaForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predicti

### Data Prep

In [None]:
# Work on a 35% random subset of the dataset; the seed keeps the draw repeatable
sample_size = int(len(df) * 0.35)
df_sample = df.sample(n=sample_size, random_state=42)

# Pull the texts and their labels out of the sample as plain Python lists
X, y = (df_sample[column].tolist() for column in ('text', 'class'))

# Tokenize every text in one call: pad within the batch, truncate at
# 256 tokens, and return TensorFlow tensors
encoded_data = tokenizer(
    X,
    padding=True,
    truncation=True,
    max_length=256,
    return_tensors='tf',
)

In [None]:
# Convert the list of sampled labels into a TensorFlow tensor
y = tf.convert_to_tensor(y)

### Split the Data

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification

In [None]:
# Split the tokenized sample into train and test sets.
#
# The attention mask is split together with the input ids and both are passed
# to the model. Splitting only `input_ids` drops the mask, which leaves the
# model unable to tell padding from real tokens in the padded batch.
# train_test_split works on NumPy arrays, so convert the tensors first.
input_ids_np = encoded_data['input_ids'].numpy()
attention_mask_np = encoded_data['attention_mask'].numpy()
y_np = y.numpy() if hasattr(y, "numpy") else y

# Passing all three arrays in one call applies the same row shuffle to each,
# so ids, masks and labels stay aligned.
(
    train_ids, test_ids,
    train_mask, test_mask,
    y_train, y_test,
) = train_test_split(
    input_ids_np, attention_mask_np, y_np, test_size=0.2, random_state=42
)

# Convert back to tensors. The features are dicts keyed by the model's input
# names; model.fit / model.evaluate take them in place of a bare tensor.
X_train_encoded = {
    'input_ids': tf.convert_to_tensor(train_ids),
    'attention_mask': tf.convert_to_tensor(train_mask),
}
X_test_encoded = {
    'input_ids': tf.convert_to_tensor(test_ids),
    'attention_mask': tf.convert_to_tensor(test_mask),
}
y_train = tf.convert_to_tensor(y_train)
y_test = tf.convert_to_tensor(y_test)

### Model Definition

In [None]:
# Loss is computed on logits (from_logits=True); labels are integer class ids
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Adam optimizer with a learning rate of 5e-5
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Attach optimizer, loss and the accuracy metric to the model
model.compile(
    optimizer=optimizer,
    loss=loss_fn,
    metrics=['accuracy'],
)

### Model Train

In [None]:
# Fine-tune for one epoch in batches of 24; the held-out test split doubles as
# the validation set. Raise `epochs` for longer training.
model.fit(
    X_train_encoded,
    y_train,
    batch_size=24,
    epochs=1,
    validation_data=(X_test_encoded, y_test),
)



<tf_keras.src.callbacks.History at 0x78d197d22b50>

### Model Eval

In [None]:
# Compute loss and accuracy on the held-out test split and report both
loss, accuracy = model.evaluate(X_test_encoded, y_test)
print(f"Loss: {loss}", f"Accuracy: {accuracy}", sep="\n")

Loss: 0.03282339125871658
Accuracy: 0.9892889857292175


### Testing

In [None]:
# Classify one new sentence with the fine-tuned RoBERTa model
new_text = ["I am feeling okay."]

# Tokenize with the same tokenizer used for training
new_text_encoded = tokenizer(
    new_text,
    padding=True,
    truncation=True,
    max_length=128,
    return_tensors='tf',
)

# The model returns logits; the predicted class is the index of the largest one
prediction = model.predict(new_text_encoded)
class_ids = tf.argmax(prediction.logits, axis=1)
predicted_class = class_ids.numpy()[0]
print(f"Prediction for '{new_text[0]}': {predicted_class}")

Prediction for 'I am feeling okay.': 0


In [None]:
# Export the model weights to Google Drive.
# NOTE(review): no visible cell mounts Drive; presumably it was mounted by hand.
# Confirm /content/drive exists before running this cell.
# NOTE(review): the target path has no file extension. Confirm whether
# save_weights writes a directory here or checkpoint files that merely share
# this prefix; the zip cell that follows expects a directory.

model.save_weights('/content/drive/MyDrive/ColabNoteBooks/Capstone/roberta_weights')

In [None]:
# export the model weights in a zip file

import zipfile
import os

def zip_directory(folder_path, zip_path):
    """Write every file under ``folder_path`` into a deflate-compressed zip.

    Entries are stored relative to the parent of ``folder_path``, so each name
    inside the archive starts with the folder's own name.

    Args:
        folder_path: Directory whose files are archived recursively.
        zip_path: Destination path of the zip file; overwritten if it exists.

    Raises:
        FileNotFoundError: If ``folder_path`` is not an existing directory.
            os.walk() yields nothing for a missing path, which would otherwise
            produce an empty archive without any error.
    """
    if not os.path.isdir(folder_path):
        raise FileNotFoundError(
            f"Cannot zip '{folder_path}': not an existing directory"
        )

    # Archive names are computed relative to the folder's parent directory
    parent_dir = os.path.join(folder_path, '..')
    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, parent_dir))

# Archive the exported weights into a zip file in the same Drive folder.
# NOTE(review): this assumes a directory named `roberta_weights` exists at this
# location; verify what the save_weights cell actually wrote there.
zip_directory('/content/drive/MyDrive/ColabNoteBooks/Capstone/roberta_weights', '/content/drive/MyDrive/ColabNoteBooks/Capstone/roberta_weights.zip')