<a href="https://colab.research.google.com/github/Muyiiwaa/Ayeolad/blob/master/Natural_Language_Processing_(Text_Classification).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import kagglehub
import torch
from torch import nn, optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.metrics import f1_score, precision_score, recall_score
import wandb
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
# Fetch the latest version of the Kaggle dataset and remember where it was stored.
path = kagglehub.dataset_download(
    "suchintikasarkar/sentiment-analysis-for-mental-health"
)

print(f"Path to dataset files: {path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/suchintikasarkar/sentiment-analysis-for-mental-health?dataset_version_number=1...


100%|██████████| 11.1M/11.1M [00:01<00:00, 7.17MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1


In [5]:
# os.listdir returns entries in arbitrary order, so select the CSV explicitly
# (sorted for determinism) instead of trusting whichever entry comes first.
csv_names = sorted(name for name in os.listdir(path) if name.lower().endswith('.csv'))
if not csv_names:
  raise FileNotFoundError(f"No CSV file found in dataset directory: {path}")
data_url = os.path.join(path, csv_names[0])
data_url

'/root/.cache/kagglehub/datasets/suchintikasarkar/sentiment-analysis-for-mental-health/versions/1/Combined Data.csv'

In [6]:
# Read the downloaded CSV into a DataFrame and preview its first five rows.
data = pd.read_csv(filepath_or_buffer=data_url)
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,statement,status
0,0,oh my gosh,Anxiety
1,1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,3,I've shifted my focus to something else but I'...,Anxiety
4,4,"I'm restless and restless, it's been a month n...",Anxiety


In [8]:
# Count the distinct target classes (the recorded output of this cell is the scalar 7,
# which is what nunique() returns; unique() would display the array of class names).
data['status'].nunique()

7

In [9]:
# Keep only the text and target columns. The explicit .copy() makes `data` an
# independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
data = data[['statement', 'status']].copy()
data.head()

Unnamed: 0,statement,status
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [10]:
# Rename the columns to 'text' / 'label'. Reassigning the result (instead of
# inplace=True) avoids mutating a frame that was derived from a column slice.
data = data.rename(columns = {
    'statement': 'text',
    'status': 'label'
})

data.head()

Unnamed: 0,text,label
0,oh my gosh,Anxiety
1,"trouble sleeping, confused mind, restless hear...",Anxiety
2,"All wrong, back off dear, forward doubt. Stay ...",Anxiety
3,I've shifted my focus to something else but I'...,Anxiety
4,"I'm restless and restless, it's been a month n...",Anxiety


In [12]:
# Convert the string class names into integer ids; the fitted encoder keeps the
# id -> name mapping in `encoder.classes_`.
encoder = LabelEncoder()

# assign() returns a new frame, so this works without SettingWithCopyWarning even
# when `data` was produced by a column slice.
data = data.assign(label = encoder.fit_transform(data['label']))
data.head()

Unnamed: 0,text,label
0,oh my gosh,0
1,"trouble sleeping, confused mind, restless hear...",0
2,"All wrong, back off dear, forward doubt. Stay ...",0
3,I've shifted my focus to something else but I'...,0
4,"I'm restless and restless, it's been a month n...",0


In [13]:
# Distinct encoded class ids, in order of first appearance.
data.loc[:, 'label'].unique()

array([0, 3, 2, 6, 5, 1, 4])

In [15]:
# Checkpoint name shared by the tokenizer and the classification model.
model_uri = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path = model_uri
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [28]:
# Hold out 20% of the rows for testing; stratifying on the label keeps the
# class proportions the same in both splits, and the fixed seed makes it repeatable.
train_df, test_df = train_test_split(
    data,
    test_size = 0.2,
    random_state = 23,
    stratify = data['label'],
)

In [41]:
# setup the data object


class MentalData(Dataset):
  """Map-style dataset that tokenizes one text/label row per lookup.

  Args:
    dataframe: Frame with a 'text' column and a 'label' column of integer
      class ids.
    tokenizer: Callable tokenizer that accepts ``add_special_tokens``,
      ``return_tensors``, ``padding``, ``truncation`` and ``max_length`` and
      returns a mapping containing 'input_ids' and 'attention_mask' tensors.
    max_length: Length every example is padded or truncated to.
  """

  def __init__(self, dataframe:pd.DataFrame, tokenizer, max_length=128):
    # Missing values (NaN / None) are not strings and would make the tokenizer
    # fail at lookup time, so they are mapped to the empty string up front.
    self.texts = ['' if pd.isna(text) else str(text) for text in dataframe['text']]
    self.labels = dataframe['label'].to_list()
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    """Return the number of examples."""
    return len(self.texts)

  def __getitem__(self, index):
    """Tokenize the example at ``index``.

    Returns:
      dict with 'input_ids' and 'attention_mask' (flattened to 1-D) and
      'label' (0-dim ``torch.long`` tensor).
    """
    # Calling the tokenizer directly replaces the deprecated encode_plus().
    encoding = self.tokenizer(
        self.texts[index],
        add_special_tokens = True,
        return_tensors = 'pt',
        padding = 'max_length',
        truncation = True,
        max_length = self.max_length
    )

    # flatten() removes the batch axis the tokenizer adds for return_tensors='pt',
    # so the DataLoader can stack examples into (batch, max_length).
    return {
        'input_ids': encoding['input_ids'].flatten(),
        'attention_mask': encoding['attention_mask'].flatten(),
        'label': torch.tensor(self.labels[index], dtype = torch.long)
    }

In [42]:
# Wrap each split in the tokenizing dataset (default max_length).
train_data = MentalData(dataframe = train_df, tokenizer = tokenizer)
test_data = MentalData(dataframe = test_df, tokenizer = tokenizer)

In [43]:
# Spot-check one encoded training example (input_ids, attention_mask, label).
train_data[17]

{'input_ids': tensor([  101,  2893, 15035,  1077,   102,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [26]:
# Look at the raw text of the first row.
data['text'].iloc[0]

'oh my gosh'

In [38]:
# Size the classification head from the fitted label encoder rather than a
# literal 7, so it always matches the number of classes found in the data.
num_labels = len(encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_uri, num_labels = num_labels)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
optimizer = optim.AdamW(params = model.parameters(), lr = 1e-4, weight_decay = 0.01)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Reference playlist: https://www.youtube.com/playlist?list=PLf43guw17cen5G8c3-Emt5v_GGck4LcIn