In [1]:
!pip install transformers



In [2]:
import re
import nltk
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, roc_curve, auc
from sklearn.preprocessing import StandardScaler
from copy import deepcopy
from transformers import BertTokenizer, BertModel



In [3]:
import numpy as np
import pandas as pd

In [4]:
train_ = pd.read_csv("/kaggle/input/mental-cleaned/final_cleaned_data.csv")

In [5]:
train_.head()

Unnamed: 0,Label,Content,len,n_words,n_sent
0,1,compensate career minimal interaction people l...,142,18,1
1,1,need talk understand autism convinced autistic...,177,26,1
2,1,legal limit low function avarage person high f...,220,32,1
3,1,autism know social situation make easy folk as...,96,15,1
4,1,need parenting advise regard risperidone greet...,296,43,1


In [6]:
train_["Label"].value_counts()

Label
1    59631
0    39368
Name: count, dtype: int64

In [7]:
train_, test = train_test_split(train_, test_size=0.2, random_state=42)
train, val = train_test_split(train_, test_size=0.2, random_state=42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

train_encodings = tokenizer(train['Content'].tolist(), max_length=128, truncation=True, padding=True, return_tensors='pt')
val_encodings = tokenizer(val['Content'].tolist(), max_length=128, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test['Content'].tolist(), max_length=128, truncation=True, padding=True, return_tensors='pt')
    
train_labels = torch.tensor(train["Label"].tolist())
val_labels = torch.tensor(val["Label"].tolist())

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
class Mymodel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=256)
        self.head = torch.nn.Sequential(torch.nn.Linear(256,256),
                                        torch.nn.ReLU(),
                                        torch.nn.Linear(256,2),
                                       )
    def forward(self,x):
        bert_emb = self.bert(input_ids=x["input_ids"],
                             attention_mask=x["attention_mask"],
                            ).logits
        output = self.head(bert_emb)
        return output

In [9]:
model = Mymodel()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

train_dataset = torch.utils.data.TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = torch.utils.data.TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

optimizer = AdamW(model.parameters(), lr=1e-5,weight_decay=5e-5)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
criterium = torch.nn.CrossEntropyLoss()

def train_one_ep(model,dataloader):
    total_loss = 0
    total = 0
    model.train()
    for batch in train_loader:
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels': batch[2].to(device)}
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterium(outputs,inputs['labels'].long())
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * batch[0].shape[0]
        total += batch[0].shape[0]
    return total_loss / total

def eval(model,dataloader):
    model.eval()
    total_loss = 0
    total = 0
    y_true, y_pred = [], []
    for batch in val_loader:
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                  'labels': batch[2].to(device)}
        with torch.no_grad():
            outputs = model(inputs)
        loss = criterium(outputs,inputs['labels'].long())
        total_loss += loss.item() * batch[0].shape[0]
        total += batch[0].shape[0]
        logits = outputs.detach().cpu().numpy()
        predictions = logits[:,1]
        y_true.extend(inputs['labels'].cpu().numpy())
        y_pred.extend(predictions)
    fpr, tpr, thresholds = roc_curve(np.array(y_true), np.array(y_pred))

    return total_loss / total, auc(fpr, tpr)

def predict(model,dataloader):
    model.eval()
    y_pred = []
    for batch in tqdm(test_loader):
        inputs = {'input_ids': batch[0].to(device),
                  'attention_mask': batch[1].to(device),
                 }
        with torch.no_grad():
            outputs = model(inputs)
        logits = outputs.softmax(dim=-1).detach().cpu().numpy()
        predictions = logits[:,1]
        y_pred.extend(predictions)
    return y_pred

In [11]:
best_model = deepcopy(model)
best_auc = 0
best_iter = 0
patience = 2
num_epochs = 5

for epoch in tqdm(range(num_epochs)):
    train_loss = train_one_ep(model,train_loader)
    val_loss,auc_roc = eval(model,val_loader)
    print("Epoch {} : train_loss {:.3f} val_loss {:.3f} val_auc {:.3f}".format(epoch,train_loss,val_loss,auc_roc))

    if auc_roc > best_auc :
        best_model = deepcopy(model)
        best_auc = auc_roc
        best_iter = epoch

    if epoch - best_iter >= patience :
        print("earlystop")
        break

model = best_model

 20%|██        | 1/5 [23:54<1:35:37, 1434.34s/it]

Epoch 0 : train_loss 0.508 val_loss 0.457 val_auc 0.858


 40%|████      | 2/5 [47:55<1:11:55, 1438.37s/it]

Epoch 1 : train_loss 0.433 val_loss 0.443 val_auc 0.865


 60%|██████    | 3/5 [1:11:55<47:58, 1439.31s/it]

Epoch 2 : train_loss 0.386 val_loss 0.455 val_auc 0.865


 60%|██████    | 3/5 [1:35:59<1:03:59, 1919.74s/it]

Epoch 3 : train_loss 0.327 val_loss 0.479 val_auc 0.858
earlystop





In [12]:
path = "/kaggle/working/bert-base-uncased_model.pth"
torch.save(model.state_dict(), path)

In [None]:
y_true, y_pred = [], []
for batch in val_loader:
    inputs = {'input_ids': batch[0].to(device),
              'attention_mask': batch[1].to(device),
              'labels': batch[2].to(device)}
    with torch.no_grad():
        outputs = model(inputs)
    logits = outputs.softmax(dim=-1).detach().cpu().numpy()
    predictions = logits[:,1]
    y_true.extend(inputs['labels'].cpu().numpy())
    y_pred.extend(predictions)

thresholds = np.linspace(0, 1, 100)
f1_scores = []
threshold_values = []

for threshold in thresholds:
    y_pred_thresholded = (y_pred >= threshold).astype(int)
    f1 = f1_score(y_true, y_pred_thresholded)
    f1_scores.append(f1)
    threshold_values.append(threshold)

f1_scores = np.array(f1_scores)
threshold_values = np.array(threshold_values)

window_size = 5
smooth_f1_scores = np.convolve(f1_scores, np.ones(window_size)/window_size, mode='same')

optimal_threshold = thresholds[np.argmax(smooth_f1_scores)]
max_f1_score = np.max(smooth_f1_scores)

plt.figure(figsize=(10, 6))
plt.plot(thresholds, f1_scores, label='F1 Score', color='blue')
plt.plot(thresholds, smooth_f1_scores, label='F1 Score (smooth)', color='green')
plt.scatter(optimal_threshold, max_f1_score, color='red', label='Optimal')
plt.xlabel('Threshold')
plt.ylabel('F1 Score')
plt.legend()
plt.grid(True)
plt.show()

print(f"Seuil Optimal: {optimal_threshold:.2f}")
print(f"Score F1 Maximum: {max_f1_score:.2f}")

In [14]:
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

test_labels = test["Label"]
y_test_pred = predict(model,test_loader)

100%|██████████| 619/619 [02:22<00:00,  4.34it/s]


In [20]:
y_test_pred = np.array(y_test_pred)
threshold = 0.38
y_test_pred = (y_test_pred >= threshold).astype(int)

In [None]:
# Calculate accuracy
accuracy = accuracy_score(test_labels, y_test_pred)

# Calculate precision
precision = precision_score(test_labels, y_test_pred)

# Calculate recall
recall = recall_score(test_labels, y_test_pred)

# Calculate F1 score
f1 = f1_score(test_labels, y_test_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate the confusion matrix
cm = confusion_matrix(test_labels, y_test_pred)

# Plot the confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False,
            xticklabels=[0, 1], yticklabels=[0, 1])
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()

In [23]:
test_data = ["the real reason why you be sad you be attach to people who have be distant with you you be pay attention to people who ignore you you make time for people who be too busy for you you be too care to people who be care less when it come to you let those people go",
        "when your therapist be write down some of the things she be worry about barely eat yup depression and this eat disorder be go to kill me one day and i do not even care i just want to fuck die"
        "my best friend be all the way across the country and she tell me if i kill myself she be not fly home to come to my service fuck me right what be best friends for anyway",
        "all i want be to be happyhow be that so hard do people really have to put me down every single day i be just so do with everyone",
        "i be go to be the quiet girl in school this year the one that sit in the back the one with no friends the one that hide her cut"]

df = pd.DataFrame(test_data, columns=["Content"])

In [25]:
test_encodings = tokenizer(df["Content"].tolist(), truncation=True, padding=True, return_tensors='pt')

test_dataset = torch.utils.data.TensorDataset(test_encodings['input_ids'], test_encodings['attention_mask'])
test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)

In [26]:
df["Score"] = predict(model,test_loader)
df["Result"] = (df["Score"] > 0.5).astype(float)

100%|██████████| 1/1 [00:00<00:00, 28.96it/s]


In [27]:
print(df)

                                             Content     Score  Result
0  the real reason why you be sad you be attach t...  0.849609     1.0
1  when your therapist be write down some of the ...  0.743840     1.0
2  all i want be to be happyhow be that so hard d...  0.931299     1.0
3  i be go to be the quiet girl in school this ye...  0.894988     1.0
