In [1]:
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
from sklearn.metrics import f1_score

In [2]:
train_df = pd.read_json('train.jsonl', lines=True)
X_train = train_df['string']
y_train = train_df['label']

test_df = pd.read_json('test.jsonl', lines=True)
X_test = test_df['string']
y_test = test_df['label']

In [3]:
def cleaning(text):
    stop_words = stopwords.words('english')
    text = text.lower()
    text = ' '.join(x for x in text.split() if x not in stop_words)
    return text

In [4]:
def lemmatize(text):
    lemmatizer = WordNetLemmatizer()
    words = []
    for x in text.split():
        x = lemmatizer.lemmatize(x)
        words.append(x)
    text = ' '.join(words)
    return text

In [5]:
def preprocessing(text):
    # Tokenization
    tokenizer = RegexpTokenizer(r'[a-zA-Z0-9]+')
    text = cleaning(text)
    text = lemmatize(text)
    text = ' '.join(tokenizer.tokenize(text))
    return text

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [7]:
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer
from transformers import BertForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AdamW
from tqdm import tqdm
new_train = train_df[['string','label']]
new_train['string'] = new_train['string'].apply(lambda x: preprocessing(x))
new_test = test_df[['string','label']]
new_test['string'] = new_test['string'].apply(lambda x: preprocessing(x))

label_encoder = LabelEncoder()
new_train['label'] = label_encoder.fit_transform(new_train['label'])
new_test['label'] = label_encoder.transform(new_test['label'])
new_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['string'] = new_train['string'].apply(lambda x: preprocessing(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_test['string'] = new_test['string'].apply(lambda x: preprocessing(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_train['label'] = label_encoder.fit_transform(new_tr

Unnamed: 0,string,label
0,however frataxin interacts fe s cluster biosyn...,0
1,study hickey et al 2012 spike sampled field po...,0
2,drug also reduces catecholamine secretion ther...,0
3,clustering lowly aggressive close kin king 198...,0
4,ophthalmic symptom rare manifestation intracra...,0
...,...,...
8238,importantly result pascalis et al 2005 also re...,0
8239,suggested nguena et al need educate health pro...,0
8240,skeletal muscle also primary site disease mous...,0
8241,activation transcription factor role several t...,1


In [8]:
class Dataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        string = str(self.data.string[index])
        label = int(self.data.label[index])
        encoding = self.tokenizer.encode_plus(
            string,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'string': string,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }


In [9]:
MAX_LEN = 128
BATCH_SIZE = 16
EPOCHS = 2
LEARNING_RATE = 2e-5

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)  # 3个类别


train_dataset = Dataset(new_train, tokenizer, MAX_LEN)
test_dataset = Dataset(new_test, tokenizer, MAX_LEN)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
criterion = torch.nn.CrossEntropyLoss()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
for epoch in range(EPOCHS):
    model.train()
    train_losses = []

    for batch in tqdm(train_loader, total=len(train_loader), desc=f'Epoch {epoch + 1}/{EPOCHS}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_losses.append(loss.item())

        loss.backward()
        optimizer.step()

    print(f'Training loss: {sum(train_losses)/len(train_losses)}')
    model.eval()
    test_losses = []
    test_correct = 0
    test_total = 0
    test_f1_scores = []

    with torch.no_grad():
        for batch in tqdm(test_loader, total=len(test_loader), desc=f'Test'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            test_losses.append(loss.item())
            _, predicted = torch.max(outputs.logits, 1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

            f1_batch = f1_score(labels.cpu(), predicted.cpu(), average='macro')
            test_f1_scores.append(f1_batch)
    test_accuracy = test_correct / test_total
    test_f1_score = sum(test_f1_scores) / len(test_f1_scores)

    print(f'Test loss: {sum(test_losses)/len(test_losses)}')
    print(f'Test accuracy: {test_accuracy}')
    print(f'Test F1-score: {test_f1_score}')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/2: 100%|██████████| 516/516 [02:52<00:00,  3.00it/s]


Training loss: 0.5218103139899498


Test: 100%|██████████| 117/117 [00:14<00:00,  7.86it/s]


Test loss: 0.5014412614524874
Test accuracy: 0.8049435787211177
Test F1-score: 0.7570290950389489


Epoch 2/2: 100%|██████████| 516/516 [02:51<00:00,  3.02it/s]


Training loss: 0.3341859229515458


Test: 100%|██████████| 117/117 [00:14<00:00,  7.94it/s]

Test loss: 0.4581925071712233
Test accuracy: 0.8414830736163353
Test F1-score: 0.7967176947154733



