In [None]:
!pip install transformers



### init

In [None]:
import datetime
import random
import re

import numpy as np
import pandas as pd
import torch
from sklearn.metrics import f1_score,accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from tqdm.notebook import tqdm
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification
from transformers import BertTokenizer

In [None]:
# Dataset snapshot from 19 October (dataset_19okt_1316_fix), shared through Google Drive.
url = 'https://drive.google.com/file/d/10lxJKCtUepwxgD7xLEJnCjIRZtKYcCTw/view?usp=sharing'
# The Drive file id is the second-to-last segment of the share link; it is
# appended to the uc?export=download URL that pandas reads the CSV from.
drive_file_id = url.split('/')[-2]
raw_df = pd.read_csv('https://drive.google.com/uc?export=download&id=' + drive_file_id)

In [None]:
# Keep only the tweet text and its multi-class topic annotation, under a shorter column name.
df_unf = raw_df[['tweet','Topic Labeling (Multi Class)']].rename({'Topic Labeling (Multi Class)': 'topic'}, axis=1)
# Drop rows annotated as DELETE. The explicit .copy() makes `df` an independent
# frame, so the column assignments below (and in later cells) do not write into
# a slice of df_unf (SettingWithCopyWarning).
df = df_unf[df_unf['topic'] != 'DELETE'].copy()
# Topic names are compared case-insensitively from here on.
df["topic"] = df["topic"].str.lower()
# Rows with a missing tweet or topic cannot be used; reassign instead of
# mutating in place so the cell gives the same result when re-run.
df = df.dropna()

#### clean

In [None]:
def clean_text(tweet, stop_words=None):
    """Reduce a raw tweet to a space-separated string of lowercase word tokens.

    URLs and @mentions are removed, the ``#`` is dropped from hashtags, then
    punctuation, digits and hyphens are stripped. Within each remaining word,
    runs of a repeated character are shortened to two. A word is kept only if
    it consists of at least two ASCII letters and is not a stopword.

    Args:
        tweet: Raw tweet text (``str``).
        stop_words: Optional iterable of words to drop. When omitted, the
            module-level ``stops`` collection is used, so existing
            ``clean_text(x)`` calls keep working.

    Returns:
        The cleaned tweet as a single string; empty if no word survives.
    """
    if stop_words is None:
        # Falls back to the notebook-level stopword list (must be defined before calling).
        stop_words = stops
    stop_words = frozenset(stop_words)

    # Raw strings throughout: '\.' and '\s' in plain literals are invalid escapes.
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', '', tweet)  # URLs
    tweet = re.sub(r'@[^\s]+', '', tweet)                            # @mentions
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)                       # "#tag" -> "tag"
    tweet = re.sub(r'[^\w\s]', ' ', tweet)                           # punctuation -> space
    tweet = re.sub(r'[\d-]', '', tweet)                              # digits and hyphens
    tweet = re.sub(r'[\s]+', ' ', tweet)                             # collapse whitespace
    tweet = tweet.strip('\'"')

    # Compiled once per call instead of once per token.
    repeated_char = re.compile(r"(.)\1{1,}", re.DOTALL)
    alpha_word = re.compile(r"^[a-zA-Z][a-zA-Z][a-zA-Z]*$")

    tokens = []
    for ww in tweet.split():
        for w in re.split(r'[-/\s]\s*', ww):
            w = repeated_char.sub(r"\1\1", w)  # "haloooo" -> "haloo"
            w = w.strip('\'"?,.')
            if alpha_word.search(w) is None:
                continue
            lowered = w.lower()
            # Compare the lowercased form too: the token is emitted lowercased,
            # so "Yang" must be filtered by the stopword "yang".
            if w in stop_words or lowered in stop_words:
                continue
            tokens.append(lowered)

    return " ".join(tokens)

In [None]:
# nltk.download('stopwords')
# stops = list(stopwords.words('indonesian'))
# df['tweet'] = df['tweet'].map(lambda x: clean_text(x))

### encode

In [None]:
# Give every distinct topic a contiguous integer id, in the order unique() returns them.
labels = df.topic.unique()
label_dict = {possible_label: index for index, possible_label in enumerate(labels)}
print(label_dict)
# Series.map does a plain per-row dictionary lookup and yields an integer
# column directly. Series.replace with a str -> int mapping depends on pandas'
# deprecated silent downcasting and may leave an object-dtype column, which
# torch.tensor() cannot consume.
df['label'] = df.topic.map(label_dict)

{'mesin & pelayanan cabang': 0, '-': 1, 'pelayanan online & layanan digital': 2, 'tapcash': 3, 'kartu & rekening': 4, 'reputasi': 5, 'fraud': 6, 'call center & kredit': 7, 'campaign': 8, 'others': 9}


### split

In [None]:
# Hold out 20% of the rows for validation, stratified on the encoded label.
# The index labels (not the rows themselves) are what gets split.
X_train, X_val, y_train, y_val = train_test_split(
    df.index.values,
    df.topic.values,
    test_size=0.2,
    random_state=133,
    stratify=df.label.values,
)

# Record split membership on the frame itself, addressed by index label.
df['data_type'] = 'not_set'
df.loc[X_train, 'data_type'] = 'train'
df.loc[X_val, 'data_type'] = 'val'

# Row counts per (topic, label, split), as a sanity check of the stratification.
df.groupby(['topic', 'label', 'data_type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,tweet
topic,label,data_type,Unnamed: 3_level_1
-,1,train,364
-,1,val,91
call center & kredit,7,train,79
call center & kredit,7,val,20
campaign,8,train,40
campaign,8,val,10
fraud,6,train,54
fraud,6,val,14
kartu & rekening,4,train,107
kartu & rekening,4,val,27


### tokenization

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",do_lower_case=True)

MAX_SEQ_LEN = 256  # every tweet is padded / truncated to this many tokens


def _encode_tweets(texts):
    """Tokenize a sequence of tweet strings into fixed-length PyTorch tensors.

    Padding and truncation are requested explicitly: `pad_to_max_length` is
    deprecated in favour of `padding='max_length'`, and passing `max_length`
    without `truncation=True` only truncates through a warned-about fallback.
    """
    return tokenizer.batch_encode_plus(
        list(texts),
        add_special_tokens=True,
        return_attention_mask=True,
        padding='max_length',
        truncation=True,
        max_length=MAX_SEQ_LEN,
        return_tensors='pt',
    )


# Train and validation rows go through the same encoding settings.
encoded_data_train = _encode_tweets(df[df.data_type=='train'].tweet.values)
encoded_data_val = _encode_tweets(df[df.data_type=='val'].tweet.values)

input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(df[df.data_type=='train'].label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(df[df.data_type=='val'].label.values)

# Each dataset item is an (input_ids, attention_mask, label) triple.
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

batch_size = 4
# Training batches are shuffled; validation keeps the frame's row order so
# predictions can be lined up with df[df.data_type=='val'] afterwards.
dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=batch_size)
dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=batch_size)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### model definition

In [None]:
# Pretrained BERT encoder with a classification head sized to the number of topics.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_dict),
    output_attentions=False,
    output_hidden_states=False,
)

optimizer = AdamW(model.parameters(), lr=1e-5, eps=1e-8)

epochs = 25  # the number of training EPOCHS is set here
# The training loop steps the scheduler once per batch, so the linear decay
# spans (batches per epoch) * (epochs) steps, with no warm-up.
total_training_steps = len(dataloader_train) * epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_training_steps,
)

# Use the GPU when one is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
print(device)

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

cuda


In [None]:
# Seed Python's, NumPy's and PyTorch's (CPU and all CUDA devices) generators
# with the same value.
# NOTE(review): when cells run top to bottom this executes after the model and
# the DataLoaders have been created, so randomness consumed there is not
# covered by these seeds - consider moving this cell above them.
seed_val = 17
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

def metrics(preds, labels):
    """Return ``(weighted F1, accuracy)`` for a set of class scores.

    Args:
        preds: 2-D array of per-class scores; the predicted class of each row
            is its argmax along axis 1.
        labels: array of true class ids; flattened before scoring.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    true_classes = labels.flatten()
    weighted_f1 = f1_score(true_classes, predicted_classes, average='weighted')
    accuracy = accuracy_score(true_classes, predicted_classes)
    return weighted_f1, accuracy

def accuracy_per_class(preds, labels):
    """Print, for each class present in ``labels``, correct predictions over sample count.

    Args:
        preds: 2-D array of per-class scores (argmax along axis 1 is the prediction).
        labels: array of true class ids.

    Class ids are translated back to topic names through the module-level
    ``label_dict``. Nothing is returned. A later cell defines a function with
    this same name, which replaces this one once that cell has run.
    """
    id_to_topic = {class_id: topic for topic, class_id in label_dict.items()}

    predicted = np.argmax(preds, axis=1).flatten()
    actual = labels.flatten()

    for class_id in np.unique(actual):
        is_class = actual == class_id
        class_preds = predicted[is_class]
        n_samples = len(actual[is_class])
        n_correct = len(class_preds[class_preds == class_id])
        print(f'Class: {id_to_topic[class_id]}')
        print(f'Accuracy: {n_correct}/{n_samples}\n')

def evaluate(dataloader_val, model_to_pred):
    """Run ``model_to_pred`` over ``dataloader_val`` without gradient tracking.

    Args:
        dataloader_val: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches that also supports ``len()``.
        model_to_pred: Model to evaluate. It is called with ``input_ids``,
            ``attention_mask`` and ``labels`` keyword arguments, and its output
            is indexed as ``[0]`` = loss and ``[1]`` = logits.

    Returns:
        ``(loss_val_avg, predictions, true_vals)``: the mean of the per-batch
        losses, the logits of every sample concatenated along axis 0 (NumPy),
        and the matching true labels (NumPy).
    """
    # Evaluate the model that was passed in. Previously the notebook-global
    # `model` was used here, so the argument was silently ignored.
    model_to_pred.eval()

    # Send batches to the device holding the evaluated model's weights; fall
    # back to the notebook-wide `device` for a model without parameters.
    first_param = next(model_to_pred.parameters(), None)
    eval_device = first_param.device if first_param is not None else device

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:
        batch = tuple(b.to(eval_device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model_to_pred(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        # Collect results on the CPU as NumPy arrays for the metric functions.
        predictions.append(logits.detach().cpu().numpy())
        true_vals.append(inputs['labels'].cpu().numpy())

    loss_val_avg = loss_val_total/len(dataloader_val)

    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)

    return loss_val_avg, predictions, true_vals

In [None]:
# --- Fine-tuning loop with early stopping on the validation loss ---
patience_limit = 3
# Consecutive epochs without a new best validation loss. It must exist before
# the loop: it was previously first assigned inside the `else` branch below.
patience = 0
# Start from +infinity so the first epoch always sets the baseline.
# (`9^133` is bitwise XOR in Python and evaluates to 140, not a huge number.)
best_val_loss = float('inf')
for epoch in tqdm(range(1, epochs+1)):
    model.train()

    loss_train_total = 0
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)
    for batch in progress_bar:
        model.zero_grad()
        batch = tuple(b.to(device) for b in batch)
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }
        outputs = model(**inputs)

        loss = outputs[0]
        batch_loss = loss.item()
        loss_train_total += batch_loss
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimizer step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()
        # Show the batch loss as-is. `len(batch)` is the number of tensors in
        # the tuple (3), not the number of samples, so dividing by it only
        # scaled the displayed value by a constant.
        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(batch_loss)})

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation,model)
    val_f1,val_acc = metrics(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}\n')
    tqdm.write(f'Validation F1 score: {val_f1}')
    tqdm.write(f'Validation accuracy: {val_acc}')

    # Early stopping: stop once the validation loss has failed to improve for
    # more than `patience_limit` consecutive epochs.
    if val_loss >= best_val_loss:
        patience += 1
        if patience > patience_limit:
            print(f'early stopping at epoch {epoch}!')
            break
    else:
        best_val_loss = val_loss
        patience = 0

# NOTE(review): no checkpoint is kept inside the loop, so `model` ends up with
# the weights of the last epoch that ran, not those of the best-validation epoch.
# save the model here

  0%|          | 0/25 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 1
Training loss: 1.8203719371625227
Validation loss: 1.5949979214505716

Validation F1 score: 0.36456799779168203
Validation accuracy: 0.4659090909090909


Epoch 2:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 2
Training loss: 1.387047745655698
Validation loss: 1.3359966339035467

Validation F1 score: 0.5487684806947715
Validation accuracy: 0.5871212121212122


Epoch 3:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 3
Training loss: 1.0901316006883923
Validation loss: 1.119155154140158

Validation F1 score: 0.6431780048864928
Validation accuracy: 0.6742424242424242


Epoch 4:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 4
Training loss: 0.82466967410971
Validation loss: 0.9129406604915857

Validation F1 score: 0.7170243965052588
Validation accuracy: 0.7234848484848485


Epoch 5:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 5
Training loss: 0.5582082238997343
Validation loss: 0.8332755052569237

Validation F1 score: 0.7725245054195778
Validation accuracy: 0.7689393939393939


Epoch 6:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 6
Training loss: 0.3774090903446418
Validation loss: 0.8713640243196014

Validation F1 score: 0.7900310148358299
Validation accuracy: 0.7878787878787878


Epoch 7:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 7
Training loss: 0.2692992456129862
Validation loss: 0.882396786251444

Validation F1 score: 0.8150377926629877
Validation accuracy: 0.8143939393939394


Epoch 8:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 8
Training loss: 0.17881923267695726
Validation loss: 1.0406730975497118

Validation F1 score: 0.7895263027021989
Validation accuracy: 0.7878787878787878


Epoch 9:   0%|          | 0/263 [00:00<?, ?it/s]


Epoch 9
Training loss: 0.13915345208106752
Validation loss: 1.097537639240424

Validation F1 score: 0.7895237414656754
Validation accuracy: 0.7916666666666666
early stopping at epoch 9!


In [None]:
import os

# Directory that receives the fine-tuned model (plain string: there is nothing to interpolate).
path = 'bert_multiclass-ebrtcase_nonstopwords'
# exist_ok=True lets this cell be re-run without failing because the directory
# was already created by a previous run.
os.makedirs(path, exist_ok=True)
model.save_pretrained(path)

In [None]:
# Reverse lookup: integer class id -> topic name.
label_dict_inverse = {v: k for k, v in label_dict.items()}
def accuracy_per_class(preds, labels):
    """Print the recall of every class present in ``labels`` and the overall accuracy.

    Args:
        preds: 2-D array of per-class scores (argmax along axis 1 is the prediction).
        labels: array of true class ids.

    Class ids are rendered as topic names through the module-level
    ``label_dict_inverse``. Nothing is returned.
    """
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    total_correct, total_samples = 0, 0
    for label in np.unique(labels_flat):
        # Predictions made for the samples whose true class is `label`.
        y_preds = preds_flat[labels_flat==label]
        n_samples = len(y_preds)
        n_correct = len(y_preds[y_preds==label])
        print(f'\nTrue topic: *{label_dict_inverse[label]}*')
        print(f'Recall = {n_correct}/{n_samples}\n       = {n_correct/n_samples*100}%')
        total_correct += n_correct
        total_samples += n_samples
    if total_samples == 0:
        # Nothing to score; avoid dividing by zero below.
        print('\nOverall accuracy: n/a (no samples)')
        return
    print(f'\nOverall accuracy {total_correct/total_samples*100}%')

In [None]:
# Predicted class id per validation sample, then its topic name.
predicted_ids = np.argmax(predictions, axis=1)
preds_flat = predicted_ids.flatten()
pred_topics = [label_dict_inverse[class_id] for class_id in preds_flat]

#### check loaded model

In [None]:
# Reload the checkpoint written by model.save_pretrained(path). The name that
# was hard-coded here did not match that directory, so from_pretrained looked
# it up on huggingface.co and failed with a 404 / OSError.
model_path = path
loaded_model = BertForSequenceClassification.from_pretrained(model_path).to(device)

404 Client Error: Not Found for url: https://huggingface.co/bert_multiclass_indobert_nonstopwords/resolve/main/config.json


OSError: ignored

In [None]:
# Score the validation set and derive weighted F1 / accuracy from the logits.
val_loss, predictions, true_vals = evaluate(
    dataloader_val=dataloader_validation,
    model_to_pred=model,
)
val_f1, val_acc = metrics(preds=predictions, labels=true_vals)

In [None]:
# Report the two validation metrics, one per line.
for report_line in (f'Validation F1 score: {val_f1}', f'Validation accuracy: {val_acc}'):
    print(report_line)

#### acc per class

In [None]:
# Only the logits and true labels are needed here; the loss is discarded.
_, predictions, true_vals = evaluate(
    dataloader_val=dataloader_validation,
    model_to_pred=model,
)

In [None]:
# Per-class breakdown of the validation predictions.
accuracy_per_class(preds=predictions, labels=true_vals)

#### to excel

In [None]:
# Predicted class id per validation sample, then its topic name.
predicted_ids = np.argmax(predictions, axis=1)
preds_flat = predicted_ids.flatten()
pred_topics = [label_dict_inverse[class_id] for class_id in preds_flat]

In [None]:
# Validation rows with their true topic. A single .loc selection plus .copy()
# yields an independent frame, so adding the prediction column below does not
# write into a slice of df (SettingWithCopyWarning).
eval_to_excel = df.loc[df.data_type=='val', ['tweet','topic']].copy()
eval_to_excel.columns = ['tweet','true topic']
# pred_topics is in the same row order: the validation tensors were built from
# this same filter and the validation DataLoader uses a SequentialSampler.
eval_to_excel['pred topic'] = pred_topics

In [None]:
# Group the rows by their true topic.
eval_to_excel_sorted = eval_to_excel.sort_values(by='true topic')

In [None]:
# The output file is named after the model directory in `path`.
evaluation_csv = f'evaluate_{path}.csv'
eval_to_excel_sorted.to_csv(evaluation_csv)

#### gtw apan

In [None]:
# Reverse lookup (class id -> topic name), displayed for inspection.
label_dict_inverse = {class_id: topic for topic, class_id in label_dict.items()}
label_dict_inverse

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

#### load model

load model to predict

In [None]:
# Colab-only: mount Google Drive at /content/drive.
from google.colab import drive

drive.mount('/content/drive')


In [None]:
# List the contents of the current working directory.
!ls


In [None]:
# url = 'https://drive.google.com/file/d/1Jwc7vmEG61RouFGVhEJQaa6zwOsYn7Wi/view?usp=sharing'
# NOTE(review): this rebinds `path`, which earlier cells use as the save directory.
path = 'models/finetuned_BERT_epoch_8_lama.model'
# SECURITY: torch.load unpickles the file and can execute arbitrary code; only
# load checkpoints from a trusted source.
# map_location places the tensors on the device in use, so a checkpoint saved
# from a GPU also loads on a CPU-only runtime.
# NOTE(review): recent PyTorch releases default to weights_only=True, which
# rejects fully pickled models - confirm against the installed version.
loaded_model = torch.load(path, map_location=device)

In [None]:
# Validation logits and labels for the model loaded from disk; the loss is discarded.
_, predictions, true_vals = evaluate(
    dataloader_val=dataloader_validation,
    model_to_pred=loaded_model,
)


In [None]:
# Per-class breakdown of the predictions collected in the previous cell.
accuracy_per_class(preds=predictions, labels=true_vals)

In [None]:
# state = {
#     'epoch': epoch,
#     'state_dict': model.state_dict(),a
#     'optimizer': optimizer.state_dict(),
# }

# torch.save(state, f'saved_model_to_resume_{tm}.model')