In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '4'
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

import transformers
import evaluate
import torch
import operator
import re
import sys
import collections
import string
import contextlib

import numpy as np
import pandas as pd
import torch.nn as nn

from multiprocessing import cpu_count
from nusacrowd import NusantaraConfigHelper
from datetime import datetime
from tqdm import tqdm
from deep_translator import GoogleTranslator
from huggingface_hub import HfApi, create_repo

from datasets import (
    load_dataset, 
    Dataset,
    DatasetDict
)
from transformers import (
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    BertForQuestionAnswering,
    AutoTokenizer,
    EarlyStoppingCallback,
    AutoModelForQuestionAnswering,
    pipeline
)

In [20]:
MODEL_NAME = 'xlm-roberta-large'
TYPE_QAS = "entailment_only"
TYPE_SMOOTHING = "just_concat_answer_and_question"
MAXIMUM_SEARCH_ITER = 3
VARIATION = 3
THRESHOLD = 0.5
MODEL_SC_NAME = "muhammadravi251001/fine-tuned-NLI-indonli-with-xlm-roberta-large"

USER = "muhammadravi251001"   
MODEL_TG_IND_NAME = "Wikidepia/IndoT5-base-paraphrase"
MODEL_TG_ENG_NAME = "humarin/chatgpt_paraphraser_on_T5_base"
MODEL_NER_NAME = "ageng-anugrah/indobert-large-p2-finetuned-ner"
MAX_LENGTH = 512
STRIDE = 128
LOGGING_STEPS = 50
WARMUP_RATIO = 0.0
WEIGHT_DECAY = 0.0
EVAL_STEPS_RATIO = 0.5
SAMPLE = sys.maxsize

In [84]:
tokenizer_kwargs = {'truncation': True, 'max_length': 512}
MODEL_QA_NAME = "muhammadravi251001/fine-tuned-DatasetQAS-IDK-MRC-with-xlm-roberta-large-without-ITTL-without-freeze-LR-1e-05"

nlp_qa = pipeline(task="question-answering", model=MODEL_QA_NAME, tokenizer=MODEL_QA_NAME, 
                device=torch.cuda.current_device())

In [85]:
context = "Di Hispania, Ataulf dengan tidak hati-hati menerima pengabdiannya kepada salah satu bekas pengikut almarhum Sarus, tidak menyadari bahwa pria tersebut menyimpan sebuah keinginan rahasia untuk membalas kematian pelindung kesayangannya. Jadi, di istana Barcelona, pria yang membuat kekuasaan Ataulf tiba-tiba berakhir dengan membunuhnya saat dia mandi."
question = "Dimana Raja  Ataulf meninggal?"

In [86]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

model = AutoModelForQuestionAnswering.from_pretrained(MODEL_QA_NAME)
tokenizer = AutoTokenizer.from_pretrained(MODEL_QA_NAME)
model.config.top_k = 10

inputs = tokenizer(question, context, return_tensors="pt")
outputs = model(**inputs)

sorted_start_logits = torch.argsort(outputs.start_logits)
sorted_end_logits = torch.argsort(outputs.end_logits)

for i in range(1, 10+1):
    start_index = sorted_start_logits[0, -i]
    end_index = sorted_end_logits[0, -i]
    answer_tokens = inputs["input_ids"][0][start_index : end_index + 1]

    answer = tokenizer.decode(answer_tokens)
    print("Jawaban:", answer)

Jawaban: Hispania, Ataulf dengan tidak hati-hati menerima pengabdiannya kepada salah satu bekas pengikut almarhum Sarus, tidak menyadari bahwa pria tersebut menyimpan sebuah keinginan rahasia untuk membalas kematian pelindung kesayangannya. Jadi, di istana Barcelona
Jawaban: 
Jawaban: Barcelona, pria yang membuat kekuasaan Ataulf tiba-tiba berakhir dengan membunuhnya saat dia mandi
Jawaban: 
Jawaban: di istana
Jawaban: 
Jawaban: 
Jawaban: 
Jawaban: 
Jawaban: 


In [87]:
x = nlp_qa(question=question, context=context, top_k=10)

for i in x:
    print(f"Jawaban: {i['answer']}")

Jawaban: istana Barcelona,
Jawaban: Hispania,
Jawaban: Barcelona,
Jawaban: di istana Barcelona,
Jawaban: Di Hispania,
Jawaban: istana Barcelona,
Jawaban: Jadi, di istana Barcelona,
Jawaban: saat dia mandi.
Jawaban: istana
Jawaban: mandi.


In [88]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer
import torch

# Inisialisasi model dan tokenizer untuk tugas question-answering
tokenizer_qa = AutoTokenizer.from_pretrained(MODEL_QA_NAME)
model_qa = AutoModelForQuestionAnswering.from_pretrained(MODEL_QA_NAME)

# Fungsi kustom yang meniru perilaku pipeline
def custom_qa(text):
    # Tokenisasi input
    inputs = tokenizer_qa(text, return_tensors="pt", truncation=True, max_length=512)

    # Lakukan prediksi menggunakan model question-answering
    qa_outputs = model_qa(**inputs)

    # Ambil hasil prediksi
    start_index = torch.argmax(qa_outputs.start_logits)
    end_index = torch.argmax(qa_outputs.end_logits)

    # Mendekode token untuk mendapatkan jawaban
    answer_tokens = inputs["input_ids"][0][start_index : end_index + 1]
    answer = tokenizer_qa.decode(answer_tokens)

    return answer

# Contoh penggunaan variabel custom_qa
answer = custom_qa(f"{question} {context}")
print("Jawaban:", answer)

Jawaban: Hispania, Ataulf dengan tidak hati-hati menerima pengabdiannya kepada salah satu bekas pengikut almarhum Sarus, tidak menyadari bahwa pria tersebut menyimpan sebuah keinginan rahasia untuk membalas kematian pelindung kesayangannya. Jadi, di istana Barcelona


In [23]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

# Inisialisasi model dan tokenizer untuk tugas NER
tokenizer_ner = AutoTokenizer.from_pretrained(MODEL_NER_NAME)
model_ner = AutoModelForTokenClassification.from_pretrained(MODEL_NER_NAME)

# Fungsi kustom yang menggunakan .predict()
def custom_ner(text, tokenizer, model):
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        ner_outputs = model(**inputs)

    predicted_labels = torch.argmax(ner_outputs.logits, dim=2)[0].tolist()

    entity_results = []
    entity = None
    start = None

    for i, label_id in enumerate(predicted_labels):
        label = model.config.id2label[label_id]
        if label.startswith('B-'):
            if entity:
                entity_results.append({
                    'entity': entity[2:],
                    'score': 1.0,
                    'index': i - 1,
                    'word': tokenizer.decode(inputs['input_ids'][0, start:i].tolist()),
                    'start': start,
                    'end': i - 1
                })
            entity = label
            start = i
        elif label.startswith('I-') and entity:
            continue
        else:
            if entity:
                entity_results.append({
                    'entity': entity[2:],
                    'score': 1.0,
                    'index': i - 1,
                    'word': tokenizer.decode(inputs['input_ids'][0, start:i].tolist()),
                    'start': start,
                    'end': i - 1
                })
                entity = None

    return entity_results

# Contoh penggunaan variabel custom_ner
text = "Rumah saya di Jakarta, nama saya Rafi"
ner_results = custom_ner(text, tokenizer_ner, model_ner)
print(ner_results)


[{'entity': 'PLACE', 'score': 1.0, 'index': 4, 'word': 'jakarta', 'start': 4, 'end': 4}, {'entity': 'PERSON', 'score': 1.0, 'index': 9, 'word': 'rafi', 'start': 8, 'end': 9}]


In [72]:
TASK_CHUNKING_NAME = "token-classification"
MODEL_CHUNKING_NAME = "ageng-anugrah/indobert-large-p2-finetuned-chunking"

tokenizer_chunking = AutoTokenizer.from_pretrained(MODEL_CHUNKING_NAME)
model_chunking = AutoModelForTokenClassification.from_pretrained(MODEL_CHUNKING_NAME)

def predict(model, tokenizer, sentence):
    
    inputs = tokenizer_chunking(sentence,
                        return_offsets_mapping=True,
                        return_tensors="pt",
                        **tokenizer_kwargs)
        
    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]

    # Proses forward
    outputs = model_chunking(ids, attention_mask=mask)
    logits = outputs.logits

    active_logits = logits.view(-1, model_chunking.config.num_labels)
    flattened_predictions = torch.argmax(active_logits, dim=1)

    tokens = tokenizer_chunking.tokenize(sentence)
    token_predictions = [model_chunking.config.id2label[i] for i in flattened_predictions.cpu().numpy()]

    offset_mapping = inputs["offset_mapping"].squeeze().tolist()

    results = []

    entity = None
    start_index = 0

    for i, (token, token_pred, mapping) in enumerate(zip(tokens, token_predictions, offset_mapping)):
        # hanya prediksi pada token pertama yang penting
        if entity:
            results.append({
                'entity': entity,
                'score': 1.0,
                'index': i - 1,
                'word': tokens[start_index:i][0],
                'start': start_index,
                'end': i - 1
            })
        entity = token_pred
        start_index = i

    return results

sentence = "Rumah saya di Jakarta, nama saya Rafi"

predicted_results = predict(model_chunking, tokenizer_chunking, sentence)
print(predicted_results)

[{'entity': 'I-PP', 'score': 1.0, 'index': 0, 'word': 'rumah', 'start': 0, 'end': 0}, {'entity': 'B-NP', 'score': 1.0, 'index': 1, 'word': 'saya', 'start': 1, 'end': 1}, {'entity': 'I-NP', 'score': 1.0, 'index': 2, 'word': 'di', 'start': 2, 'end': 2}, {'entity': 'B-PP', 'score': 1.0, 'index': 3, 'word': 'jakarta', 'start': 3, 'end': 3}, {'entity': 'B-NP', 'score': 1.0, 'index': 4, 'word': ',', 'start': 4, 'end': 4}, {'entity': 'O', 'score': 1.0, 'index': 5, 'word': 'nama', 'start': 5, 'end': 5}, {'entity': 'B-NP', 'score': 1.0, 'index': 6, 'word': 'saya', 'start': 6, 'end': 6}, {'entity': 'I-NP', 'score': 1.0, 'index': 7, 'word': 'raf', 'start': 7, 'end': 7}]


In [61]:
nlp_chunking = pipeline(task="token-classification", model=MODEL_CHUNKING_NAME, tokenizer=MODEL_CHUNKING_NAME)

chunking = nlp_chunking("Rumah saya di Jakarta, nama saya Rafi")
chunking

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-NP',
  'score': 0.99981314,
  'index': 1,
  'word': 'rumah',
  'start': 0,
  'end': 5},
 {'entity': 'I-NP',
  'score': 0.9776594,
  'index': 2,
  'word': 'saya',
  'start': 6,
  'end': 10},
 {'entity': 'B-PP',
  'score': 0.9982919,
  'index': 3,
  'word': 'di',
  'start': 11,
  'end': 13},
 {'entity': 'B-NP',
  'score': 0.9987502,
  'index': 4,
  'word': 'jakarta',
  'start': 14,
  'end': 21},
 {'entity': 'B-NP',
  'score': 0.9893422,
  'index': 6,
  'word': 'nama',
  'start': 23,
  'end': 27},
 {'entity': 'I-NP',
  'score': 0.975521,
  'index': 7,
  'word': 'saya',
  'start': 28,
  'end': 32},
 {'entity': 'I-NP',
  'score': 0.70781696,
  'index': 8,
  'word': 'raf',
  'start': 33,
  'end': 36},
 {'entity': 'I-NP',
  'score': 0.9916013,
  'index': 9,
  'word': '##i',
  'start': 36,
  'end': 37}]

In [71]:
tokenizer_ner = AutoTokenizer.from_pretrained(MODEL_NER_NAME)
model_ner = AutoModelForTokenClassification.from_pretrained(MODEL_NER_NAME)

def predict(model, tokenizer, sentence):
    
    inputs = tokenizer_ner(sentence,
                        return_offsets_mapping=True,
                        return_tensors="pt",
                        **tokenizer_kwargs)
        
    ids = inputs["input_ids"]
    mask = inputs["attention_mask"]

    # Proses forward
    outputs = model_ner(ids, attention_mask=mask)
    logits = outputs.logits

    active_logits = logits.view(-1, model_ner.config.num_labels)
    flattened_predictions = torch.argmax(active_logits, dim=1)

    tokens = tokenizer_ner.tokenize(sentence)
    token_predictions = [model_ner.config.id2label[i] for i in flattened_predictions.cpu().numpy()]

    offset_mapping = inputs["offset_mapping"].squeeze().tolist()

    results = []

    entity = None
    start_index = 0

    for i, (token, token_pred, mapping) in enumerate(zip(tokens, token_predictions, offset_mapping)):
        # hanya prediksi pada token pertama yang penting
        
        print(entity)
        
        if entity and entity != 'O':
            results.append({
                'entity': entity,
                'score': 1.0,
                'index': i - 1,
                'word': tokens[start_index:i][0],
                'start': start_index,
                'end': i - 1
            })
        entity = token_pred
        start_index = i

    return results

sentence = "Rumah saya di Jakarta, nama saya Rafi"

predicted_results = predict(model_ner, tokenizer_ner, sentence)
print(predicted_results)

None
O
O
O
O
B-PLACE
O
O
O
[{'entity': 'B-PLACE', 'score': 1.0, 'index': 4, 'word': ',', 'start': 4, 'end': 4}]


In [35]:
nlp_ner = pipeline(task="ner", model=MODEL_NER_NAME, tokenizer=MODEL_NER_NAME)

ner = nlp_ner("Rumah saya di Jakarta, nama saya Rafi")
ner

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'entity': 'B-PLACE',
  'score': 0.9998228,
  'index': 4,
  'word': 'jakarta',
  'start': 14,
  'end': 21},
 {'entity': 'B-PERSON',
  'score': 0.9993711,
  'index': 8,
  'word': 'raf',
  'start': 33,
  'end': 36},
 {'entity': 'I-PERSON',
  'score': 0.9902058,
  'index': 9,
  'word': '##i',
  'start': 36,
  'end': 37}]

In [16]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline
import torch

tokenizer_kwargs = {'truncation': True, 'max_length': 512}

# Inisialisasi model dan tokenizer untuk tugas text-classification
tokenizer_sc = AutoTokenizer.from_pretrained(MODEL_SC_NAME)
model_sc = AutoModelForSequenceClassification.from_pretrained(MODEL_SC_NAME)

# Fungsi kustom yang meniru perilaku pipeline
def custom_text_classification(text_dict):
    
    inputs = tokenizer_sc(text_dict['text'], text_dict['text_pair'], return_tensors="pt", truncation=True, max_length=512)
    outputs = model_sc(**inputs)

    label_id = torch.argmax(outputs.logits).item()
    label = model_sc.config.id2label[label_id]
    score = outputs.logits.softmax(dim=-1)[0][label_id].item()

    return {'label': label, 'score': score}

# Contoh penggunaan variabel custom_text_classification
context_decoded = "Bambang Pamungkas adalah pemain sepak bola asal Bandung"
pred_hypothesis = "Bambang Pamungkas asal Jakarta"

# Memanggil fungsi custom_text_classification dengan model dan tokenizer yang sesuai
predicted_result = custom_text_classification({'text': context_decoded, 'text_pair': pred_hypothesis})

print(predicted_result)

tensor(2)
{'label': 'contradiction', 'score': 0.9868278503417969}


In [102]:
def custom_qa(question, context):
        
    inputs = tokenizer_qa(question, context, 
                          return_tensors="pt",
                          **tokenizer_kwargs)

    outputs = model_qa(**inputs)

    sorted_start_logits = torch.argsort(outputs.start_logits)
    sorted_end_logits = torch.argsort(outputs.end_logits)

    answer_array = []
    for i in range(1, (MAXIMUM_SEARCH_ITER + 1)):

        start_index = sorted_start_logits[0, -i]
        end_index = sorted_end_logits[0, -i]
        answer_tokens = inputs["input_ids"][0][start_index : end_index + 1]

        answer = tokenizer_qa.decode(answer_tokens)
        answer_array.append({'answer': answer})

    return answer_array

custom_qa(question, context)

[{'answer': 'Ir. Basuki Tjahaja Purnama'}, {'answer': ''}, {'answer': ''}]

In [31]:
x = nlp_qa(question=question, context=context, top_k=3)

unique_start_end = set()
unique_answers = []

for answer in answers:
    
    start_end_pair = (answer['start'], answer['end'])

    if start_end_pair not in unique_start_end:
        unique_start_end.add(start_end_pair)
        unique_answers.append(answer)

print(unique_answers)

[{'score': 0.7026851773262024, 'start': 980, 'end': 988, 'answer': '28997m2,'}, {'score': 0.01276418287307024, 'start': 1103, 'end': 1115, 'answer': '29095m2.[16]'}]


In [30]:
answers = [{'score': 0.7026851773262024, 'start': 980, 'end': 988, 'answer': '28997m2,'},
 {'score': 0.01276418287307024, 'start': 1103, 'end': 1115, 'answer': '29095m2.[16]'},
 {'score': 0.0036276059690862894, 'start': 980, 'end': 988, 'answer': '28997m2,'}]

# Buat set kosong untuk menyimpan pasangan start-end unik
unique_start_end = set()

# Buat daftar baru untuk jawaban unik
unique_answers = []

for answer in answers:
    start_end_pair = (answer['start'], answer['end'])

    # Cek apakah pasangan start-end sudah ada dalam set
    if start_end_pair not in unique_start_end:
        unique_start_end.add(start_end_pair)
        unique_answers.append(answer)

# unique_answers sekarang berisi jawaban dengan start-end yang unik
print(unique_answers)


[{'score': 0.7026851773262024, 'start': 980, 'end': 988, 'answer': '28997m2,'}, {'score': 0.01276418287307024, 'start': 1103, 'end': 1115, 'answer': '29095m2.[16]'}]


In [74]:
nlp_tg_ind = pipeline(task="text2text-generation", model=MODEL_TG_IND_NAME, tokenizer=MODEL_TG_IND_NAME, device=torch.cuda.current_device(), **tokenizer_kwargs)

nlp_tg_ind("Saya seorang kapitan")

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[{'generated_text': 'Saya kapitan'}]

In [78]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

MODEL_TG_IND_NAME = "Wikidepia/IndoT5-base-paraphrase"

# Load model and tokenizer
model_tg_ind = T5ForConditionalGeneration.from_pretrained(MODEL_TG_IND_NAME)
tokenizer_tg_ind = T5Tokenizer.from_pretrained(MODEL_TG_IND_NAME)
model_tg_ind.to("cuda")  # Pindahkan model ke GPU jika tersedia

# Fungsi untuk melakukan generasi teks
def nlp_tg_ind(prompt):
    input_ids = tokenizer_tg_ind(prompt, return_tensors="pt").input_ids.to("cuda")
    output = model_tg_ind.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)
    generated_text = tokenizer_tg_ind.batch_decode(output, skip_special_tokens=True)
    
    return [{'generated_text': generated_text[0]}]

# Menggunakan nlp_tg_ind
result = nlp_tg_ind("Saya seorang kapitan")
print(result)


[{'generated_text': 'Saya kapitan'}]


In [79]:
nlp_tg_eng = pipeline(task="text2text-generation", model=MODEL_TG_ENG_NAME, tokenizer=MODEL_TG_ENG_NAME, device=torch.cuda.current_device(), **tokenizer_kwargs)

nlp_tg_eng("Saya seorang kapitan")

[{'generated_text': 'Saya seorang kapitan kapitan kapitaran sabuk terhada.'}]

In [83]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

MODEL_TG_ENG_NAME = "humarin/chatgpt_paraphraser_on_T5_base"

# Load model and tokenizer
model_tg_eng = T5ForConditionalGeneration.from_pretrained(MODEL_TG_ENG_NAME)
tokenizer_tg_eng = T5Tokenizer.from_pretrained(MODEL_TG_ENG_NAME) # Pindahkan model ke GPU jika tersedia

# Fungsi untuk melakukan generasi teks
def nlp_tg_eng(prompt):
    input_ids = tokenizer_tg_eng(prompt, return_tensors="pt").input_ids
    output = model_tg_eng.generate(input_ids, max_length=50, num_return_sequences=1, no_repeat_ngram_size=2, top_k=50, top_p=0.95)
    generated_text = tokenizer_tg_eng.batch_decode(output, skip_special_tokens=True)
    
    return [{'generated_text': generated_text[0]}]

# Menggunakan nlp_tg_ind
result = nlp_tg_eng("Saya seorang kapitan")
print(result)


[{'generated_text': 'Saya seorang kapitan adilirih sabukh di dua dalah saya salaman terhada.'}]
