In [1]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM, TextStreamer
from torch.nn import functional as F
from optimum.bettertransformer import BetterTransformer
# set pytorch to use bfloat

# get device
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def check_tokens_in_dict(labels, tokenizer):
    c = 0
    for token in labels:
        if token in tokenizer.get_vocab().keys():
            c+=1
    print(f"{c} tokens in dict")

def classify_text(sentence, labels, tokenizer, model, device):
    with torch.no_grad():
        tokenizer.pad_token = tokenizer.eos_token
        inputs = tokenizer.batch_encode_plus([sentence] + labels,
                                            return_tensors='pt',
                                            pad_to_max_length=True)
        input_ids = inputs['input_ids']
        attention_mask = inputs['attention_mask']

        output = model(input_ids, attention_mask=attention_mask)[0]
        sentence_rep = output[:1].mean(dim=1)
        label_reps = output[1:].mean(dim=1)

        # now find the labels with the highest cosine similarities to
        # the sentence
        similarities = F.cosine_similarity(sentence_rep.float(), label_reps.float())
        closest = similarities.argsort(descending=True)
        result = {}
        #for ind in closest:
            #print(f'label: {labels[ind]} \t similarity: {similarities[ind]}')
            
        return closest, similarities

In [3]:
# model pool
# LinkSoul/Chinese-Llama-2-7b
# ckiplab/gpt2-base-chinese
# 

model_path = "FlagAlpha/Llama2-Chinese-7b-Chat"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model_hf = AutoModel.from_pretrained(model_path, load_in_4bit=True)
model = BetterTransformer.transform(model_hf, keep_original_model=False)



sentence = '將「睡眠呼吸中止症」重新納入身心障礙'
labels = ['交通', '法律與正義', '教育', '金融與經濟', '環境與氣候', '能源', '社會福祉', '軍事與國安']

# run inputs through model and mean-pool over the sequence
# dimension to get sequence-level representations
check_tokens_in_dict(labels, tokenizer)

Loading checkpoint shards: 100%|██████████| 2/2 [00:11<00:00,  5.57s/it]
The BetterTransformer implementation does not support padding during training, as the fused kernels do not support attention masks. Beware that passing padded batched data during training may result in unexpected outputs. Please refer to https://huggingface.co/docs/optimum/bettertransformer/overview for more details.


0 tokens in dict


In [4]:
from urbandev.utils import load_data_excel

data = load_data_excel("./data/JOIN_iVoting_Proposals_categorized.xlsx")

In [5]:
data.head()

Unnamed: 0,Index,publishDate,url,title,proposal,benefits&impact,#Votes,MinVotesNecessary,SubmissionDate,Followers,Messages,GA,proposer,Category,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17
0,0,2015-09-10 13:26:19,https://join.gov.tw/idea/detail/25824c17-f141-...,Join 平台應提供匯出資料供批次下載,\n目前 Join 平台為方便機關人員作業，在後台有「打包匯出資料」的功能，但前台沒有開放給...,這是 kiang 在 g0v 提出的想法。\n利益：提供民間備份，並可介接第三方進行全文檢索...,22,250,2015-09-10 17:12:05,1,0,無,au,Social,,,,
1,1,2015-09-10 16:08:10,https://join.gov.tw/idea/detail/75185e90-3a37-...,你是否贊成推動「十八歲投票權」及「二十歲被選舉權」?,鑒於國民年滿十六歲即可工作、納稅，\n年滿十八歲就須負完全的刑事責任並有應考試、服公職的權利...,世代正義是我國民主發展所必須正視的課題，\n若設置過高的年齡門檻形同將年輕世代排除在體制性的...,0,0,2015-09-10 16:11:31,0,0,無,森里蛍一,Law & Justice,,,Categories,
2,2,2015-09-10 19:40:31,https://join.gov.tw/idea/detail/4e658586-2a08-...,你是否贊成將國家撥給政黨的競選費用補助金門檻由3.5%降為3%，並設置10%的上限?,雖然之前國家撥給政黨的競選費用補助金門檻從5%下降至3.5%但仍不夠低，\n以國外案例來說德...,修法調降政黨競選費用補助金門檻有利於小黨發展並可促進多元政黨政治發展，\n稚現行修正後之門檻...,5,250,2015-09-22 00:45:06,1,3,無,森里蛍一,Law & Justice,,,English,Chinese
3,3,2015-09-10 20:45:11,https://join.gov.tw/idea/detail/94b5dca9-57fc-...,都更的建議,國家既然採多數決.為何總是被少數人綁架.都更常因釘子戶造成困擾.個人建議.國家要發展.這個問...,,2,250,2015-09-10 20:49:17,0,0,無,樂與喜,Social,,,Transport,交通
4,4,2015-09-10 21:40:32,https://join.gov.tw/idea/detail/3bdab9bf-d874-...,引進鞭刑,依先進國家如新加坡的刑法引進鞭刑讓重刑犯，強姦犯等重大罪犯得到應得的逞罰,促進社會正義，讓正義得以伸張並幫助這些罪犯記取教訓以及促進受害者人權，進而讓國家進步,184,250,2015-09-10 21:42:48,4,0,無,UFO,Law & Justice,,,Law & Justice,法律與正義


In [6]:
labelsLUT = {"Transport": '交通',"Law & Justice": '法律與正義',"Education": '教育',"Finance": '金融與經濟',
"Environment & Climate": '環境與氣候',"Energy": '能源',"Social": '社會福祉',"Military and Natonal Security": '軍事與國安'}

In [7]:
import numpy as np
sims = []
predictions = []
correct = 0
n=80
total = 0
for index, row in data.iloc[:n].iterrows():
    preds, scores = classify_text(row["proposal"], labels, tokenizer, model, device)
    predictions.append(preds)
    sims.append(scores)
    gt = row["Category"]
    if labels[preds[0]]==labelsLUT[gt.strip()]:
        correct+=1
    total+=1
print(f"Accuracy is {correct/(total)}")






OutOfMemoryError: CUDA out of memory. Tried to allocate 1.30 GiB (GPU 0; 11.90 GiB total capacity; 8.60 GiB already allocated; 940.88 MiB free; 10.81 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
tokenizer.get_vocab().keys()



In [None]:
data.iloc[index]

Index                                                               79
publishDate                                        2016-03-01 11:29:55
url                  https://join.gov.tw/idea/detail/531937fc-a5ce-...
title                                                         企業，外勞，分紅
proposal             政府開放企業雇用外勞逐，無非是要減輕企業的人事成本，但卻加速國內就業薪資停滯以及失業率惡化。...
benefits&impact                                                    NaN
#Votes                                                               1
MinVotesNecessary                                                  250
SubmissionDate                                     2016-03-01 11:50:52
Followers                                                            0
Messages                                                             0
GA                                                                   無
proposer                                                          Even
Category                                                       Social 
Unname

In [None]:
model.dict

AttributeError: 'LlamaModel' object has no attribute 'dict'

## Zero SHot classification

In [None]:
pipe = pipeline("zero-shot-classification", model=model, tokenizer=tokenizer)

Failed to determine 'entailment' label id from the label2id mapping in the model config. Setting to -1. Define a descriptive label2id mapping in the model config to ensure correct outputs.
