In [2]:
import torch
from utils import process_dataset
from argparse import ArgumentParser
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix
from tqdm import tqdm
import numpy as np
import random

# Seed every random number generator the notebook relies on (PyTorch,
# Python's `random`, NumPy) so repeated runs draw the same numbers.
seed = 123
torch.manual_seed(seed)
random.seed(seed)
np.random.seed(seed)

# Training was pinned to the GPU with index 1. Selecting it unconditionally
# raises on machines without CUDA or with a single GPU, so only switch when
# that device actually exists and otherwise keep PyTorch's default device.
cuda_device_index = 1
if torch.cuda.is_available() and torch.cuda.device_count() > cuda_device_index:
    torch.cuda.set_device(cuda_device_index)
else:
    print(
        f"CUDA device {cuda_device_index} is unavailable; "
        "using PyTorch's default device instead."
    )

In [3]:
from datasets import load_dataset

# CSV file backing each split, keyed by the split name that ends up in the
# resulting DatasetDict.
# NOTE(review): these are absolute paths on one specific machine; the cell
# will not run elsewhere without editing them.
split_files = {
    "train": "/home/vincent0730/ML_pondlet_level_predictor/datasets/pondlet_STB_pondlet_20220803_content_data_train.csv",
    "test": "/home/vincent0730/ML_pondlet_level_predictor/datasets/pondlet_STB_pondlet_20220803_content_data_test.csv",
}

# Load both CSV files through the "csv" dataset builder.
dataset = load_dataset("csv", data_files=split_files)

Using custom data configuration default-0f8587004575917f
Reusing dataset csv (/home/vincent0730/.cache/huggingface/datasets/csv/default-0f8587004575917f/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58)


  0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
# Display the loaded DatasetDict to check its splits, column names and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'Length', 'content'],
        num_rows: 1561
    })
    test: Dataset({
        features: ['labels', 'Length', 'content'],
        num_rows: 174
    })
})

In [5]:
# Distinct label values found in the training split, in sorted order so that
# the integer ids assigned below are the same on every run.
label_list = sorted(dataset["train"].unique("labels"))
num_labels = len(label_list)

# Lookup tables in both directions: integer id -> label and label -> integer id.
id_to_label = dict(enumerate(label_list))
label_to_id = {name: idx for idx, name in id_to_label.items()}


def preprocess_labels(examples):
    """Replace the raw label values in a batch with their integer ids.

    Args:
        examples: A batch mapping column names to lists of values. It is
            modified in place when it has a "labels" column.

    Returns:
        The same ``examples`` object. It is returned untouched when the
        module-level ``label_to_id`` table is ``None`` or the batch has no
        "labels" column.

    Raises:
        KeyError: If a label value is not a key of ``label_to_id``.
    """
    # Nothing to encode without a lookup table or a label column.
    if label_to_id is None or "labels" not in examples:
        return examples

    encoded = []
    for raw_label in examples["labels"]:
        encoded.append(label_to_id[raw_label])
    examples["labels"] = encoded
    return examples


# Encode the labels batch by batch with preprocess_labels. This rebinds
# `dataset`, so from here on the "labels" column holds the values produced by
# label_to_id rather than the original label values.
dataset = dataset.map(preprocess_labels, batched=True)

Loading cached processed dataset at /home/vincent0730/.cache/huggingface/datasets/csv/default-0f8587004575917f/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-44867db30d67b366.arrow
Loading cached processed dataset at /home/vincent0730/.cache/huggingface/datasets/csv/default-0f8587004575917f/0.0.0/51cce309a08df9c4d82ffd9363bbe090bf173197fc01a71b034e8594995a1a58/cache-c4da537c1651ddae.arrow


In [6]:
# Display the label -> integer id table built from the training labels.
label_to_id

{'Lv.0': 0, 'Lv.1': 1, 'Lv.2': 2, 'Lv.3': 3, 'Lv.4': 4, 'Lv.5': 5}

In [7]:
# Training split, with labels already encoded by the earlier map step.
train_dataset = dataset["train"]
# Show the first record as a sanity check of the columns and the encoded label.
train_dataset[0]

{'labels': 5,
 'Length': 519,
 'content': '如今一个企业拼的是人才，一件产品拼的 是技术含量，一项文化活动拼的是创意。 有了好的创意，就是白菜豆腐到了烹饪大 师手中，也能做出一盘大菜来苏州人真的当起了“烹饪大师”。那里的金 鸡湖，木渎古镇、山塘街等地前几年我也 去过，这些地方好是好，但总比不上虎 丘、拙政园、水乡周庄来得显赫。现在可 不同了，金鸡湖能媲美“西湖”，木渎偏以 园林见长，山塘街的游客也成倍地猛增， 靠的是什么？还不是人家巧打了创意牌， 才赢得了游客的认同与赞赏创新！关键在创新！这是一个民族进步的 灵魂，是各行各业兴旺不竭的动力，也是 旅游业永葆生机的“源头活水”。只有在不 断创新上下功夫，我国的旅游业才能保持 竞争力和可持续发展的能力可是看看某些地方的旅游开发，确实让人 揪心。机械性地照搬照抄，或掠夺性地过 度开发，使大量的旅游景点停留在低水 平、单调重复、小散全的状态，有的只能 靠“门票经济”吃饭。就拿“西游记宫”来说 吧，有人统计全国造了 820 多个，这种低层次的简单模仿，其结果可想而知：只 能是一批又一批地倒闭。旅游创新是一道智慧题。必须突破传统思 维定式，敢于不按常规出牌，需要周密的 策划与高超的技巧。让我们一起打好“创 意牌”，擦亮创意火花，让旅游景点开出 一朵朵“领异标新二月花”，清香宜人。'}

In [8]:
# Evaluation data is taken directly from the "test" split.
# NOTE(review): this same object is later given to the trainer as
# `eval_dataset`, and that trainer runs the hyperparameter search. If the
# search scores trials on `eval_dataset`, hyperparameters end up tuned against
# the test data -- confirm, and consider holding out a validation split from
# the training data instead.
eval_dataset = dataset["test"]

In [9]:
from setfit import SetFitModel

def make_model(params=None):
    """Build a SetFit model from the multilingual MiniLM checkpoint.

    Args:
        params: Accepted so the function can be used as a trainer
            ``model_init`` hook (presumably it receives the hyperparameters
            of the current trial -- confirm against the SetFit docs). It is
            not used: every call loads the same checkpoint with default
            settings.

    Returns:
        Whatever ``SetFitModel.from_pretrained`` returns for the checkpoint.
    """
    return SetFitModel.from_pretrained(
        "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
    )


In [10]:
from sentence_transformers.losses import CosineSimilarityLoss

from setfit import SetFitTrainer

# Translate this dataset's column names ("content", "labels") into the names
# handed to the trainer ("text", "label").
text_label_columns = {"content": "text", "labels": "label"}

# The trainer receives the `make_model` factory instead of a model instance;
# presumably this lets it build a fresh model for each hyperparameter trial.
trainer = SetFitTrainer(
    # data
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    column_mapping=text_label_columns,
    # model and objective
    model_init=make_model,
    loss_class=CosineSimilarityLoss,
    # training schedule
    num_iterations=20,
    num_epochs=3,
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [11]:
def hyperparameter_search_function(trial):
    """Sample one hyperparameter configuration for a search trial.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``
            (presumably an Optuna trial -- confirm against the search backend).

    Returns:
        A dict with two entries: "learning_rate", drawn on a log scale from
        [1e-5, 1e-3], and "batch_size", chosen from 24 or 32.
    """
    # Keep this sampling order: learning rate first, then batch size.
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-3, log=True)
    batch_size = trial.suggest_categorical("batch_size", [24, 32])
    return {"learning_rate": learning_rate, "batch_size": batch_size}

In [None]:
# Run 10 search trials, sampling each configuration with
# hyperparameter_search_function, and keep the returned result.
# NOTE(review): the captured log shows 2602 optimization steps for the first
# trial alone, so expect this cell to take a long time.
best = trainer.hyperparameter_search(hyperparameter_search_function, n_trials=10)
# Display the result of the search.
best

[32m[I 2022-11-02 11:41:52,080][0m A new study created in memory with name: no-name-69897ec6-aa1f-4613-8072-e96993b4b127[0m
Trial: {'learning_rate': 0.00014915537758629986, 'batch_size': 24}
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
Applying column mapping to training dataset
***** Running training *****
  Num examples = 62440
  Num epochs = 3
  Total optimization steps = 2602
  Total train batch size = 24


Epoch:   0%|          | 0/3 [00:00<?, ?it/s]

Iteration:   0%|          | 0/2602 [00:00<?, ?it/s]