In [1]:
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    EvalPrediction,
    HfArgumentParser,
    PretrainedConfig,
    # Trainer,
    TrainingArguments,
    default_data_collator,
    set_seed,
)
import os
import json
import re
import random
from sklearn.metrics import precision_score, recall_score, f1_score
import datasets
from datasets import Dataset
import pandas as pd
from transformers.data.data_collator import DataCollator, DataCollatorWithPadding, default_data_collator
from trainer import Trainer
from scipy.special import softmax

class DataTrainingArguments:
    """
    Arguments pertaining to what data we are going to input our model for training and eval.

    Using `HfArgumentParser` we can turn this class
    into argparse arguments to be able to specify them on
    the command line.
    """

    task_name: Optional[str] = field(
        default=None,
        metadata={"help": "The name of the task to train on: " + ", ".join(task_to_keys.keys())},
    )
    dataset_name: Optional[str] = field(
        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
    )
    dataset_config_name: Optional[str] = field(
        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
    )
    max_seq_length: int = field(
        default=512,
        metadata={
            "help": (
                "The maximum total input sequence length after tokenization. Sequences longer "
                "than this will be truncated, sequences shorter will be padded."
            )
        },
    )
    overwrite_cache: bool = field(
        default=False, metadata={"help": "Overwrite the cached preprocessed datasets or not."}
    )
    pad_to_max_length: bool = field(
        default=True,
        metadata={
            "help": (
                "Whether to pad all samples to `max_seq_length`. "
                "If False, will pad the samples dynamically when batching to the maximum length in the batch."
            )
        },
    )
    max_train_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of training examples to this "
                "value if set."
            )
        },
    )
    max_eval_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
                "value if set."
            )
        },
    )
    max_predict_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "For debugging purposes or quicker training, truncate the number of prediction examples to this "
                "value if set."
            )
        },
    )
    train_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the training data."}
    )
    validation_file: Optional[str] = field(
        default=None, metadata={"help": "A csv or a json file containing the validation data."}
    )
    test_file: Optional[str] = field(default=None, metadata={"help": "A csv or a json file containing the test data."})

    def __post_init__(self):
        if self.task_name is not None:
            self.task_name = self.task_name.lower()
            if self.task_name not in task_to_keys.keys():
                raise ValueError("Unknown task, you should pick one in " + ",".join(task_to_keys.keys()))
        elif self.dataset_name is not None:
            pass
        elif self.train_file is None or self.validation_file is None:
            raise ValueError("Need either a GLUE task, a training/validation file or a dataset name.")
        else:
            train_extension = self.train_file.split(".")[-1]
            assert train_extension in ["csv", "json"], "`train_file` should be a csv or a json file."
            validation_extension = self.validation_file.split(".")[-1]
            assert (
                validation_extension == train_extension
            ), "`validation_file` should have the same extension (csv or json) as `train_file`."

def format_feature(feature):
    if isinstance(feature, float):
        feature = round(feature, 3)
        return feature

def load_prelabel_output_dir(path):
    files = [os.path.join(path, i) for i in os.listdir(path) if '.json' in i]
    with open(files[0], "r") as jf:
        data = json.load(jf)
    for file in files[1:]:
        with open(file, "r") as jf:
            data_new = json.load(jf)
            data.update(data_new)
    return data

def clean_string(sentence):
    cleaned_string = re.sub(r'\d+\.\s*', '', sentence)
    cleaned_string = cleaned_string.replace("\n", "")
    return cleaned_string

def truncate_dialogue(string, max_len):
    # 截取最后512个单词
    words = string.split()
    if len(words) < max_len:
        return string
    words_to_keep = words[-max_len:]

    # 将截取的单词重新组合成字符串
    truncated_string = " ".join(words_to_keep)

    # 确保对话以[user]或[advisor]开头
    index_user = truncated_string.find("[user]")
    index_advisor = truncated_string.find("[advisor]")

    # 如果找到了[user]或[advisor]开头的索引
    if (index_user != -1) or (index_advisor != -1):
        truncated_string = truncated_string[min(index_user, index_advisor):]

    # 确保截取后的对话以[user]或[advisor]开头
    if truncated_string.startswith("[user]") or truncated_string.startswith("[advisor]"):
        return truncated_string

    # 如果不是以[user]或[advisor]开头，则继续向后截取
    for i in range(len(truncated_string)):
        if truncated_string[i:].startswith("[user]") or truncated_string[i:].startswith("[advisor]"):
            return truncated_string[i:]

    # 如果未找到[user]或[advisor]开头，则返回空字符串
    return ""

def get_data(path,max_len=512):
    data = load_prelabel_output_dir(path)
    sentences = []
    scores = []
    order_ids = []
    sample_nums = []
    for order_id in data:
        for sample_num in data[order_id]:
            history = clean_string(data[order_id][sample_num]['history'])
            current = clean_string(data[order_id][sample_num]['current'])
            score = data[order_id][sample_num]['score']
            
            if score > 0.3:
                score = 2
            elif score < -0.3:
                score = 0
            else:
                score = 1
            
            # sentence = history + current
            # sentence = "{}{}[SEP] {}".format(history,current,current)
            sentence = "{}[SEP] {}".format(history,current)
            sentence = truncate_dialogue(sentence,max_len)
            sentences.append(sentence)
            scores.append(score)
            order_ids.append(order_id)
            sample_nums.append(sample_num)
            
    return sentences,scores,order_ids,sample_nums

def collate_data_deprecated(path, max_len=512):

    data = {
        "sentence": [],
        "label": [],
        "idx": [],
        "group_id": [] # 0 for raw data 1 for copied data
    }

    # balance means balancing the possitive and negative data
    
    raw_data,labels,order_ids,sample_nums = get_data(path,max_len)

    data_idx = [i for i in range(len(raw_data))]

    label_to_data_idxs = {}

    for idx in data_idx:
        label = labels[idx]
        data["sentence"].append(raw_data[idx])
        data["label"].append(label)
        data["idx"].append(idx)
        data["group_id"].append(0)
        if label not in label_to_data_idxs:
            label_to_data_idxs[label] = []
        label_to_data_idxs[label].append(idx)

    for label, idx_list in label_to_data_idxs.items():
        print("ratio of {} label is: {}".format(label, len(idx_list) / len(data_idx)))
    
    return data,order_ids,sample_nums

def get_my_dataset(train_data_path,valid_data_path,max_len):
    dataset_dict = {}
    
    train_data,train_order_id,train_sample_num = collate_data_deprecated(
        path=train_data_path,
        max_len=max_len
    )
    
    valid_data,valid_order_id,valid_sample_num = collate_data_deprecated(
        path=valid_data_path,
        max_len=max_len
    )

    dataset_dict["train_pre"] = Dataset.from_pandas(pd.DataFrame(train_data))
    dataset_dict["valid_pre"] = Dataset.from_pandas(pd.DataFrame(valid_data))

    return datasets.DatasetDict(dataset_dict),train_order_id,train_sample_num,valid_order_id,valid_sample_num

def my_metrics(pred_labels, true_labels): 
    metric_dict = {}
    precision, recall, f1_score, _ = precision_recall_fscore_support(true_labels, pred_labels, average=None)
    metric_dict['precision_negative'] = precision[0]
    metric_dict['recall_negative'] = recall[0]
    metric_dict['f1_score_negative'] = f1_score[0]
    metric_dict['precision_neutral'] = precision[1]
    metric_dict['recall_neutral'] = recall[1]
    metric_dict['f1_score_neutral'] = f1_score[1]
    metric_dict['precision_positive'] = precision[2]
    metric_dict['recall_positive'] = recall[2]
    metric_dict['f1_score_positive'] = f1_score[2]
    
    return metric_dict
        

# You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction, group_id_list: list):
    id2label_tb = {0: "negative", 1: "neutral", 2:"positive"}
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
    if data_args.task_name is not None:
        result = metric.compute(predictions=preds, references=p.label_ids)
        # compute my metrics here
        my_result = my_metrics(preds, p.label_ids)
        print(my_result)
        for a_key in list(my_result.keys()):
            result[a_key] = my_result[a_key]
        # if len(result) > 1:
        #     result["combined_score"] = np.mean(list(result.values())).item()
        return result
    elif is_regression:
        return {"mse": ((preds - p.label_ids) ** 2).mean().item()}
    else:
        return {"accuracy": (preds == p.label_ids).astype(np.float32).mean().item()}

def main():
    model_path = './tmp/detect_flirt.finbert.v2.model/'
    train_data_path = '/data/yangxiaoran/nonrt_init/pay_rate_level_train/data/sentiment_train_new/'
    valid_data_path = '/data/yangxiaoran/nonrt_init/pay_rate_level_train/data/sentiment_valid_new/'
    max_len = 512
    parser = HfArgumentParser(TrainingArguments, CustomerArguments)
    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
        # If we pass only one argument to the script and it's the path to a json file,
        # let's parse it to get our arguments.
        training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
    else:
        training_args = parser.parse_args_into_dataclasses()

    raw_datasets,train_order_id,train_sample_num,valid_order_id,valid_sample_num = get_my_dataset(train_data_path,valid_data_path,max_len)
    
    # 加载配置文件
    config = AutoConfig.from_pretrained(model_path)
    
    # 加载分词器
    tokenizer = AutoTokenizer.from_pretrained(model_path)
    
    # 加载模型
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=None,
        eval_dataset=None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=data_collator,
        kick_ratio=0
    )
    
    train_predict_dataset = raw_datasets["train_pre"]
    valid_predict_dataset = raw_datasets["valid_pre"]
    
    order_id_2_feature={}
    order_id_predict={}
    feature_name=['bert_'+str(i) for i in range(768)]
    
    predict_datasets = [train_predict_dataset,valid_predict_dataset]
    order_id_list = [train_order_id,valid_order_id]
    sample_num_list = [train_sample_num,valid_sample_num]

    for predict_dataset, order_id_data, sample_num_data in zip(predict_datasets, order_id_list, sample_num_list):
        # Removing the `label` columns because it contains -1 and Trainer won't like that.
        predict_dataset_new = predict_dataset.remove_columns("label")
        predictions,hidden_states = trainer.predict(predict_dataset_new, metric_key_prefix="predict")
        predictions = predictions.predictions
        
        # predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)
        predictions = np.squeeze(predictions) if is_regression else softmax(predictions, axis=1)

        if trainer.is_world_process_zero():
            for index, item in enumerate(tqdm(predictions)):
                features = {}
                
                if order_id_data[index] not in order_id_predict:
                    order_id_predict[order_id_data[index]] = {}
                if sample_num_data[index] not in order_id_predict[order_id_data[index]]:
                    order_id_predict[order_id_data[index]][sample_num_data[index]] = {}
                order_id_predict[order_id_data[index]][sample_num_data[index]]['true'] = int(predict_dataset['label'][index])
                
                # order_id_predict[order_id_data[index]][sample_num_data[index]]['predict'] = int(item)
                order_id_predict[order_id_data[index]][sample_num_data[index]]['predict'] = list(item.astype(np.float))
                
                for i in range(768):
                    features[feature_name[i]] = list(hidden_states[index].astype(float))[i]
                if order_id_data[index] not in order_id_2_feature:
                    order_id_2_feature[order_id_data[index]] = {}
                if sample_num_data[index] not in order_id_2_feature[order_id_data[index]]:
                    order_id_2_feature[order_id_data[index]][sample_num_data[index]] = features

    with open('./data/sentence_feature_try.json', 'w') as f:
        json.dump(order_id_2_feature, f)
    with open('./tmp/sst2/order_id_predict_try.json', 'w') as f:
        json.dump(order_id_predict, f)
    

if __name__ == "__main__":
    main()

NameError: name 'field' is not defined