In [1]:
import pandas as pd

# Load the raw data, keeping only the mail text and its sentiment label.
filepath = '../origin/categorize.csv'
df = pd.read_csv(filepath)[['mail', 'sentiment']]
display(df['sentiment'].value_counts())

# Exclude the neutral class (currently disabled).
# df = df[df['sentiment'] != 'NEUTRAL']

# Convert the string labels to integer class ids.
label_map = {'NEGATIVE': 0, 'POSITIVE': 1, 'NEUTRAL': 2}
encoded = df['sentiment'].map(label_map)

# Series.map turns every value that is not a key of label_map (including
# missing values) into NaN. Fail here instead of letting NaN / float labels
# leak into the stratified split and the exported CSV files.
unmapped = df.loc[encoded.isna(), 'sentiment'].unique().tolist()
if unmapped:
    raise ValueError(f'Unexpected sentiment values: {unmapped}')
display(encoded.value_counts())

# Build a new frame rather than mutating in place: this avoids assigning into
# a column-sliced frame and keeps the cell free of inplace side effects.
df = (
    df.assign(sentiment=encoded.astype(int))
      .rename(columns={'mail': 'text', 'sentiment': 'label'})
)

from sklearn.model_selection import train_test_split

# Stage 1: keep half of the rows for training, stratified by label.
texts, labels = df['text'], df['label']
X_train, X_rest, y_train, y_rest = train_test_split(
    texts,
    labels,
    train_size=0.5,
    random_state=42,
    stratify=labels,
)

# Stage 2: cut the remaining half evenly into test and evaluation sets,
# again stratified by label.
X_test, X_eval, y_test, y_eval = train_test_split(
    X_rest,
    y_rest,
    train_size=0.5,
    random_state=42,
    stratify=y_rest,
)

sentiment
POSITIVE    657
NEGATIVE    362
NEUTRAL     280
Name: count, dtype: int64

sentiment
1    657
0    362
2    280
Name: count, dtype: int64

In [2]:
def _as_split_frame(texts, labels):
    """Build a DataFrame with a 'text' column and a 'label' column."""
    return pd.DataFrame({'text': texts, 'label': labels})


train_df = _as_split_frame(X_train, y_train)
test_df = _as_split_frame(X_test, y_test)
eval_df = _as_split_frame(X_eval, y_eval)

# NOTE(review): this removes the row whose index label is 0 from the test
# split; the reason is not visible in this notebook -- confirm it is intended.
# DataFrame.drop raises KeyError when label 0 is not present in test_df, so
# this line depends on how the split above assigned that row.
test_df = test_df.drop(index=0)

In [3]:
# Show (rows, columns) for the train, test and eval frames, in that order.
tuple(split_frame.shape for split_frame in (train_df, test_df, eval_df))

((649, 2), (324, 2), (325, 2))

In [4]:
# Write each split next to the notebook as a CSV without header row or index
# column, so every line is just "<text>,<label>".
split_files = {
    './train.csv': train_df,
    './test.csv': test_df,
    './eval.csv': eval_df,
}
for csv_path, split_frame in split_files.items():
    split_frame.to_csv(csv_path, index=False, header=False)

In [5]:
# https://huggingface.co/docs/datasets/create_dataset
from datasets import DatasetBuilder, GeneratorBasedBuilder
import datasets
import csv

class FTDataset(GeneratorBasedBuilder):
    """Dataset builder exposing the exported CSV files as train/test/validation splits.

    The files read here ('./train.csv', './test.csv', './eval.csv') are the
    header-less, two-column CSVs written earlier in this notebook: column 0 is
    the text, column 1 is the integer class id.
    """

    def _info(self):
        """Return the dataset schema: a string ``text`` and a three-class ``label``."""
        return datasets.DatasetInfo(
            features=datasets.Features({
                'text': datasets.Value('string'),
                # Name order must match the ids used when encoding the labels
                # (0 = negative, 1 = positive, 2 = neutral).
                'label': datasets.ClassLabel(names=['negative', 'positive', 'neutral']),
            }),
        )

    def _split_generators(self, dl_manager):
        """Returns SplitGenerators.

        ``dl_manager`` is unused: the split files are local paths relative to
        the current working directory.
        """
        train_path = './train.csv'
        test_path = './test.csv'
        eval_path = './eval.csv'

        return [
            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepath": train_path}),
            datasets.SplitGenerator(name=datasets.Split.TEST, gen_kwargs={"filepath": test_path}),
            datasets.SplitGenerator(name=datasets.Split.VALIDATION, gen_kwargs={"filepath": eval_path}),
        ]

    def _generate_examples(self, filepath):
        """Yield ``(key, example)`` pairs, one per CSV row of ``filepath``.

        The key is the zero-based row number. The label is passed through as
        the raw CSV string and left to the ``ClassLabel`` feature to encode.
        """
        # newline='' is required by the csv module: without it, universal
        # newline translation rewrites line breaks embedded in quoted fields
        # (likely in mail text), so the text would not round-trip unchanged.
        with open(filepath, encoding='utf-8', newline='') as csv_file:
            csv_reader = csv.reader(csv_file)
            for id_, row in enumerate(csv_reader):
                yield id_, {
                    'text': row[0],
                    'label': row[1],
                }

In [6]:
from datasets import load_dataset

# NOTE(review): the builder is loaded from the script file 'dataset_loader.py',
# not from a class defined in this notebook -- presumably that file holds the
# same FTDataset definition; confirm the two are kept in sync.
loader_script = 'dataset_loader.py'
dataset = load_dataset(loader_script, name='sentiment_dataset')
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 649
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 324
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 325
    })
})