In [1]:
import os
import random
import numpy as np
import pandas as pd
import torch
import transformers
from tqdm import tqdm, tqdm_notebook
from datasets import load_dataset, load_metric
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

In [35]:
def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED``, and configures cuDNN for
    deterministic execution.

    Args:
        seed: Integer seed applied to all generators. Defaults to 42.
    """
    random.seed(seed)
    np.random.seed(seed)
    # NOTE: hash randomization of the running interpreter is fixed at startup;
    # this export only takes effect for processes started afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)  # type: ignore
    torch.cuda.manual_seed_all(seed)
    # cuDNN benchmark mode auto-tunes convolution algorithms and may select a
    # different one on each run, so it must be off for reproducible results.
    torch.backends.cudnn.benchmark = False  # type: ignore
    torch.backends.cudnn.deterministic = True  # type: ignore

# One seed for the whole notebook; it is also passed as `random_state` to the splits.
SEED = 42
seed_everything(SEED)

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
print(device)

cuda:0


In [13]:
DATA_PATH = "./data/"

# Author's note on the original line: 1637 (matches the label-1 count printed below).
data_strongbuy = pd.read_csv(os.path.join(DATA_PATH, "strongbuy.csv"))

# Author's note on the original line: 1113 (matches the label-0 count printed below).
# This CSV carries an extra 'Unnamed: 0' column, which is dropped right after loading.
data_sell = pd.read_csv(os.path.join(DATA_PATH, "sellall.csv")).drop(columns='Unnamed: 0')

# Stack both sources into one frame with a fresh 0..n-1 index.
df = pd.concat([data_strongbuy, data_sell], ignore_index=True)

# Quick sanity report: schema, class balance, overall shape.
df.info()
print()
label_counts = df['label'].value_counts()
print(label_counts)
print()
print(df.shape)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2750 entries, 0 to 2749
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  2750 non-null   object
 1   article   2750 non-null   object
 2   length    2750 non-null   int64 
 3   label     2750 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 86.1+ KB

1    1637
0    1113
Name: label, dtype: int64

(2750, 4)


In [49]:
print(df.shape)

# Two-stage stratified split: first hold out 20% as the test set, then take
# 20% of the remainder as the validation set. Both stages reuse SEED.
df_train_valid, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=SEED,
    stratify=df['label'],
)
df_train, df_valid = train_test_split(
    df_train_valid,
    test_size=0.2,
    random_state=SEED,
    stratify=df_train_valid['label'],
)

# Report the size of each split ...
for split_df in (df_train, df_valid, df_test):
    print(split_df.shape)
print()

# ... followed by the class balance of each split, separated by blank lines.
label_reports = [
    ("df_train label counts:", df_train),
    ("df_valid label counts:", df_valid),
    ("df_test label counts:", df_test),
]
for position, (heading, split_df) in enumerate(label_reports):
    if position > 0:
        print()
    print(heading)
    print(split_df['label'].value_counts())

(2750, 4)
(1760, 4)
(440, 4)
(550, 4)

df_train label counts:
1    1048
0     712
Name: label, dtype: int64

df_valid label counts:
1    262
0    178
Name: label, dtype: int64

df_test label counts:
1    327
0    223
Name: label, dtype: int64


In [52]:
# Persist each split as a tab-separated file (no index column) under DATA_PATH.
# The training file is written from df_train, not df_train_valid: df_valid is
# carved out of df_train_valid, so writing the combined frame would put every
# validation row into the training file as well (train/validation leakage).
split_files = {
    "train_report.tsv": df_train,
    "valid_report.tsv": df_valid,
    "test_report.tsv": df_test,
}
for file_name, split_df in split_files.items():
    split_df.to_csv(os.path.join(DATA_PATH, file_name), index=False, sep='\t')

In [None]:
# MODEL = 'skt/kobert-base-v1'
# MAX_LEN = 512

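# NOTE(review): the split files in this notebook are written as tab-separated
# "*_report.tsv"; the "*.csv" paths below look stale. Confirm the file names and
# delimiter before re-enabling this cell.
# train_datasets = load_dataset("csv", data_files="./data/train_report.csv")['train']
# valid_datasets = load_dataset("csv", data_files="./data/test_report.csv")['train']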

# tokenizer = AutoTokenizer.from_pretrained(MODEL)
# # tokenizer.truncation_side = 'left'

# def example_fn(examples):
#     outputs = tokenizer(examples['article'], padding=True, max_length=MAX_LEN, truncation=True)
#     if 'label' in examples:
#         outputs['labels'] = examples['label']
#     return outputs

# train_datasets = train_datasets.map(example_fn, remove_columns=['Unnamed: 0', 'filename', 'article', 'length', 'label'])
# valid_datasets = valid_datasets.map(example_fn, remove_columns=['Unnamed: 0', 'filename', 'article', 'length', 'label'])

In [None]:
# train_datasets.save_to_disk("./data/train_reports")
# valid_datasets.save_to_disk("./data/valid_reports")