In [1]:
import numpy as np
import pickle
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

In [6]:
from persian_re.preprocess.utils import load_raw_data, remove_re_type
from persian_re.settings import Config

In [7]:
# Load the raw train and test splits through the project helper.
# NOTE(review): presumably both are pandas DataFrames with `text`, `re_type`,
# `comment` and `id` columns (those are the columns used below) — confirm in
# `load_raw_data`.
train_df, test_df = load_raw_data()

Remove the `Entity-Destination(e2,e1)` class from the dataset because it has only one sample in the train data and one sample in the test data.

In [8]:
# Drop the under-represented relation type from both splits, train first.
train_df, test_df = (
    remove_re_type(split_df, 'Entity-Destination(e2,e1)')
    for split_df in (train_df, test_df)
)

In [9]:
# Sanity check: train and test must contain exactly the same relation types,
# otherwise label ids derived from the train split would not cover the test
# split. The message lists the types present in only one of the two splits.
assert set(train_df['re_type'].unique()) == set(test_df['re_type'].unique()), (
    "train/test relation types differ: "
    f"{set(train_df['re_type'].unique()) ^ set(test_df['re_type'].unique())}"
)

getting list of class labels

In [10]:
# Sorted class names taken from the train split; `sorted` already returns a list.
labels = sorted(train_df['re_type'].unique())

Add a `label_id` column that maps each label string to its integer id (its index in `labels`).

In [11]:
# A label's id is its position in `labels`; the bound `labels.index` method
# performs that lookup for every row (and raises ValueError on an unknown label).
train_df['label_id'] = train_df['re_type'].apply(labels.index)
test_df['label_id'] = test_df['re_type'].apply(labels.index)

removing redundant columns

In [12]:
# Keep only the columns needed downstream; `drop` returns a new frame, so the
# original `train_df` / `test_df` stay untouched.
train = train_df.drop(['comment', 'id'], axis='columns')
test = test_df.drop(['comment', 'id'], axis='columns')

We keep the train data as it is and split the test data into test and validation sets.

In [13]:
# Split from a fresh frame derived from `test_df` instead of rebinding `test`
# from itself, so re-running this cell reproduces the same split rather than
# carving another validation set out of the already-reduced test data.
VALID_SIZE = 717  # absolute number of rows moved into the validation set
holdout = test_df.drop(columns=['comment', 'id'])
test, valid = train_test_split(
    holdout,
    test_size=VALID_SIZE,
    random_state=42,  # fixed seed keeps the split reproducible
    stratify=holdout['label_id'],  # stratify on the class id column
)

# Renumber the rows of every split from 0, discarding the old index.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)
test = test.reset_index(drop=True)

In [14]:
# Plain Python lists of sentences (x) and relation-type strings (y) per split.
# NOTE(review): the targets are the label strings from `re_type`, not the
# integer `label_id` column — confirm this is what downstream code expects.
x_train, y_train = train['text'].tolist(), train['re_type'].tolist()
x_valid, y_valid = valid['text'].tolist(), valid['re_type'].tolist()
x_test, y_test = test['text'].tolist(), test['re_type'].tolist()

In [15]:
# Report (rows, columns) for train, validation and test, one per line.
print(train.shape, valid.shape, test.shape, sep='\n')

(7999, 3)
(717, 3)
(1999, 3)


Compute class weights from the training labels to compensate for class imbalance.

In [16]:
# One weight per class, computed from the training labels only.
# `np.unique` returns the classes in sorted order, so weight i should line up
# with `labels[i]` (also sorted) — verify if the label source ever changes.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train,
)

`label2id` and `id2label` mapping

In [17]:
# Forward map: label string -> position in `labels`.
label2id = dict(zip(labels, range(len(labels))))
# Reverse map, built by inverting the forward map.
id2label = {idx: name for name, idx in label2id.items()}

In [18]:
# Bundle everything to be serialized: the three (texts, labels) splits, both
# label mappings, and the per-class weights.
data = dict(
    train=(x_train, y_train),
    valid=(x_valid, y_valid),
    test=(x_test, y_test),
    labels=dict(label2id=label2id, id2label=id2label),
    class_weights=class_weights,
)

serializing data to binary file:

In [19]:
# Write the bundled dataset next to the PERLEX data as a pickle file.
output_path = Config.BASE_PATH / 'PERLEX' / 'transformed_data.bin'
with open(output_path, 'wb') as binary_file:
    pickle.dump(data, binary_file)