# Multilingual Data Processing for SequenceClassification

## Dataset-specific Prep and Sampling

In [28]:
import pandas as pd
# prep
SEED = 42 # fixed seed so that re-running this cell reproduces the same sample
df = pd.read_csv('Multilingual_Data/english_original.csv', index_col=0, encoding='utf-8') # file path of the corpus
df = df.reset_index(drop=True)
df = df.dropna() # drops every row that has a missing value in any column

PET_col_name = 'type' # name of the column indicating the PET
label_col_name = 'is_euph' # name of the column indicating the label
max_num_ex = 40 # max number of examples taken per PET and per label

samples = [] # per-PET samples; concatenated once after the loop instead of growing a frame
for PET in df[PET_col_name].unique():
    is_this_PET = df[PET_col_name] == PET
    # sample from euphemistic examples
    euph_ex = df.loc[is_this_PET & (df[label_col_name]==1)]
    num_euph_sample = min(max_num_ex, len(euph_ex)) # choose up to the max num of euphemistic examples
    samples.append(euph_ex.sample(n=num_euph_sample, random_state=SEED))
    # sample from non-euphemistic examples
    noneuph_ex = df.loc[is_this_PET & (df[label_col_name]==0)]
    num_noneuph_sample = min(max_num_ex, len(noneuph_ex)) # choose up to the max num of non-euphemistic examples
    samples.append(noneuph_ex.sample(n=num_noneuph_sample, random_state=SEED))
    # print out stats per PET
    print("\"{}\" has {} euphemistic examples and {} non-euphemistic examples. Taking {} and {} for the final dataset.".format(PET, len(euph_ex), len(noneuph_ex), num_euph_sample, num_noneuph_sample))

# pd.concat rejects an empty list, so an empty corpus falls back to an empty frame with the same columns
sampled_corpus = pd.concat(samples) if samples else df.iloc[0:0]

print("Max number of examples taken per label:", max_num_ex)
print("Original size of corpus:", len(df))
print("Size of sampled corpus:", len(sampled_corpus))
euph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==1]
noneuph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==0]
print("Sample corpus contains {} euphemistic examples and {} non-euphemistic examples".format(len(euph_examples), len(noneuph_examples)))

# save the sampled corpus
sampled_corpus = sampled_corpus.reset_index(drop=True)
dest = "Multilingual_Data/english_sampled_v0.csv"
print("Saving corpus to \"{}\"".format(dest))
sampled_corpus.to_csv(dest)

"tinkle" has 2 euphemistic examples and 0 non-euphemistic examples. Taking 2 and 0 for the final dataset.
"undocumented immigrant" has 20 euphemistic examples and 0 non-euphemistic examples. Taking 20 and 0 for the final dataset.
"venereal disease" has 6 euphemistic examples and 0 non-euphemistic examples. Taking 6 and 0 for the final dataset.
"sex worker" has 20 euphemistic examples and 0 non-euphemistic examples. Taking 20 and 0 for the final dataset.
"mentally disabled" has 11 euphemistic examples and 0 non-euphemistic examples. Taking 11 and 0 for the final dataset.
"correctional facility" has 18 euphemistic examples and 0 non-euphemistic examples. Taking 18 and 0 for the final dataset.
"freedom fighter" has 20 euphemistic examples and 0 non-euphemistic examples. Taking 20 and 0 for the final dataset.
"detainee" has 20 euphemistic examples and 0 non-euphemistic examples. Taking 20 and 0 for the final dataset.
"comfort women" has 3 euphemistic examples and 0 non-euphemistic examples

### Chinese

In [9]:
import pandas as pd
# prep
SEED = 42 # fixed seed so that re-running this cell reproduces the same sample
df = pd.read_csv('Multilingual_Data/chinese_original.csv', index_col=0, encoding='utf-8') # file path of the corpus
df = df.reset_index(drop=True)
df = df.loc[df['Category'].isin([0, 1])] # has some no-label rows
df = df.dropna() # drops every row that has a missing value in any column
df = df.astype({'Category':'int'})

PET_col_name = 'PET' # name of the column indicating the PET
label_col_name = 'Category' # name of the column indicating the label
max_num_ex = 40 # max number of examples taken per PET and per label

samples = [] # per-PET samples; concatenated once after the loop instead of growing a frame
for PET in df[PET_col_name].unique():
    is_this_PET = df[PET_col_name] == PET
    # sample from euphemistic examples
    euph_ex = df.loc[is_this_PET & (df[label_col_name]==1)]
    num_euph_sample = min(max_num_ex, len(euph_ex)) # choose up to the max num of euphemistic examples
    samples.append(euph_ex.sample(n=num_euph_sample, random_state=SEED))
    # sample from non-euphemistic examples
    noneuph_ex = df.loc[is_this_PET & (df[label_col_name]==0)]
    num_noneuph_sample = min(max_num_ex, len(noneuph_ex)) # choose up to the max num of non-euphemistic examples
    samples.append(noneuph_ex.sample(n=num_noneuph_sample, random_state=SEED))
    # print out stats per PET
    print("\"{}\" has {} euphemistic examples and {} non-euphemistic examples. Taking {} and {} for the final dataset.".format(PET, len(euph_ex), len(noneuph_ex), num_euph_sample, num_noneuph_sample))

# pd.concat rejects an empty list, so an empty corpus falls back to an empty frame with the same columns
sampled_corpus = pd.concat(samples) if samples else df.iloc[0:0]

print("Max number of examples taken per label:", max_num_ex)
print("Original size of corpus:", len(df))
print("Size of sampled corpus:", len(sampled_corpus))
euph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==1]
noneuph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==0]
print("Sample corpus contains {} euphemistic examples and {} non-euphemistic examples".format(len(euph_examples), len(noneuph_examples)))

# save the sampled corpus
sampled_corpus = sampled_corpus.reset_index(drop=True)
dest = "Multilingual_Data/chinese_sampled_v1.csv"
print("Saving corpus to \"{}\"".format(dest))
sampled_corpus.to_csv(dest)

"下身" has 7 euphemistic examples and 122 non-euphemistic examples. Taking 7 and 40 for the final dataset.
"不在了" has 12 euphemistic examples and 14 non-euphemistic examples. Taking 12 and 14 for the final dataset.
"云雨" has 1 euphemistic examples and 1 non-euphemistic examples. Taking 1 and 1 for the final dataset.
"作古" has 3 euphemistic examples and 2 non-euphemistic examples. Taking 3 and 2 for the final dataset.
"升天" has 4 euphemistic examples and 6 non-euphemistic examples. Taking 4 and 6 for the final dataset.
"同志" has 16 euphemistic examples and 786 non-euphemistic examples. Taking 16 and 40 for the final dataset.
"同房" has 17 euphemistic examples and 1 non-euphemistic examples. Taking 17 and 1 for the final dataset.
"大号" has 7 euphemistic examples and 144 non-euphemistic examples. Taking 7 and 40 for the final dataset.
"大清洗" has 2 euphemistic examples and 1 non-euphemistic examples. Taking 2 and 1 for the final dataset.
"夫妻生活" has 19 euphemistic examples and 3 non-euphemistic exampl

### Spanish

In [87]:
import pandas as pd

# prep
SEED = 42 # fixed seed so that re-running this cell reproduces the same sample
df = pd.read_csv('Multilingual_Data/spanish_original.csv', index_col=0, encoding='utf-8') # file path of the corpus
df = df.reset_index(drop=True)

PET_col_name = 'Palabra clave' # name of the column indicating the PET
label_col_name = 'Es Eufemistico' # name of the column indicating the label
max_num_ex = 40 # max number of examples taken per PET and per label

samples = [] # per-PET samples; concatenated once after the loop instead of growing a frame
for PET in df[PET_col_name].unique():
    is_this_PET = df[PET_col_name] == PET
    # sample from euphemistic examples
    euph_ex = df.loc[is_this_PET & (df[label_col_name]==1)]
    num_euph_sample = min(max_num_ex, len(euph_ex)) # choose up to the max num of euphemistic examples
    samples.append(euph_ex.sample(n=num_euph_sample, random_state=SEED))
    # sample from non-euphemistic examples
    noneuph_ex = df.loc[is_this_PET & (df[label_col_name]==0)]
    num_noneuph_sample = min(max_num_ex, len(noneuph_ex)) # choose up to the max num of non-euphemistic examples
    samples.append(noneuph_ex.sample(n=num_noneuph_sample, random_state=SEED))
    # uncomment to print out stats per PET
    # print("\"{}\" has {} euphemistic examples and {} non-euphemistic examples. Taking {} and {} for the final dataset.".format(PET, len(euph_ex), len(noneuph_ex), num_euph_sample, num_noneuph_sample))

# pd.concat rejects an empty list, so an empty corpus falls back to an empty frame with the same columns
sampled_corpus = pd.concat(samples) if samples else df.iloc[0:0]

print("Max number of examples taken per label:", max_num_ex)
print("Original size of corpus:", len(df))
print("Size of sampled corpus:", len(sampled_corpus))
euph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==1]
noneuph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==0]
print("Sample corpus contains {} euphemistic examples and {} non-euphemistic examples".format(len(euph_examples), len(noneuph_examples)))

# save the sampled corpus
sampled_corpus = sampled_corpus.reset_index(drop=True)
dest = "Multilingual_Data/spanish_sampled_v0.csv"
print("Saving corpus to \"{}\"".format(dest))
sampled_corpus.to_csv(dest)

Max number of examples taken per label: 40
Original size of corpus: 1000
Size of sampled corpus: 961
Sample corpus contains 564 euphemistic examples and 397 non-euphemistic examples
Saving corpus to "Multilingual_Data/spanish_sampled_v0.csv"


### Yoruba

In [10]:
import pandas as pd

# prep
SEED = 42 # fixed seed so that re-running this cell reproduces the same sample
df = pd.read_csv("Multilingual_Data/yoruba_original.csv") # file path of the corpus
df = df.loc[:, ~df.columns.str.contains('^Unnamed')] # discard columns whose name starts with 'Unnamed'
df = df.loc[df['Rating'].isin([0, 1])] # has some no-label rows
df = df.reset_index(drop=True)
# NOTE(review): unlike the English and Chinese cells, dropna() is not applied here -- confirm that is intended
df = df.astype({'Rating':'int'})

PET_col_name = 'Euphemisms' # name of the column indicating the PET
label_col_name = 'Rating' # name of the column indicating the label
max_num_ex = 40 # max number of examples taken per PET and per label

samples = [] # per-PET samples; concatenated once after the loop instead of growing a frame
for PET in df[PET_col_name].unique():
    is_this_PET = df[PET_col_name] == PET
    # sample from euphemistic examples
    euph_ex = df.loc[is_this_PET & (df[label_col_name]==1)]
    num_euph_sample = min(max_num_ex, len(euph_ex)) # choose up to the max num of euphemistic examples
    samples.append(euph_ex.sample(n=num_euph_sample, random_state=SEED))
    # sample from non-euphemistic examples
    noneuph_ex = df.loc[is_this_PET & (df[label_col_name]==0)]
    num_noneuph_sample = min(max_num_ex, len(noneuph_ex)) # choose up to the max num of non-euphemistic examples
    samples.append(noneuph_ex.sample(n=num_noneuph_sample, random_state=SEED))
    # uncomment to print out stats per PET
    # print("\"{}\" has {} euphemistic examples and {} non-euphemistic examples. Taking {} and {} for the final dataset.".format(PET, len(euph_ex), len(noneuph_ex), num_euph_sample, num_noneuph_sample))

# pd.concat rejects an empty list, so an empty corpus falls back to an empty frame with the same columns
sampled_corpus = pd.concat(samples) if samples else df.iloc[0:0]

print("Max number of examples taken per label:", max_num_ex)
print("Original size of corpus:", len(df))
print("Size of sampled corpus:", len(sampled_corpus))
euph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==1]
noneuph_examples = sampled_corpus.loc[sampled_corpus[label_col_name]==0]
print("Sample corpus contains {} euphemistic examples and {} non-euphemistic examples".format(len(euph_examples), len(noneuph_examples)))

# save the sampled corpus
sampled_corpus = sampled_corpus.reset_index(drop=True)
dest = "Multilingual_Data/yoruba_sampled_v1.1.csv"
print("Saving corpus to \"{}\"".format(dest))
sampled_corpus.to_csv(dest)

Max number of examples taken per label: 40
Original size of corpus: 2144
Size of sampled corpus: 1942
Sample corpus contains 1281 euphemistic examples and 661 non-euphemistic examples
Saving corpus to "Multilingual_Data/yoruba_sampled_v1.1.csv"


In [None]:
# OLD CODE to print out stats
# num_euph_ex = 0 
# num_noneuph_ex = 0

# for PET in df[PET_col_name].unique():
#     euph_ex = df.loc[(df[PET_col_name]==PET) & (df[label_col_name]==1)]
#     num_euph_sample = min(max_num_ex, len(euph_ex)) # choose up to the max num of euphemistic examples
#     num_euph_ex += num_euph_sample
#     noneuph_ex = df.loc[(df[PET_col_name]==PET) & (df[label_col_name]==0)] # choose up to the max num, or num of euphemistic examples
#     num_noneuph_sample = min(max_num_ex, len(noneuph_ex))
#     num_noneuph_ex += num_noneuph_sample
#     print("\"{}\" has {} euphemistic examples and {} non-euphemistic examples. Taking {} and {} for the final dataset.".format(PET, len(euph_ex), len(noneuph_ex), num_euph_sample, num_noneuph_sample))
    
# print(num_euph_ex)
# print(num_noneuph_ex)
# df

## Format and Create Input Data for HuggingFace SequenceClassification
The CSV files produced in this section must have columns named `text` and `label`.

In [29]:
# given a corpus with a 'text' column and 'label', return k folds
from sklearn.model_selection import StratifiedKFold
def get_k_fold_splits(corpus, k=5, random_state=None):
    """Split ``corpus`` into ``k`` stratified train/test folds.

    Parameters
    ----------
    corpus : pandas.DataFrame
        Must contain a 'label' column; it is used for stratification and for
        the per-fold counts of 1s and 0s that are printed.
    k : int, default 5
        Number of folds, passed to ``StratifiedKFold(n_splits=k)``.
    random_state : optional, default None
        Forwarded to ``DataFrame.sample`` when the rows of each train/test
        frame are shuffled. With the default ``None`` the row order differs
        between calls; pass an int to make it repeatable.

    Returns
    -------
    list of (pandas.DataFrame, pandas.DataFrame)
        One ``(train_df, test_df)`` pair per fold, each with its rows shuffled.

    Notes
    -----
    ``StratifiedKFold`` is used without shuffling, so which rows land in which
    fold depends only on the row order of ``corpus``; ``random_state`` affects
    only the order of rows inside each returned frame.
    """
    folds = []
    skf = StratifiedKFold(n_splits=k)
    for train_index, test_index in skf.split(corpus, corpus['label']):
        # select the rows of this fold, then shuffle them
        train_df = corpus.iloc[train_index].sample(frac=1, random_state=random_state)
        test_df = corpus.iloc[test_index].sample(frac=1, random_state=random_state)
        print("TRAIN: {} ({} 1s and {} 0s) | TEST: {} ({} 1s and {} 0s)".format(
            len(train_df), int((train_df['label'] == 1).sum()), int((train_df['label'] == 0).sum()),
            len(test_df), int((test_df['label'] == 1).sum()), int((test_df['label'] == 0).sum())))
        folds.append((train_df, test_df))
    return folds

In [32]:
import pandas as pd
import os
# Source files: the sampled corpus CSV for each language.
# NOTE(review): the sampling cells in this notebook save 'chinese_sampled_v1.csv' and
# 'yoruba_sampled_v1.1.csv', but the paths below read 'chinese_sampled_v0.csv' and
# 'yoruba_sampled_v1.csv' -- confirm which versions are intended.
chinese_src = 'Multilingual_Data/chinese_sampled_v0.csv'
spanish_src = 'Multilingual_Data/spanish_sampled_v0.csv'
yoruba_src = 'Multilingual_Data/yoruba_sampled_v1.csv'
english_src = "Multilingual_Data/english_sampled_v0.csv"
# Destination folders for the k-fold train/test CSVs; k itself is set in the cells that follow.
# NOTE(review): the English folder is spelled '5-fold' while the others use '5_fold'.
chinese_dest = 'Multilingual_Data/chinese-5_fold_v1'
spanish_dest = 'Multilingual_Data/spanish-5_fold_v0'
yoruba_dest = 'Multilingual_Data/yoruba-5_fold_v1.1'
english_dest = 'Multilingual_Data/english-5-fold_v0'
# if necessary, make the destination directories (uncomment the lines that are needed)
# os.mkdir(chinese_dest)
# os.mkdir(spanish_dest)
# os.mkdir(yoruba_dest)
# os.mkdir(english_dest)

In [11]:
# Load the sampled Chinese corpus, keep only the text/label columns under the names
# expected downstream, and shuffle. The shuffle is seeded because fold membership
# depends on the row order handed to get_k_fold_splits.
chinese_corpus = (
    pd.read_csv(chinese_src, index_col=0)[['edited_text', 'Category']]
    .rename(columns={'edited_text':'text', 'Category':'label'})
    .sample(frac=1, random_state=42)
)

# to_csv cannot write into a folder that does not exist, so create it if needed
os.makedirs(chinese_dest, exist_ok=True)
k_folds = get_k_fold_splits(chinese_corpus, k=5)
for i, (train_df, test_df) in enumerate(k_folds): # each fold is a (train df, test df) pair
    train_df.to_csv(chinese_dest + '/hf_train_' + str(i) + '.csv', index=False)
    test_df.to_csv(chinese_dest + '/hf_test_' + str(i) + '.csv', index=False)
print("Tests output to", chinese_dest)

TRAIN: 1205 (876 1s and 329 0s) | TEST: 302 (219 1s and 83 0s)
TRAIN: 1205 (876 1s and 329 0s) | TEST: 302 (219 1s and 83 0s)
TRAIN: 1206 (876 1s and 330 0s) | TEST: 301 (219 1s and 82 0s)
TRAIN: 1206 (876 1s and 330 0s) | TEST: 301 (219 1s and 82 0s)
TRAIN: 1206 (876 1s and 330 0s) | TEST: 301 (219 1s and 82 0s)
Tests output to Multilingual_Data/chinese-5_fold_v1


In [13]:
# Load the sampled Spanish corpus, keep only the text/label columns under the names
# expected downstream, and shuffle. The shuffle is seeded because fold membership
# depends on the row order handed to get_k_fold_splits.
spanish_corpus = (
    pd.read_csv(spanish_src, index_col=0)[['Texto editado', 'Es Eufemistico']]
    .rename(columns={'Texto editado':'text', 'Es Eufemistico':'label'})
    .sample(frac=1, random_state=42)
)

# to_csv cannot write into a folder that does not exist, so create it if needed
os.makedirs(spanish_dest, exist_ok=True)
k_folds = get_k_fold_splits(spanish_corpus, k=5)
for i, (train_df, test_df) in enumerate(k_folds): # each fold is a (train df, test df) pair
    train_df.to_csv(spanish_dest + '/hf_train_' + str(i) + '.csv', index=False)
    test_df.to_csv(spanish_dest + '/hf_test_' + str(i) + '.csv', index=False)
print("Tests output to", spanish_dest)

TRAIN: 768 (451 1s and 317 0s) | TEST: 193 (113 1s and 80 0s)
TRAIN: 769 (451 1s and 318 0s) | TEST: 192 (113 1s and 79 0s)
TRAIN: 769 (451 1s and 318 0s) | TEST: 192 (113 1s and 79 0s)
TRAIN: 769 (451 1s and 318 0s) | TEST: 192 (113 1s and 79 0s)
TRAIN: 769 (452 1s and 317 0s) | TEST: 192 (112 1s and 80 0s)
Tests output to Multilingual_Data/spanish-5_fold_v0


In [20]:
# Load the sampled Yoruba corpus, keep only the text/label columns under the names
# expected downstream ('text' already has the right name), and shuffle. The shuffle is
# seeded because fold membership depends on the row order handed to get_k_fold_splits.
yoruba_corpus = (
    pd.read_csv(yoruba_src, index_col=0)[['text', 'Rating']]
    .rename(columns={'Rating':'label'})
    .sample(frac=1, random_state=42)
)

# to_csv cannot write into a folder that does not exist, so create it if needed
os.makedirs(yoruba_dest, exist_ok=True)
k_folds = get_k_fold_splits(yoruba_corpus, k=5)
for i, (train_df, test_df) in enumerate(k_folds): # each fold is a (train df, test df) pair
    train_df.to_csv(yoruba_dest + '/hf_train_' + str(i) + '.csv', index=False)
    test_df.to_csv(yoruba_dest + '/hf_test_' + str(i) + '.csv', index=False)
print("Tests output to", yoruba_dest)

TRAIN: 1553 (1024 1s and 529 0s) | TEST: 389 (257 1s and 132 0s)
TRAIN: 1553 (1025 1s and 528 0s) | TEST: 389 (256 1s and 133 0s)
TRAIN: 1554 (1025 1s and 529 0s) | TEST: 388 (256 1s and 132 0s)
TRAIN: 1554 (1025 1s and 529 0s) | TEST: 388 (256 1s and 132 0s)
TRAIN: 1554 (1025 1s and 529 0s) | TEST: 388 (256 1s and 132 0s)
Tests output to Multilingual_Data/yoruba-5_fold_v1.1


In [33]:
# Load the sampled English corpus, keep only the text/label columns under the names
# expected downstream, and shuffle. The shuffle is seeded because fold membership
# depends on the row order handed to get_k_fold_splits.
english_corpus = (
    pd.read_csv(english_src, index_col=0)[['edited_text', 'is_euph']]
    .rename(columns={'edited_text':'text', 'is_euph':'label'})
    .sample(frac=1, random_state=42)
)

# to_csv cannot write into a folder that does not exist, so create it if needed
os.makedirs(english_dest, exist_ok=True)
k_folds = get_k_fold_splits(english_corpus, k=5)
for i, (train_df, test_df) in enumerate(k_folds): # each fold is a (train df, test df) pair
    train_df.to_csv(english_dest + '/hf_train_' + str(i) + '.csv', index=False)
    test_df.to_csv(english_dest + '/hf_test_' + str(i) + '.csv', index=False)
print("Tests output to", english_dest)

TRAIN: 1561 (1106 1s and 455 0s) | TEST: 391 (277 1s and 114 0s)
TRAIN: 1561 (1106 1s and 455 0s) | TEST: 391 (277 1s and 114 0s)
TRAIN: 1562 (1106 1s and 456 0s) | TEST: 390 (277 1s and 113 0s)
TRAIN: 1562 (1107 1s and 455 0s) | TEST: 390 (276 1s and 114 0s)
TRAIN: 1562 (1107 1s and 455 0s) | TEST: 390 (276 1s and 114 0s)
Tests output to Multilingual_Data/english-5-fold_v0


In [None]:
# add <> and re-run classifier

## Process Metrics
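Extracts the best-epoch (highest-F1) metrics for each fold from the classifier's results CSV (`results_xlm_large.csv` in the cell below) and averages them across folds.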

In [26]:
import pandas as pd
FOLDER = 'Multilingual_Data/chinese-5_fold_v0.1'
# NOTE(review): the slicing below assumes the results file has one row per epoch, indexed
# 0..k*num_epochs-1 in test order, with the F1/P/R/tn/fp/fn/tp values in its first seven
# columns -- confirm against the script that writes it.
df = pd.read_csv(FOLDER + '/results_xlm_large.csv', index_col=0)
results = pd.DataFrame(columns=['F1', 'P', 'R', 'tn', 'fp', 'fn', 'tp'])
k = 5 # number of tests
num_epochs = 10 # number of rows (epochs) logged per test

# for each test, select the row (epoch) with the best F1 and record its base stats
for x in range(0, k):
    first_epoch = num_epochs * x
    last_epoch = first_epoch + num_epochs - 1 # derived from num_epochs rather than hard-coded
    test = df.loc[first_epoch:last_epoch] # .loc slices by label and includes both ends
    max_f1 = test.loc[test['f1'].idxmax()] # using F1, select the best row (epoch) from this test
    stats = max_f1.iloc[:len(results.columns)].tolist() # take the base stats by position
    results.loc[len(results.index)] = stats
results.loc['AVG'] = results.mean() # compute an average row
results

Unnamed: 0,F1,P,R,tn,fp,fn,tp
0,0.928018,0.950893,0.972603,72.0,11.0,6.0,213.0
1,0.953786,0.968468,0.981735,76.0,7.0,4.0,215.0
2,0.909526,0.938053,0.968037,68.0,14.0,7.0,212.0
3,0.914186,0.942222,0.968037,69.0,13.0,7.0,212.0
4,0.956738,0.960352,0.995434,73.0,9.0,1.0,218.0
AVG,0.932451,0.951998,0.977169,71.6,10.8,5.0,214.0


In [20]:
# Experimental
import re
def hfify(df):
    """Return a two-column ('text', 'label') frame with bracket-free, whitespace-normalised text.

    Every '<' and '>' in the 'text' column is replaced by a space; runs of
    whitespace are then collapsed to a single space and leading/trailing
    whitespace is removed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a string 'text' column and a 'label' column. Any other
        columns are ignored.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns ['text', 'label'] and the same index as
        ``df``. The input frame is left unmodified (no helper column is added
        to it).
    """
    cleaned_text = (
        df['text']
        .str.replace(r"[<>]", " ", regex=True) # we're removing brackets...
        .str.split()                           # ...then splitting on any whitespace
        .str.join(" ")                         # ...and re-joining with single spaces
    )
    # selecting the two columns first gives a fresh frame, so the caller's df is untouched
    return df[['text', 'label']].assign(text=cleaned_text)

In [23]:
import pandas as pd

import os

FOLDER = 'Multilingual_Data/chinese-5_fold_v0/' # folds to read
DEST_FOLDER = 'Multilingual_Data/chinese-5_fold_v0.1/' # where the hfify-ed folds are written
os.makedirs(DEST_FOLDER, exist_ok=True) # to_csv cannot write into a folder that does not exist
for n in range(0, 5):
    for split in ('train', 'test'):
        file_name = 'hf_' + split + '_' + str(n) + '.csv'
        df = pd.read_csv(FOLDER + file_name)
        df = hfify(df)
        # index=False matches how the other fold files in this notebook are written,
        # so the output holds only the 'text' and 'label' columns
        df.to_csv(DEST_FOLDER + file_name, index=False)

In [19]:
t = "毛坯房 厨房贴砖 做防水 <卫生间>贴砖 文化砖 刷腻子 家里所有的家具都是请木工制作的"
# Apply the two substitutions in order. The first pattern matches only the literal
# two-character sequence "<>", which does not occur in this sample; the second
# replaces each individual '<' or '>' with a space.
for pattern in (r"<>", r"[<>]"):
    t = re.sub(pattern, " ", t) # we're removing brackets...
t

'毛坯房 厨房贴砖 做防水  卫生间 贴砖 文化砖 刷腻子 家里所有的家具都是请木工制作的'