In [30]:
import pandas as pd
import ast
# load tsv files
df_train = pd.read_csv('../data/CORE/multilabel_train.tsv', sep='\t')
df_train["label"] = df_train["label"].apply(ast.literal_eval)
df_val = pd.read_csv('../data/CORE/multilabel_dev.tsv', sep='\t')
df_val["label"] = df_val["label"].apply(ast.literal_eval)
df_test = pd.read_csv('../data/CORE/multilabel_test.tsv', sep='\t')
df_test["label"] = df_test["label"].apply(ast.literal_eval)

In [31]:
# problem: some texts have multiple labels
print(df_train["label"].value_counts())
# SOLUTION: only keep rows where label is exactly one label in the list and replace that entry with that entry without the list
def clean_labels(df):
    df = df[df["label"].apply(lambda x: len(x) == 1)]
    df["label"] = df["label"].apply(lambda x: x[0])
    df = df[df["label"] != "OTHER"]
    return df
df_val = clean_labels(df_val)
df_test = clean_labels(df_test)
df_train = clean_labels(df_train)
# rename label to single genre
df_train = df_train.rename(columns={"label": "genre"})
df_val = df_val.rename(columns={"label": "genre"})
df_test = df_test.rename(columns={"label": "genre"})

label
[NA]        12369
[IN]         6522
[OP]         5506
[ID]         2293
[NA, IN]     1250
[NA, OP]     1135
[HI]         1045
[IP]          932
[IN, OP]      502
[LY]          448
[SP]          409
[OTHER]       407
[IN, IP]      296
[IN, HI]      247
[OP, HI]      110
[OP, IP]      105
[NA, SP]       60
[OP, ID]       59
[NA, IP]       58
[IN, ID]       29
[IN, SP]       26
[NA, ID]       19
[OP, SP]       14
[NA, HI]       11
[NA, LY]       11
[OP, LY]       10
[ID, HI]        9
[HI, IP]        8
[IN, LY]        6
[ID, SP]        3
[ID, IP]        2
[HI, LY]        2
[IP, SP]        1
[HI, SP]        1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].apply(lambda x: x[0])


In [32]:
# Problem: too few samples for some genres, --> calculate the average text length per genre (BERT has a max token size anyway)
df_train["text_length"] = df_train["text"].apply(lambda x: len(x.split()))
print(df_train["genre"].value_counts())
print(df_train.groupby("genre")["text_length"].mean())
# SOLUTION: enlargen dataset by splitting up texts with more than 300 words

import re
def split_long_texts(df, word_limit=250):
    new_rows = []
    
    # This regex matches:
    #   \s*    optional leading whitespace (including newlines, tabs, etc.)
    #   \S+    one or more non-whitespace characters (a "word")
    #   \s*    trailing whitespace, if any
    #
    # Each match thus gives us exactly one word and any whitespace that follows it.
    # If there's leading whitespace before the first word, it will be included in that first token.
    token_pattern = re.compile(r'\s*\S+\s*')
    
    for _, row in df.iterrows():
        text = row['text']
        
        # Find all tokens
        tokens = token_pattern.findall(text)
        
        # If the number of tokens (words) does not exceed the limit, keep as is
        if len(tokens) <= word_limit:
            new_rows.append({
                'text': text,
                'genre': row['genre'],
                'register': row['register'],
                'document_id': row['document_id']
            })
        else:
            # Split into chunks of word_limit tokens
            for i in range(0, len(tokens), word_limit):
                chunk_tokens = tokens[i:i+word_limit]
                # Just concatenate the tokens as they are: this preserves exact original whitespace
                chunk_text = ''.join(chunk_tokens)
                new_rows.append({
                    'text': chunk_text,
                    'genre': row['genre'],
                    'register': row['register'],
                    'document_id': row['document_id']
                })
    
    return pd.DataFrame(new_rows)
df_train = split_long_texts(df_train)
df_val = split_long_texts(df_val)
df_test = split_long_texts(df_test)
df_train["text_length"] = df_train["text"].apply(lambda x: len(x.split()))
print(df_train.groupby("genre")["text_length"].mean())


genre
NA    12369
IN     6522
OP     5506
ID     2293
HI     1045
IP      932
LY      448
SP      409
Name: count, dtype: int64
genre
HI     993.147368
ID    1092.062364
IN    1358.373198
IP    1053.892704
LY     504.482143
NA    1078.671518
OP    1580.482564
SP    2073.672372
Name: text_length, dtype: float64
genre
HI    222.616688
ID    224.482205
IN    229.349436
IP    223.844120
LY    202.334825
NA    224.255618
OP    231.760333
SP    235.723180
Name: text_length, dtype: float64


In [33]:
# do stratified sampling with replacement to reach 30k, 5k, 5k size
def balanced_sampling(df, dataset_size):
    # Get the unique genres
    genres = df['genre'].unique()
    num_genres = len(genres)
    
    # Compute how many samples per genre
    base_count = dataset_size // num_genres
    remainder = dataset_size % num_genres

    # Create a dictionary to store the number of samples per genre
    samples_per_genre = {genre: base_count for genre in genres}
    
    # Distribute the remainder by adding one extra sample to some genres
    # until we have assigned all leftover samples
    for i, genre in enumerate(genres):
        if i < remainder:
            samples_per_genre[genre] += 1
    
    sampled_df_list = []
    
    # Sample for each genre
    for genre in genres:
        subset = df[df['genre'] == genre]
        required_samples = samples_per_genre[genre]
        
        # If we don't have enough rows in this genre, sample with replacement
        if len(subset) < required_samples:
            sampled_subset = subset.sample(n=required_samples, replace=True, random_state=42)
        else:
            # If we have enough rows, sample without replacement
            sampled_subset = subset.sample(n=required_samples, replace=False, random_state=42)

        sampled_df_list.append(sampled_subset)
    
    # Concatenate all sampled subsets
    balanced_df = pd.concat(sampled_df_list, ignore_index=True)
    return balanced_df
df_train = balanced_sampling(df_train, 30000)
df_val = balanced_sampling(df_val, 5000)
df_test = balanced_sampling(df_test, 5000)
print(df_train["genre"].value_counts())

genre
OP    3750
NA    3750
ID    3750
IN    3750
LY    3750
IP    3750
HI    3750
SP    3750
Name: count, dtype: int64


In [34]:
# sample minimum number of texts per genre
min_occurence_genre = df_train["genre"].value_counts().min()
df_split = df_train.groupby("genre").apply(lambda x: x.sample(min_occurence_genre)).reset_index(drop=True)
df_split

  df_split = df_train.groupby("genre").apply(lambda x: x.sample(min_occurence_genre)).reset_index(drop=True)


Unnamed: 0,text,genre,register,document_id,text_length
0,know her as a person first and foremost and ma...,HI,HI HT,483101,250
1,Type Tool once again. Increase the text size u...,HI,HI HT,3242280,144
2,incremental reforms have consistently failed f...,HI,HI FH,570618,250
3,How can you get rid of clothes moths? The comm...,HI,HI HT,31742,250
4,"mins. If it's still very sticky, keep adding f...",HI,HI RE,1480753,250
...,...,...,...,...,...
29995,toes in the hopes of getting that Presidents g...,SP,SP IT,3298386,250
29996,is not recognised as the capital by most of th...,SP,SP TA,484434,250
29997,quite a few watery references in the names of ...,SP,SP IT,275515,250
29998,"explosion, that's a part of it too. Hendricks:...",SP,SP IT,56437,116


In [35]:
# change "NA" in gener to "NARRATIVE"
df_train["genre"] = df_train["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
df_val["genre"] = df_val["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
df_test["genre"] = df_test["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
# shuffle
df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)
df_val = df_val.sample(frac=1, random_state=42).reset_index(drop=True)
df_test = df_test.sample(frac=1, random_state=42).reset_index(drop=True)
# save
df_train.to_csv('../data/CORE/multiclass_train_stratified.tsv', sep='\t', index=False)
df_val.to_csv('../data/CORE/multiclass_dev_stratified.tsv', sep='\t', index=False)
df_test.to_csv('../data/CORE/multiclass_test_stratified.tsv', sep='\t', index=False)

In [36]:
df_val

Unnamed: 0,text,genre,register,document_id
0,"the ""Mother of Exiles""—a symbol of hope to imm...",IN,IN EN,9993212
1,"the price, which I cannot. Re: In the end You ...",ID,ID DF,50307
2,fund . Why the phone and name are most importa...,ID,ID RR,1748366
3,executive director. Dr Wakefield remains adama...,OP,OP OB,511814
4,John McCain. He would consider today's McCain ...,OP,OP OB,773105
...,...,...,...,...
4995,want to say in an interview. and in a way this...,SP,SP IT,1441747
4996,"code, simply enter the code at the online chec...",NARRATIVE,NA NE,306010
4997,Best Answer - Chosen by Asker ****SPOILER ALER...,ID,ID QA,113111
4998,all the publicity isn't too exhausting for Mol...,IP,IP DS,1792064


In [37]:
# for each genre, print one text example
for genre in df_train["genre"].unique():
    print(genre)
    print(df_train[df_train["genre"] == genre].iloc[0]["text"])
    print()

OP
well from the Nazi's, on how to steal other peoples land. Palestine belongs to the world not just one group. It was a peacefull place pre WWII, I've talked with Jews and Muslims that lived on the same streets before the War. The Arab people at the time said devide Germany and make a Jewish homeland there. This will never end and Canada should mind it's own business. PS: We have own people in camps called reserves,,,,, Fiat lux We've lost many good friends We've lost many good friends in the Holocaust, some of them my schoolmates, including one of my best friends, Peter Adam, whose death has practically changed my way of thinking and life. My granddaughter is married to a Jew, with our 2 half Jewish great grandchildren, so anybody who'd call me an antisemite is nuts. Some 16 million were killed in Hitler's death camps, 6 million of them Jews. But this crime wave didn't give the right to Poles, or Ukrainians, who were also great losers, to claim superior, God given powers over others,

In [38]:
df_test

Unnamed: 0,text,genre,register,document_id
0,studio to help create a live performance atmos...,NARRATIVE,NA MA,3245655
1,didn't change my mind about anything. He has a...,IP,IP DS,167786
2,"the law of equal freedom? These restraints, wh...",IP,IP PA,652014
3,thing. It's a great moment of revelation for H...,OP,OP RV,1750251
4,reason given for burying UBL at sea was to ens...,OP,OP OB,12331
...,...,...,...,...
4995,"exhausted, Keep it going cause I'm living my d...",LY,LY SL,34220
4996,"As investors, we would all like to beat the ma...",IN,IN,326314
4997,mobs at bay. They are already trained for it. ...,IP,IP,347315
4998,this chamber. Progress and peace and justice a...,SP,SP,785913
