In [25]:
import pandas as pd
import ast
# load tsv files
df_train = pd.read_csv('../data/CORE/multilabel_train.tsv', sep='\t')
df_train["label"] = df_train["label"].apply(ast.literal_eval)
df_val = pd.read_csv('../data/CORE/multilabel_dev.tsv', sep='\t')
df_val["label"] = df_val["label"].apply(ast.literal_eval)
df_test = pd.read_csv('../data/CORE/multilabel_test.tsv', sep='\t')
df_test["label"] = df_test["label"].apply(ast.literal_eval)

In [26]:
# problem: some texts have multiple labels
print(df_train["label"].value_counts())
# SOLUTION: only keep rows where label is exactly one label in the list and replace that entry with that entry without the list
def clean_labels(df):
    df = df[df["label"].apply(lambda x: len(x) == 1)]
    df["label"] = df["label"].apply(lambda x: x[0])
    df = df[df["label"] != "OTHER"]
    return df
df_val = clean_labels(df_val)
df_test = clean_labels(df_test)
df_train = clean_labels(df_train)
# rename label to single genre
df_train = df_train.rename(columns={"label": "genre"})
df_val = df_val.rename(columns={"label": "genre"})
df_test = df_test.rename(columns={"label": "genre"})

label
[NA]        12369
[IN]         6522
[OP]         5506
[ID]         2293
[NA, IN]     1250
[NA, OP]     1135
[HI]         1045
[IP]          932
[IN, OP]      502
[LY]          448
[SP]          409
[OTHER]       407
[IN, IP]      296
[IN, HI]      247
[OP, HI]      110
[OP, IP]      105
[NA, SP]       60
[OP, ID]       59
[NA, IP]       58
[IN, ID]       29
[IN, SP]       26
[NA, ID]       19
[OP, SP]       14
[NA, HI]       11
[NA, LY]       11
[OP, LY]       10
[ID, HI]        9
[HI, IP]        8
[IN, LY]        6
[ID, SP]        3
[ID, IP]        2
[HI, LY]        2
[IP, SP]        1
[HI, SP]        1
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].apply(lambda x: x[0])


In [27]:
# Problem: too few samples for some genres, --> calculate the average text length per genre (BERT has a max token size anyway)
df_train["text_length"] = df_train["text"].apply(lambda x: len(x.split()))
print(df_train["genre"].value_counts())
print(df_train.groupby("genre")["text_length"].mean())
# SOLUTION: enlargen dataset by splitting up texts with more than 300 words

import re
def split_long_texts(df, word_limit=250):
    new_rows = []
    
    # This regex matches:
    #   \s*    optional leading whitespace (including newlines, tabs, etc.)
    #   \S+    one or more non-whitespace characters (a "word")
    #   \s*    trailing whitespace, if any
    #
    # Each match thus gives us exactly one word and any whitespace that follows it.
    # If there's leading whitespace before the first word, it will be included in that first token.
    token_pattern = re.compile(r'\s*\S+\s*')
    
    for _, row in df.iterrows():
        text = row['text']
        
        # Find all tokens
        tokens = token_pattern.findall(text)
        
        # If the number of tokens (words) does not exceed the limit, keep as is
        if len(tokens) <= word_limit:
            new_rows.append({
                'text': text,
                'genre': row['genre'],
                'register': row['register'],
                'document_id': row['document_id']
            })
        else:
            # Split into chunks of word_limit tokens
            for i in range(0, len(tokens), word_limit):
                chunk_tokens = tokens[i:i+word_limit]
                # Just concatenate the tokens as they are: this preserves exact original whitespace
                chunk_text = ''.join(chunk_tokens)
                new_rows.append({
                    'text': chunk_text,
                    'genre': row['genre'],
                    'register': row['register'],
                    'document_id': row['document_id']
                })
    
    return pd.DataFrame(new_rows)
df_train = split_long_texts(df_train)
df_val = split_long_texts(df_val)
df_test = split_long_texts(df_test)
df_train["text_length"] = df_train["text"].apply(lambda x: len(x.split()))
print(df_train.groupby("genre")["text_length"].mean())


genre
NA    12369
IN     6522
OP     5506
ID     2293
HI     1045
IP      932
LY      448
SP      409
Name: count, dtype: int64
genre
HI     993.147368
ID    1092.062364
IN    1358.373198
IP    1053.892704
LY     504.482143
NA    1078.671518
OP    1580.482564
SP    2073.672372
Name: text_length, dtype: float64
genre
HI    222.616688
ID    224.482205
IN    229.349436
IP    223.844120
LY    202.334825
NA    224.255618
OP    231.760333
SP    235.723180
Name: text_length, dtype: float64


In [28]:
# do stratified sampling with replacement to reach 30k, 5k, 5k size
def balanced_sampling(df, dataset_size):
    # Get the unique genres
    genres = df['genre'].unique()
    num_genres = len(genres)
    
    # Compute how many samples per genre
    base_count = dataset_size // num_genres
    remainder = dataset_size % num_genres

    # Create a dictionary to store the number of samples per genre
    samples_per_genre = {genre: base_count for genre in genres}
    
    # Distribute the remainder by adding one extra sample to some genres
    # until we have assigned all leftover samples
    for i, genre in enumerate(genres):
        if i < remainder:
            samples_per_genre[genre] += 1
    
    sampled_df_list = []
    
    # Sample for each genre
    for genre in genres:
        subset = df[df['genre'] == genre]
        required_samples = samples_per_genre[genre]
        
        # If we don't have enough rows in this genre, sample with replacement
        if len(subset) < required_samples:
            sampled_subset = subset.sample(n=required_samples, replace=True, random_state=42)
        else:
            # If we have enough rows, sample without replacement
            sampled_subset = subset.sample(n=required_samples, replace=False, random_state=42)

        sampled_df_list.append(sampled_subset)
    
    # Concatenate all sampled subsets
    balanced_df = pd.concat(sampled_df_list, ignore_index=True)
    return balanced_df
df_train = balanced_sampling(df_train, 30000)
df_val = balanced_sampling(df_val, 5000)
df_test = balanced_sampling(df_test, 5000)
print(df_train["genre"].value_counts())

genre
OP    3750
NA    3750
ID    3750
IN    3750
LY    3750
IP    3750
HI    3750
SP    3750
Name: count, dtype: int64


In [14]:
# sample minimum number of texts per genre
min_occurence_genre = df_train["genre"].value_counts().min()
df_split = df_train.groupby("genre").apply(lambda x: x.sample(min_occurence_genre)).reset_index(drop=True)
df_split

  df_split = df_split.groupby("genre").apply(lambda x: x.sample(min_occurence_genre)).reset_index(drop=True)


Unnamed: 0,text,genre,register,document_id
0,have 4 or 5 different machine brands and some ...,HI,HI HT,457354
1,marketing methods is the finest way to create ...,HI,HI HT,3314812
2,"When you bring up the issue, give your friend ...",HI,HI HT,324833
3,started working on it -- SLOWLY. The Content I...,HI,HI HT,3187203
4,An ability for their company to respond to cha...,HI,HI HT,403629
...,...,...,...,...
8931,and go away most games. That said Christism sh...,SP,SP IT,447498
8932,sort of poetic. AVC: It's a little nihilistic....,SP,SP IT,173347
8933,for yourself. Once you?re preoccupied with a r...,SP,SP IT,88465
8934,Alien invasion. How this man could be such a h...,SP,SP IT,491481


In [29]:
# change "NA" in gener to "NARRATIVE"
df_train["genre"] = df_train["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
df_val["genre"] = df_val["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
df_test["genre"] = df_test["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
# save
df_train.to_csv('../data/CORE/multiclass_train_stratified.tsv', sep='\t', index=False)
df_val.to_csv('../data/CORE/multiclass_dev_stratified.tsv', sep='\t', index=False)
df_test.to_csv('../data/CORE/multiclass_test_stratified.tsv', sep='\t', index=False)

In [15]:
df_val

Unnamed: 0,register,document_id,text,full_label,genre
0,NA SR,601771,Already an accomplished adventure and 'ultrama...,"['NA', 'SR']",
1,OP OB,3160437,Being a Successful Band Has Nothing to Do With...,"['OP', 'OB']",OP
3,NA SR,3060117,QPR vs Southampton - Match preview and team ne...,"['NA', 'SR']",
4,NA NE,119728,Search age: Search in: US says it's ready to t...,"['NA', 'NE']",
6,NA NE,2697,Toronto cycling activists were gnashing their ...,"['NA', 'NE']",
...,...,...,...,...,...
4838,,783119,"As a prisoner of war in Vietnam, Eagle Scout G...",['NA'],
4839,NA NE,444339,Lollapalooza heading to the Middle East in 201...,"['NA', 'NE']",
4840,OP AV,3287928,How to: Improve Brand Engagement on Twitter Ea...,"['OP', 'AV']",OP
4841,OP OB,3345658,The Last Post. A shameful reflection on Canada...,"['OP', 'OB']",OP


In [40]:
# for each genre, print one text example
for genre in df_train["genre"].unique():
    print(genre)
    print(df_train[df_train["genre"] == genre].iloc[0]["text"])
    print()

OP
Talk To Your Parents Sometimes, people just don't feel well. But if you don't feel well more than sometimes, it may be helpful to talk to someone about it. Why might it be helpful to talk to a family member? A lot of people just don't want to talk about difficult experiences. Everyone has their own reason to keep quiet. Some people may have a lot of inside fears: they don't want to admit that there is something wrong, they blame themselves, they don't think anyone else will understand. Other people may have a lot of outside fears: they don't want other people to find out, they don't want to lose friends, they don't want to disappoint anyone, they don't want other people to take care of them. The problem is that it can be really tough to deal with difficult feeling on your own. But it's also hard to find support if you don't ask for support. Often, the quickest and most direct way to find support is to ask for it. Part of being strong and in control is knowing when to ask for help. S

In [17]:
df_test

Unnamed: 0,register,document_id,text,full_label,genre
0,IN,54888,Australian prisoners of war: Second World War ...,['IN'],IN
1,IN OI,540314,DEFINITION About a quarter of the world drives...,"['IN', 'OI']",IN
2,OP OB,3281556,How should retail investors own gold? Mostly p...,"['OP', 'OB']",OP
3,NA NE,723677,Famed railway leader Hunter Harrison said he i...,"['NA', 'NE']",
4,NA NE,85712,Louis Walsh also claimed that Gary Barlow and ...,"['NA', 'NE']",
...,...,...,...,...,...
9681,,1789364,Story: Evolution of plants and animals About 8...,['NA'],
9682,NA NE,600500,The nationwide smoking ban has triggered the b...,"['NA', 'NE']",
9683,OP OB,3028345,"Life in small business and local politics ""Opp...","['OP', 'OB']",OP
9684,NA NE,586681,We'd like to let you know that this site uses ...,"['NA', 'NE']",
