In [1]:
import pandas as pd
import ast
# load tsv files
df_train = pd.read_csv('../data/CORE/multilabel_train.tsv', sep='\t')
df_train["label"] = df_train["label"].apply(ast.literal_eval)
df_val = pd.read_csv('../data/CORE/multilabel_dev.tsv', sep='\t')
df_val["label"] = df_val["label"].apply(ast.literal_eval)
df_test = pd.read_csv('../data/CORE/multilabel_test.tsv', sep='\t')
df_test["label"] = df_test["label"].apply(ast.literal_eval)

In [2]:
df_train

Unnamed: 0,register,document_id,text,full_label,label
0,OP AV,348059,"Talk To Your Parents Sometimes, people just do...","['OP', 'AV']",[OP]
1,NA OP SR OB,3086555,The Top TEN 'Whiniest Sets of Fans' in English...,"['NA', 'OP', 'SR', 'OB']","[NA, OP]"
2,NA NE,355982,"Ferry consultation needs deeper questions, say...","['NA', 'NE']",[NA]
3,ID DF,437920,I'v been recording and mixing music for about ...,"['ID', 'DF']",[ID]
4,NA SR,389025,The 25-year-old did have chances at Anfield. B...,"['NA', 'SR']",[NA]
...,...,...,...,...,...
33900,NA NE,3254903,DFL appears poised to regain Minnesota Senate ...,"['NA', 'NE']",[NA]
33901,ID DF,3271108,"If this is your first visit, be sure to check ...","['ID', 'DF']",[ID]
33902,NA NE,94422,British Gas has increased energy prices by an ...,"['NA', 'NE']",[NA]
33903,NA NE,263285,Mitt Romney says U.S. Navy is smallest since 1...,"['NA', 'NE']",[NA]


In [3]:
df_train["label"].value_counts()

label
[NA]        12369
[IN]         6522
[OP]         5506
[ID]         2293
[NA, IN]     1250
[NA, OP]     1135
[HI]         1045
[IP]          932
[IN, OP]      502
[LY]          448
[SP]          409
[OTHER]       407
[IN, IP]      296
[IN, HI]      247
[OP, HI]      110
[OP, IP]      105
[NA, SP]       60
[OP, ID]       59
[NA, IP]       58
[IN, ID]       29
[IN, SP]       26
[NA, ID]       19
[OP, SP]       14
[NA, HI]       11
[NA, LY]       11
[OP, LY]       10
[ID, HI]        9
[HI, IP]        8
[IN, LY]        6
[ID, SP]        3
[ID, IP]        2
[HI, LY]        2
[IP, SP]        1
[HI, SP]        1
Name: count, dtype: int64

In [4]:
# only keep rows where label is exactly one label in the list and replace that entry with that entry without the list
def clean_labels(df):
    df = df[df["label"].apply(lambda x: len(x) == 1)]
    df["label"] = df["label"].apply(lambda x: x[0])
    df = df[df["label"] != "OTHER"]
    return df
df_val = clean_labels(df_val)
df_test = clean_labels(df_test)
df_train = clean_labels(df_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = df["label"].apply(lambda x: x[0])


In [30]:
# count occurences of each label and do stratified sampling
df_train

Unnamed: 0,register,document_id,text,full_label,label
0,OP AV,348059,"Talk To Your Parents Sometimes, people just do...","['OP', 'AV']",OP
2,NA NE,355982,"Ferry consultation needs deeper questions, say...","['NA', 'NE']",
3,ID DF,437920,I'v been recording and mixing music for about ...,"['ID', 'DF']",ID
4,NA SR,389025,The 25-year-old did have chances at Anfield. B...,"['NA', 'SR']",
6,NA NE,3337703,The value of NYC housing construction starts m...,"['NA', 'NE']",
...,...,...,...,...,...
33899,IN,522619,A number of websites purport to have tickets f...,['IN'],IN
33900,NA NE,3254903,DFL appears poised to regain Minnesota Senate ...,"['NA', 'NE']",
33901,ID DF,3271108,"If this is your first visit, be sure to check ...","['ID', 'DF']",ID
33902,NA NE,94422,British Gas has increased energy prices by an ...,"['NA', 'NE']",


In [5]:
# rename label to single genre
df_train = df_train.rename(columns={"label": "genre"})
df_val = df_val.rename(columns={"label": "genre"})
df_test = df_test.rename(columns={"label": "genre"})


In [8]:
# change "NA" in gener to "NARRATIVE"
df_train["genre"] = df_train["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
df_val["genre"] = df_val["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)
df_test["genre"] = df_test["genre"].apply(lambda x: "NARRATIVE" if x == "NA" else x)

In [9]:
df_val["genre"].value_counts()

genre
NARRATIVE    1766
IN            934
OP            784
ID            327
HI            148
IP            134
LY             63
SP             59
Name: count, dtype: int64

In [10]:
# save
df_train.to_csv('../data/CORE/multilabel_train_cleaned.tsv', sep='\t', index=False)
df_val.to_csv('../data/CORE/multilabel_dev_cleaned.tsv', sep='\t', index=False)
df_test.to_csv('../data/CORE/multilabel_test_cleaned.tsv', sep='\t', index=False)

In [40]:
# for each genre, print one text example
for genre in df_train["genre"].unique():
    print(genre)
    print(df_train[df_train["genre"] == genre].iloc[0]["text"])
    print()

OP
Talk To Your Parents Sometimes, people just don't feel well. But if you don't feel well more than sometimes, it may be helpful to talk to someone about it. Why might it be helpful to talk to a family member? A lot of people just don't want to talk about difficult experiences. Everyone has their own reason to keep quiet. Some people may have a lot of inside fears: they don't want to admit that there is something wrong, they blame themselves, they don't think anyone else will understand. Other people may have a lot of outside fears: they don't want other people to find out, they don't want to lose friends, they don't want to disappoint anyone, they don't want other people to take care of them. The problem is that it can be really tough to deal with difficult feeling on your own. But it's also hard to find support if you don't ask for support. Often, the quickest and most direct way to find support is to ask for it. Part of being strong and in control is knowing when to ask for help. S

In [36]:
df_train

Unnamed: 0,register,document_id,text,full_label,label
0,OP AV,348059,"Talk To Your Parents Sometimes, people just do...","['OP', 'AV']",[OP]
1,NA OP SR OB,3086555,The Top TEN 'Whiniest Sets of Fans' in English...,"['NA', 'OP', 'SR', 'OB']","[NA, OP]"
2,NA NE,355982,"Ferry consultation needs deeper questions, say...","['NA', 'NE']",[NA]
3,ID DF,437920,I'v been recording and mixing music for about ...,"['ID', 'DF']",[ID]
4,NA SR,389025,The 25-year-old did have chances at Anfield. B...,"['NA', 'SR']",[NA]
...,...,...,...,...,...
33900,NA NE,3254903,DFL appears poised to regain Minnesota Senate ...,"['NA', 'NE']",[NA]
33901,ID DF,3271108,"If this is your first visit, be sure to check ...","['ID', 'DF']",[ID]
33902,NA NE,94422,British Gas has increased energy prices by an ...,"['NA', 'NE']",[NA]
33903,NA NE,263285,Mitt Romney says U.S. Navy is smallest since 1...,"['NA', 'NE']",[NA]
