In [1]:
import pandas as pd
import os 
from collections import defaultdict
from sklearn.utils import shuffle

In [2]:
!pwd


/Users/yiyichen/PycharmProjects/MRL/preprocessor


In [3]:
os.chdir("/Users/yiyichen/PycharmProjects/MRL")

In [4]:
# https://github.com/valeriobasile/hurtlex
lexica = pd.read_csv("datasets/sentiment_analysis/sv/ABSAbank/hurtlex_SV.tsv", sep="\t")

In [5]:
lemmas = lexica.lemma.tolist()

Recommended split(s)	Paragraph level: 10-fold cross-validation. If cross-validation is impossible, use the 00 fold as the standard split. The split is random at the document level (the documents are randomly shuffled), but consecutive at the paragraph level. The reason is that if paragraphs from the same document end up in both train and test, this will make the task easier and the estimates of how well the model generalizes to new data will be less reliable (the border between test and dev or dev and train, however, may split the document in two halves. The effect of that is presumably negligible);


In [8]:
data_dir = "datasets/sentiment_analysis/sv/ABSAbank/split10_consecutive_average"

In [9]:
def read_tsv(filepath, text_dict):
    with open(filepath) as f:
        for line in f.readlines():
            if not line.startswith("doc	par	text	label"):
                if len(line.split("\t"))==4:
                    doc, par, text, label = line.replace("\n", "").split("\t")
                    text_dict["doc"].append(doc)
                    text_dict["par"].append(par)
                    text_dict["text"].append(text)
                    text_dict["label"].append(label)

In [10]:
train_dict = defaultdict(list)
val_dict = defaultdict(list)
test_dict = defaultdict(list)

for file in os.listdir(data_dir):
    filepath = os.path.join(data_dir, file)
    if "test00" in file:
        read_tsv(filepath, test_dict)
    if "train00" in file:
        read_tsv(filepath, train_dict)
    if "dev00" in file:
        read_tsv(filepath, val_dict)

In [11]:
train_df= pd.DataFrame.from_dict(train_dict)
dev_df= pd.DataFrame.from_dict(val_dict)
test_df= pd.DataFrame.from_dict(test_dict)

In [12]:
len(train_df)

3898

# Get dataset for hate speech detection

In [55]:
# 1. very negative, 2, 3, 4, 5 (very positive)

In [13]:
sv_dir = "/Users/yiyichen/PycharmProjects/MRL/datasets/hate_speech_detection/swedish/ABSAbank/preprocessed"

In [14]:
def get_hs(df, dataset):
    print(len(df))
    df = df.dropna(subset=["label"])
    df.drop_duplicates(subset=["text"], inplace=True)

    print(len(df))
    df['label'] = df['label'].astype(float)

    df.loc[(df['label']==1) & (df["text"].str.contains("|".join(lemmas))), "hsd"]= 1 # offensive and hatespeech
    df.loc[df['label']>=3, "hsd"]=0 # normal
    print(df.hsd.value_counts())
    df.to_csv(os.path.join(sv_dir, f"{dataset}.csv"), index=False)
    return df

In [15]:
train = get_hs(train_df, "train")

3898
3884
0.0    2679
1.0     319
Name: hsd, dtype: int64


In [16]:
dev = get_hs(dev_df, "val")

487
486
0.0    282
1.0     35
Name: hsd, dtype: int64


In [17]:
test = get_hs(test_df, "test")

487
487
0.0    281
1.0     62
Name: hsd, dtype: int64


In [18]:
save_dir = "/Users/yiyichen/PycharmProjects/MRL/datasets/hate_speech_detection/swedish/preprocessed"

In [21]:
def get_balanced_data(df, dataset):
    min_ = min(df.hsd.value_counts())
    df = df.dropna(subset=["hsd"])
    df = df.groupby("hsd").sample(n=min_, random_state=1)
    df=df[['text', 'hsd']]
    df.rename(columns={'hsd':'label'}, inplace=True)
    df= shuffle(df)
    df.to_csv(os.path.join(save_dir, f"{dataset}.csv"), index=False)
    print(df.label.value_counts())
    return df

In [22]:
train_ = get_balanced_data(train, "train")
dev_ = get_balanced_data(dev, "val")
test_ = get_balanced_data(test, "test")

0.0    319
1.0    319
Name: label, dtype: int64
0.0    35
1.0    35
Name: label, dtype: int64
0.0    62
1.0    62
Name: label, dtype: int64


In [32]:
df= pd.concat([train_, dev_, test_])

In [33]:
df.label.value_counts()

0.0    1248
1.0    1248
Name: label, dtype: int64

In [34]:
df = df.drop_duplicates(subset=["text"])

In [35]:
df.label.value_counts()

0.0    1043
1.0     416
Name: label, dtype: int64

In [None]:
# 80, 10,10

# get dataset for sentiment analysis

In [23]:
data_dir

'datasets/sentiment_analysis/sv/ABSAbank/split10_consecutive_average'

In [24]:
train_df.columns

Index(['doc', 'par', 'text', 'label'], dtype='object')

In [25]:
def get_sentiment(df, dataset):
    df["label"] = df["label"].astype(float)
    df.loc[df['label']==3, "sentiment"]= 1 # neutral
    df.loc[df['label']>=4, "sentiment"]= 2 # positive
    df.loc[df['label']<=2, "sentiment"]= 0 # negative
    print(df.sentiment.value_counts())
    print(df.head(2))

    values = df.sentiment.value_counts()
    print(values)
    min_value = min(values)
    print(min_value)
    df.dropna(subset=["sentiment"], inplace=True)
    df= df[['text', 'sentiment']]
    df.rename(columns={'sentiment':'label'}, inplace=True)
    df = df.groupby("label").sample(min_value)
    print(df.label.value_counts())
    df.to_csv(f"datasets/sentiment_analysis/sv/preprocessed/{dataset}.csv", index=False)
    return df

In [26]:
train_df_ = get_sentiment(train_df, "train")

1.0    1793
0.0    1179
2.0     835
Name: sentiment, dtype: int64
                         doc par  \
0  z02679_flashback-50294563   3   
1  z02679_flashback-50294563   5   

                                                text  label  sentiment  
0  Förklarar regeringen och tidigare regeringars ...    3.0        1.0  
1  Kom gärna med förslag på olika samband mellan ...    3.0        1.0  
1.0    1793
0.0    1179
2.0     835
Name: sentiment, dtype: int64
835
0.0    835
1.0    835
2.0    835
Name: label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [27]:
test_df_ = get_sentiment(test_df, "test")

1.0    230
0.0    202
2.0     50
Name: sentiment, dtype: int64
                         doc par  \
0  z01627_flashback-60850068   2   
1  z01627_flashback-60850068   3   

                                                text  label  sentiment  
0  All invandring är inte dålig men få kan förnek...    2.0        0.0  
1  Ser man tillbaka i tiden så var det bättre på ...    2.0        0.0  
1.0    230
0.0    202
2.0     50
Name: sentiment, dtype: int64
50
0.0    50
1.0    50
2.0    50
Name: label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [28]:
dev_df_ = get_sentiment(dev_df, "val")

1.0    209
0.0    198
2.0     68
Name: sentiment, dtype: int64
                                            doc par  \
0  www_aftonbladet_se_debatt_article22871619_ab   5   
1  www_aftonbladet_se_debatt_article22871619_ab   6   

                                                text     label  sentiment  
0  Att vara intresserad av hemländernas politik ä...  3.000000        1.0  
1  Tyvärr verkar det som om några partier har bli...  2.666667        NaN  
1.0    209
0.0    198
2.0     68
Name: sentiment, dtype: int64
68
0.0    68
1.0    68
2.0    68
Name: label, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [61]:
sv_dir = "/Users/yiyichen/PycharmProjects/MRL/datasets/sentiment_analysis/sv"

In [62]:
train_len, val_len, test_len = 1839, 324, 870

In [69]:
from sklearn.utils import shuffle
def get_balanced_df(df, LEN):
    df= df.groupby("label").sample(LEN)
    print(df.label.value_counts())
    df= shuffle(df)
    return df

In [70]:
train_b = get_balanced_df(train_df, int(train_len/3))
test_b = get_balanced_df(test_df, int(test_len/3))
dev_b = get_balanced_df(dev_df, int(val_len/3))

0.0    613
1.0    613
2.0    613
Name: label, dtype: int64
0.0    290
1.0    290
2.0    290
Name: label, dtype: int64
0.0    108
1.0    108
2.0    108
Name: label, dtype: int64


In [72]:
train_b.to_csv(os.path.join(sv_dir, "train.csv"))
test_b.to_csv(os.path.join(sv_dir, "test.csv"))
dev_b.to_csv(os.path.join(sv_dir, "val.csv"))