In [3]:
os.chdir("/Users/yiyichen/PycharmProjects/MRL/")

In [4]:
import os
import argparse

import polars as pl
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import pandas as pd
from collections import defaultdict

from preprocessor.preprocessing import *

In [5]:
lexica = pd.read_csv("datasets/hate_speech_detection/finnish/hurtlex_FI.tsv", sep="\t")
lemmas = lexica.lemma.tolist()

In [6]:
def split_data(df):
    train, test_val = train_test_split(df, test_size=0.2, random_state=1, shuffle=True)
    val, test = train_test_split(test_val, test_size=0.5,random_state=1, shuffle=True)
    return train, val, test

# Finnish

In [7]:
data_dir = "datasets/sentiment_analysis/fi/"

In [8]:
header= ["A", "B", "C", "majority", "derived", "smiley", "product", "split", "batch", "filename_id", "text"]

In [27]:
def read_tsv(filepath):
    text_dict = defaultdict(list)
    with open(filepath) as f:
        for line in f.readlines():
            if len(line.split("\t"))==len(header):
                A, B, C, majority, derived, smiley, product, split, batch, filename_id, text = line.replace("\n", "").split("\t")
                text_dict["A"].append(A)
                text_dict["B"].append(B)
                text_dict["C"].append(C)
                text_dict["majority"].append(majority)
                text_dict["derived"].append(derived)
                text_dict["smiley"].append(smiley)
                text_dict["product"].append(product)
                text_dict["split"].append(split)
                text_dict["batch"].append(batch)
                text_dict["filename_id"].append(filename_id)
                text_dict["text"].append(text)
    return text_dict

In [28]:
text_dict = read_tsv(os.path.join(data_dir, "finsen-src", "FinnSentiment2020.tsv"))

In [29]:
df = pd.DataFrame.from_dict(text_dict)

In [30]:
df.majority.value_counts()

0     19825
-1     4109
1      3066
Name: majority, dtype: int64

In [31]:
df.derived.value_counts()

3    14195
2     6422
4     3460
5     1536
1     1387
Name: derived, dtype: int64

# get hate speech

In [34]:
def get_hs(df_):   
    df_["majority"]=df_["majority"].astype(int)
    df_["derived"]=df_["derived"].astype(int)
    df_.loc[df['majority']==-1, "sentiment"]=0 # negative
    df_.loc[df['majority']==0, "sentiment"]=1 # neutral
    df_.loc[df['majority']==1, "sentiment"]=2 # positive
    
    # derived. 1-5 (-3, -2/-1, 0, 1/2, 3)
    df_.loc[(df["sentiment"]==0) & (df_["text"].str.contains("|".join(lemmas))), "label"]=1
    df_.loc[(df["derived"]==3) | (df_["derived"]==5), "label"]=0
    print(df_.head())
    
    df_['text']= df_['text'].apply(preprocessing_one_tweet)
    df_ = df_.drop_duplicates(subset=["text"])

    df_["split"] = df_["text"].str.split()
    df_["LEN"] = [len(x) for x in df_['split']]
    df_ = df_[df_['LEN']>2]
    df_.dropna(subset=["text"], inplace=True)
    df_= df_[['text', 'label']]
    values = df_.label.value_counts()
    print(values)
    min_value = min(values)
    print(min_value) 
    df_ = df_.groupby("label").sample(min_value)
    print(df_.label.value_counts())
    df_ = shuffle(df_)
    return df_

In [35]:
df_hs = get_hs(df)

   A  B  C  majority  derived smiley product  \
0  1  0  1         1        4      0      -1   
1  0  1  0         0        4      0      -1   
2  0  0  0         0        3      0      -1   
3  1  1  1         1        5      0       1   
4  1  1  1         1        5      0       1   

                                               split batch  \
0                    [-, Tervetuloa, skotlantiin...]     1   
1  [......, No,, oikein, sopiva, sattumaha, se, v...     1   
2                                              [40.]     1   
3       [Kyseessä, voi, olla, loppuelämäsi, nainen.]     1   
4           [Sinne, vaan, ocean, clubiin, iskemään!]     1   

                 filename_id  \
0  comments2008c.vrt 2145269   
1  comments2011c.vrt 3247745   
2  comments2007c.vrt 3792960   
3  comments2010d.vrt 2351708   
4  comments2007d.vrt 1701675   

                                                text  sentiment  label  LEN  
0                        - Tervetuloa skotlantiin...        2.0    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_["split"] = df_["text"].str.split()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_["LEN"] = [len(x) for x in df_['split']]


In [36]:
train, val, test = split_data(df_hs)

In [37]:
train.label.value_counts()

0.0    1813
1.0    1809
Name: label, dtype: int64

In [38]:
save_dir = "datasets/hate_speech_detection/finnish/"

In [39]:
train.to_csv(os.path.join(save_dir, "preprocessed", "train.csv"))
val.to_csv(os.path.join(save_dir, "preprocessed", "val.csv"))
test.to_csv(os.path.join(save_dir, "preprocessed", "test.csv"))

# sentiment analysis

In [40]:
def get_sentiment_df(df_):   
    df_["majority"]=df_["majority"].astype(int)
    df_.loc[df['majority']==-1, "label"]=0
    df_.loc[df['majority']==0, "label"]=1
    df_.loc[df['majority']==1, "label"]=2
    print(df_.head())
    df_['text']= df_['text'].apply(preprocessing_one_tweet)
    df_ = df_.drop_duplicates(subset=["text"])
    
    df_["split"] = df_["text"].str.split()
    df_["LEN"] = [len(x) for x in df_['split']]
    df_ = df_[df_['LEN']>2]
    df_.dropna(subset=["text"], inplace=True)
    df_= df_[['text', 'label']]
    values = df_.label.value_counts()
    print(values)
    min_value = min(values)
    print(min_value) 
    
    df_ = df_.groupby("label").sample(min_value)
    print(df_.label.value_counts())
    df_ = shuffle(df_)
    return df_


In [41]:
df = get_sentiment_df(df)

   A  B  C  majority  derived smiley product  \
0  1  0  1         1        4      0      -1   
1  0  1  0         0        4      0      -1   
2  0  0  0         0        3      0      -1   
3  1  1  1         1        5      0       1   
4  1  1  1         1        5      0       1   

                                               split batch  \
0                    [-, Tervetuloa, skotlantiin...]     1   
1  [......, No,, oikein, sopiva, sattumaha, se, v...     1   
2                                              [40.]     1   
3       [Kyseessä, voi, olla, loppuelämäsi, nainen.]     1   
4           [Sinne, vaan, ocean, clubiin, iskemään!]     1   

                 filename_id  \
0  comments2008c.vrt 2145269   
1  comments2011c.vrt 3247745   
2  comments2007c.vrt 3792960   
3  comments2010d.vrt 2351708   
4  comments2007d.vrt 1701675   

                                                text  sentiment  label  LEN  
0                        - Tervetuloa skotlantiin...        2.0    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_["split"] = df_["text"].str.split()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_["LEN"] = [len(x) for x in df_['split']]


In [42]:
train, val, test = split_data(df)

In [43]:
save_dir = os.path.join(data_dir, "preprocessed")

In [44]:
train.to_csv(os.path.join(save_dir, "train.csv"))
test.to_csv(os.path.join(save_dir, "test.csv"))
val.to_csv(os.path.join(save_dir, "val.csv"))
