# Imports

In [1]:
import os
import re
from glob import glob

import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm

In [2]:
# Names of the raw-CSV columns that are kept when a dataset is loaded.
cols = ["Text", "Target", "Polarity"]

In [3]:
# Directory that receives every CSV written by this notebook.
save_path = "./src/data/UStanceBR/"

# Create the directory (and any missing parents); a pre-existing directory is fine.
os.makedirs(name=save_path, exist_ok=True)

# Formatting original datasets

In [4]:
# Stance string found in the "Polarity" column -> integer class id.
pol2label = {
    "against": 0,
    "for": 1
}

# File-name prefix of a raw CSV -> topic string written to the "topic" column.
prefix2topic = {
    "r2_bo": "bolsonaro",
    "r2_lu": "lula",
    "r2_cl": "cloroquina",
    "r2_co": "coronavac",
    "r2_gl": "globo",
    "r2_ig": "igreja",
}


def tokenize_sentence(text):
    """Lower-case `text`, strip every character that is neither a word
    character nor whitespace, and return the whitespace-separated tokens."""
    text = re.sub(r'[^\w\s]', '', text.lower())
    return text.split()


for path in tqdm(glob("../../data/UStanceBR/v2/*.csv")):
    name = os.path.basename(path)

    # Resolve the topic from the file-name prefix. An unrecognised file used to
    # surface later as an opaque KeyError on "topic_str"; fail early and clearly.
    topic_str = next(
        (topic_ for prefix_, topic_ in prefix2topic.items() if name.startswith(prefix_)),
        None,
    )
    if topic_str is None:
        raise ValueError(f"Cannot infer the topic from file name {name!r}")

    # .copy() gives an independent frame, so the column assignments below do
    # not write into a slice of the frame returned by read_csv.
    df = pd.read_csv(path, sep=";")[cols].copy()

    # The topic is stored as the text of a one-element list, e.g. ["lula"].
    df["topic"] = f'["{topic_str}"]'

    # Tokens are stored as the text of a nested list: [["tok1", "tok2", ...]].
    df["text_token"] = df["Text"].apply(
        lambda x: '[["' + '", "'.join(tokenize_sentence(x)) + '"]]'
    )
    # Dict lookup on purpose: an unexpected polarity value raises KeyError
    # instead of silently becoming NaN.
    df["label"] = df["Polarity"].apply(lambda x: pol2label[x])
    df["seen?"] = 1

    # Keep only the output columns and give them their final names.
    df = df[["Text", "topic", "label", "text_token", "seen?"]]
    df.columns = ['tweet', 'topic', 'label', 'text', "seen?"]

    if "train" in name:
        # Carve a stratified 20% validation split out of every training file.
        df_train, df_valid = train_test_split(
            df,
            test_size = 0.2,
            stratify = df["label"],
            random_state = 42,
            shuffle = True
        )

        df_train.to_csv(os.path.join(save_path, name), index=False)
        df_valid.to_csv(os.path.join(save_path, name.replace('train', 'valid')), index=False)
    else:
        # Files without "train" in their name are written out unsplit.
        df.to_csv(os.path.join(save_path, name), index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  """


  0%|          | 0/12 [00:00<?, ?it/s]

## Train

In [5]:
# Per-topic training splits written by the formatting cell, keyed by topic code.
train_data_dict = {
    topic: pd.read_csv(f"{save_path}/r2_{topic}_train_statements.csv")
    for topic in ["bo", "lu", "co", "cl", "gl", "ig"]
}

# For each topic, write a training file made of the other topics' rows
# followed by this topic's rows with label 2 and "seen?" 0.
for topic in tqdm(train_data_dict.keys()):
    other_data = pd.concat([data_ for topic_, data_ in train_data_dict.items() if topic_ != topic])

    # .assign returns a new frame. Assigning the columns in place would modify
    # the frame stored in train_data_dict, so every later iteration would see
    # this topic's rows already relabelled inside its `other_data`.
    topic_data = train_data_dict[topic].assign(**{"label": 2, "seen?": 0})

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    train_data = pd.concat([other_data, topic_data])
    train_data.to_csv(f"{save_path}/final_{topic}_train.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/6 [00:00<?, ?it/s]

## Validation

In [6]:
# Per-topic validation splits written by the formatting cell, keyed by topic code.
valid_data_dict = {
    topic: pd.read_csv(f"{save_path}/r2_{topic}_valid_statements.csv")
    for topic in ["bo", "lu", "co", "cl", "gl", "ig"]
}

# For each topic, the validation file is the concatenation of every OTHER
# topic's validation split. The loop reads valid_data_dict: iterating the
# training dictionary here would write training rows into the validation files.
for topic in tqdm(valid_data_dict.keys()):
    other_data = pd.concat([data_ for topic_, data_ in valid_data_dict.items() if topic_ != topic])
    other_data.to_csv(f"{save_path}/final_{topic}_valid.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/6 [00:00<?, ?it/s]

## Test

In [7]:
# Per-topic frames used as test sets, keyed by topic code.
# NOTE(review): these paths point at the *_valid_statements.csv files, which
# looks like a copy-paste from the validation cell. If per-topic
# *_test_statements.csv files exist in save_path, they are presumably what
# should be loaded here - confirm before relying on the output.
test_data_dict = {
    topic: pd.read_csv(f"{save_path}/r2_{topic}_valid_statements.csv")
    for topic in ["bo", "lu", "co", "cl", "gl", "ig"]
}

# Write each topic's own frame as its test file. The loop reads test_data_dict:
# using the training dictionary here would export training rows as test data.
for topic in tqdm(test_data_dict.keys()):
    topic_data = test_data_dict[topic]
    topic_data.to_csv(f"{save_path}/final_{topic}_test.csv", index=False)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  # Remove the CWD from sys.path while we load stuff.


  0%|          | 0/6 [00:00<?, ?it/s]

In [6]:
# valid_data_dict = {
#     "bo": pd.read_csv(f"{save_path}/r2_bo_valid_statements.csv"),
#     "lu": pd.read_csv(f"{save_path}/r2_lu_valid_statements.csv"),
#     "co": pd.read_csv(f"{save_path}/r2_co_valid_statements.csv"),
#     "cl": pd.read_csv(f"{save_path}/r2_cl_valid_statements.csv"),
#     "gl": pd.read_csv(f"{save_path}/r2_gl_valid_statements.csv"),
#     "ig": pd.read_csv(f"{save_path}/r2_ig_valid_statements.csv"),
# }

# for file in tqdm_notebook(glob(f"{save_path}/*train*.csv")):
#     topic = file.split("\\")[-1].split("_")[1]

#     train_data = pd.read_csv(file)
#     valid_data = pd.concat([data_ for topic_, data_ in valid_data_dict.items() if topic_ != topic])
# #     valid_data = pd.read_csv(file.replace("train", "valid").replace(topic, topic_pair_dict[topic]))
    
#     valid_data["label"] = 2
#     valid_data["seen?"] = 0
    
#     train_data = train_data.append(valid_data)
#     train_data.to_csv(file, index=False)

In [7]:
# Rows per topic in `train_data`, i.e. the frame left over from the last
# iteration of the training loop.
train_data["topic"].value_counts()

["bolsonaro"]     5648
["lula"]          4992
["cloroquina"]    4796
["coronavac"]     4783
["igreja"]        4281
["globo"]         3607
Name: topic, dtype: int64

In [9]:
# Rows per topic in the twitter_testCC_seenval training file.
pd.read_csv(
    "./src/data/twitter_data_naacl/twitter_testCC_seenval/train.csv"
)["topic"].value_counts()

["climate", "change", "is", "a", "real", "concern"]    1899
["hillary", "clinton"]                                  837
["feminist", "movement"]                                807
["legalization", "of", "abortion"]                      794
["atheism"]                                             624
["donald", "trump"]                                     601
Name: topic, dtype: int64