In [16]:
# Turn off Weights & Biases logging through the wandb command-line tool.
!wandb disabled

W&B disabled.


In [17]:
import os
# os.environ["WANDB_SILENT"] = "true"

# Pin CUDA device numbering to PCI bus order and expose only device 0 to this kernel,
# then echo the visible-device setting for confirmation.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})
print(os.environ["CUDA_VISIBLE_DEVICES"])

# os.environ["TOKENIZERS_PARALLELISM"] = "false" 

0


In [19]:
# -*- coding: utf-8 -*-
"""
1. Read annotated multilingual ILI data using CustomDataset.
2. Convert encoded features and labels to dataset objects for integration with transformers model training.

""" 

import sys
import json
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

import torch
from datasets import Dataset

class CustomDataset(object):
    """Annotated tweet data loaded from a CSV file.

    The CSV is read with pandas and must contain the columns ``tweet`` and
    ``final_annotation``. ``getsplitidx`` additionally needs the columns named
    by its ``group`` and ``stratify_label`` arguments.

    Attributes:
        data: the full DataFrame read from ``file_name``.
        tweets: the ``tweet`` column of ``data``.
        labels: the ``final_annotation`` column of ``data``.
    """

    def __init__(self, file_name):
        """Read ``file_name`` as CSV and keep the tweet and label columns."""
        self._file_name = file_name
        self.data = pd.read_csv(self._file_name)
        self.tweets = self.data['tweet']
        self.labels = self.data['final_annotation']

    def __len__(self):
        """Return the number of labelled tweets.

        Raises:
            ValueError: if the tweet and label columns differ in length.
        """
        if len(self.tweets) != len(self.labels):
            # A regular exception is raised here; terminating the interpreter
            # from inside len() would be far too drastic for a data check.
            raise ValueError(f"Number of tweets({len(self.tweets)}) and its labels({len(self.labels)}) do not match.")
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the ``(tweet, label)`` pair at position ``idx``."""
        return self.tweets.iloc[idx], self.labels.iloc[idx]

    def getsplitidx(self, test_split=0.2, valid_split=None, group='lang', stratify_label='final_annotation', savepath=None, seed=42):
        """Compute stratified train/(valid)/test index splits per group.

        The data is grouped by ``group``; every group is split on its own with
        ``train_test_split``, stratified on ``stratify_label``.

        Args:
            test_split: fraction of each group assigned to the test set.
            valid_split: fraction of each group assigned to the validation
                set, or ``None`` for a two-way split.
            group: column to group by before splitting.
            stratify_label: column whose class distribution is preserved.
            savepath: optional directory; when given it is created if needed
                and the result is written there as ``split_idx.json``.
            seed: random state passed to every ``train_test_split`` call.

        Returns:
            dict mapping each group key to a dict with ``'train_idx'``,
            ``'test_idx'`` and, when ``valid_split`` is given, ``'valid_idx'``.
            The values are lists of index labels of ``self.data``. The mapping
            is returned whether or not it was also saved.
        """
        with_valid = valid_split is not None
        lang_split_idx = {}
        for grp, grp_df in self.data.groupby(group):
            print(f"\nProcessing data split for language: {grp}")
            train, test = train_test_split(grp_df, test_size=test_split, stratify=grp_df[stratify_label], random_state=seed)
            if with_valid:
                # valid_split is relative to the whole group, but the second
                # split only sees what is left after removing the test set,
                # so rescale the fraction accordingly.
                valid_size = valid_split/(1-test_split)
                train, valid = train_test_split(train, test_size=valid_size, stratify=train[stratify_label], random_state=seed)

            print(f"Distribution of classes in train set\n{train[stratify_label].value_counts()}")
            print(f"Distribution of classes in test set\n{test[stratify_label].value_counts()}")
            grp_idx = {'train_idx': train.index.values.tolist(),
                       'test_idx': test.index.values.tolist()}
            if with_valid:
                print(f"Distribution of classes in valid set\n{valid[stratify_label].value_counts()}\n")
                grp_idx['valid_idx'] = valid.index.values.tolist()
            lang_split_idx[grp] = grp_idx

        if savepath is not None:
            savepath = Path(savepath)
            savepath.mkdir(parents=True, exist_ok=True)
            with open(savepath.joinpath("split_idx.json"), "w") as f:
                json.dump(lang_split_idx, f)

        # Always hand the mapping back so callers that collect the result do
        # not end up with None when the splits were also written to disk.
        return lang_split_idx

In [20]:
def getsplit(lang_split_idx, key='train_idx'):
    """Concatenate the index lists stored under ``key`` for every language.

    ``lang_split_idx`` maps a language to a dict of index lists; the lists
    found under ``key`` are joined in the mapping's iteration order.
    """
    merged = []
    for per_lang in lang_split_idx.values():
        merged.extend(per_lang[key])
    return merged

## 1. Read data, split info and save split indices

In [21]:
# -*- coding: utf-8 -*-
"""
Paths used to split the data into train, (validation) and test sets.
NOTE(review): this text previously said an update is required in
final_configs.json, but the active PARAMS_FILE below points to params.json;
confirm which file is current.
"""

import time
from pathlib import Path 
import argparse

# original two category annotation
# DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata.csv")
# PARAMS_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/final_configs.json")
# OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/evals/evalnew")

# revised two category annotation
# DATA_FILE: CSV with the annotated tweets; PARAMS_FILE: JSON run parameters;
# OUT_PATH: directory under which the split index files are written.
DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata_revisedcateg.csv")
PARAMS_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/params.json")
OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/final_evals/eval_revisedcateg")

In [22]:
# read data
# Load the annotated tweets, then report the corpus size and label balance.
tweets = CustomDataset(DATA_FILE)
print(
    f"Number of tweets in data: {len(tweets)}",
    f"Distribution of classes in all data\n{tweets.labels.value_counts()}",
    sep="\n",
)

Number of tweets in data: 4284
Distribution of classes in all data
final_annotation
3. Not Related to ILI or COVID-19 Infection    2587
1. Likely ILI infection                        1697
Name: count, dtype: int64


In [23]:
# # original four category collapsed to two
# # ------------------------------------------
# # likely COVID to likely ILI &
# # ambiguous to not related
# # ------------------------------------------

# # df = tweets.data
# # print(df.shape)
# # df['final_annotation'] = np.where(df['final_annotation']=='4. Ambiguous/Unsure', '3. Not Related to ILI or COVID-19 Infection',
# #                           np.where(df['final_annotation']=='2. Likely COVID-19 Infection (after 2020 only)','1. Likely ILI infection',
# #                                   df['final_annotation'])
# #                           )
# # df['final_annotation'].value_counts()
# # df.to_csv(DATA_FILE.parent.joinpath("alldata_twocateg_nonrevised.csv"))

# DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata_twocateg.csv")
# OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/evals/originalcateg/evalnew_twocateg")
# tweets = CustomDataset(DATA_FILE)
# print(f"Number of tweets in data: {tweets.__len__()}")
# print(f"Distribution of classes in all data\n{tweets.labels.value_counts()}")

In [24]:
# determine number of splits required, split data and save
# Load the run parameters; params['SPLITS'] lists the split fractions to generate.
with open(PARAMS_FILE, "r") as f:
    params = json.load(f)

all_splits = []
for split in params['SPLITS']:
    print(f"\nsplit: {split}")
    # One output directory per split, named after its fractions
    # (e.g. testset0.6_0.2_0.2).
    dirname = f"testset{'_'.join([str(i) for i in split])}"
    split_path = OUT_PATH.joinpath(dirname)
    print(split_path)
    # The last entry is the test fraction. The second-to-last entry is a
    # validation fraction only when the split has three parts; for a two-part
    # split it would presumably be the train fraction, so pass None instead.
    valid_split = split[-2] if len(split) > 2 else None
    all_splits.append(tweets.getsplitidx(test_split=split[-1], valid_split=valid_split, savepath=split_path))
    # all_splits.append(tweets.getsplitidx(test_split=split[-1], valid_split=split[-2], savepath=None))


split: [0.6, 0.2, 0.2]
/gaueko0/users/nmishra/multiling_fludetection/final_evals/eval_revisedcateg/testset0.6_0.2_0.2

Processing data split for language: de
Distribution of classes in train set
final_annotation
3. Not Related to ILI or COVID-19 Infection    294
1. Likely ILI infection                        285
Name: count, dtype: int64
Distribution of classes in test set
final_annotation
3. Not Related to ILI or COVID-19 Infection    99
1. Likely ILI infection                        95
Name: count, dtype: int64
Distribution of classes in valid set
final_annotation
3. Not Related to ILI or COVID-19 Infection    99
1. Likely ILI infection                        95
Name: count, dtype: int64


Processing data split for language: en
Distribution of classes in train set
final_annotation
3. Not Related to ILI or COVID-19 Infection    64
1. Likely ILI infection                        53
Name: count, dtype: int64
Distribution of classes in test set
final_annotation
3. Not Related to ILI or C

## 2. Read saved split indices and verify

In [25]:
# -*- coding: utf-8 -*-
"""
Paths used to read back the saved split indices and verify them.
NOTE(review): this text previously said an update is required in
final_configs.json, but the active PARAMS_FILE below points to params.json;
confirm which file is current.
"""

import time
from pathlib import Path 
import argparse

# original four categ
# -------------------
# DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata.csv")
# PARAMS_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/final_configs.json")
# OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/evalnew")

# DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata_twocateg.csv")
# OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/evals/originalcateg/evalnew_twocateg")

# revised two categ
# -------------------
# NOTE(review): OUT_PATH here is .../evals/eval_revisedcateg, whereas the
# splitting section earlier in this notebook writes to
# .../final_evals/eval_revisedcateg. Confirm that the verification below reads
# the split files it is meant to check.
DATA_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/data/all/alldata_revisedcateg.csv")
PARAMS_FILE = Path("/gaueko0/users/nmishra/multiling_fludetection/params.json")
OUT_PATH = Path("/gaueko0/users/nmishra/multiling_fludetection/evals/eval_revisedcateg")

In [26]:
# read data
# Reload the dataset and report its shape, size and label balance.
tweets = CustomDataset(DATA_FILE)
print(
    tweets.data.shape,
    f"Number of tweets in data: {len(tweets)}",
    f"Distribution of classes in all data\n{tweets.labels.value_counts()}",
    sep="\n",
)

# Load the run parameters (split fractions and language selection).
with PARAMS_FILE.open("r") as f:
    params = json.load(f)

(4284, 6)
Number of tweets in data: 4284
Distribution of classes in all data
final_annotation
3. Not Related to ILI or COVID-19 Infection    2587
1. Likely ILI infection                        1697
Name: count, dtype: int64


In [27]:
# Read the saved indices for the first configured split and rebuild the
# train/valid/test frames from them.
for split in params['SPLITS'][:1]:
    print(split)
    # data split index info
    dirname = f"testset{'_'.join([str(i) for i in split])}"
    split_path = OUT_PATH.joinpath(dirname)
    print(f"Reading data split index from: {split_path}")
    with open(split_path.joinpath('split_idx.json'), 'r') as f:
        split_idx = json.load(f)

    # determine languages for which to get split index
    if params['LANG'] == 'all':
        languages = list(split_idx)
    else:
        # Parse the comma-separated list once and tolerate spaces after commas.
        wanted = {code.strip() for code in params['LANG'].split(',')}
        languages = [lang for lang in split_idx if lang in wanted]

    # train on all languages and then on each language
    lang_split_idx = {i: split_idx[i] for i in languages}
    print(f"Training data used for {params['LANG']} languages")

    # get train, valid and test split for selected languages
    train_idx = getsplit(lang_split_idx, key='train_idx')
    valid_idx = getsplit(lang_split_idx, key='valid_idx')
    test_idx = getsplit(lang_split_idx, key='test_idx')
    print(f"Distribution of data in train, validation and test splits: {len(train_idx)}, {len(valid_idx)}, {len(test_idx)}")

    # inspect data
    # The saved values are DataFrame index labels (taken from `.index` when
    # the split was created), so select by label rather than by position.
    train_df = tweets.data.loc[train_idx]
    valid_df = tweets.data.loc[valid_idx]
    test_df = tweets.data.loc[test_idx]

[0.6, 0.2, 0.2]
Reading data split index from: /gaueko0/users/nmishra/multiling_fludetection/evals/eval_revisedcateg/testset0.6_0.2_0.2
Training data used for all languages
Distribution of data in train, validation and test splits: 2567, 857, 860


In [28]:
# Number of repeated tweet ids inside each split, in the order train, valid, test.
print(*(part['id'].duplicated().sum() for part in (train_df, valid_df, test_df)))

0 0 0


In [29]:
# Number of rows per language in the training split.
train_df['lang'].value_counts()

lang
es    698
it    597
de    579
fr    576
en    117
Name: count, dtype: int64

In [30]:
# Number of rows per language in the test split.
test_df['lang'].value_counts()

lang
es    233
it    200
de    194
fr    193
en     40
Name: count, dtype: int64

In [31]:
# Number of rows per language in the validation split.
valid_df['lang'].value_counts()

lang
es    233
it    199
de    194
fr    192
en     39
Name: count, dtype: int64