In [1]:
import sys
sys.path.append("../../")

from baseline.MY_PATHS import *
import baseline.data_creation.wiki_parser as wiki_parser

## Preprocess raw JSON and save to pkl
- For each language, we read the raw dumped data from `json_path`, process it using `wiki_parser`, and save the resulting DataFrame as a pickle file at `df_path`.
- Processing takes around 1-2 hours per language, so it is better to run the languages in parallel.


In [12]:
# For every language: where the raw JSON dump lives and where the processed df is saved.
# File names are prefixed with the first two letters of the language name.

language_to_info = {}

for lang in ("english", "russian", "hindi"):
    language_to_info[lang] = {
        # read from "json_path"
        "json_path": PATH_TO_DATA_FOLDER + f"{lang[:2]}_20191201_text_and_topics_common.json",
        # save processed df to "df_path"
        "df_path": PATH_TO_DATA_FOLDER + f"{lang[:2]}_df_full.pkl",
    }

In [27]:
# Function for parsing and saving parsed df to pkl file.

def process_lang(lang):
    """Parse the raw dump of one language and pickle the result.

    Prints `lang`, builds `wiki_parser.Parser(lang)`, calls its
    `get_wiki_tokenized_dataset` on `language_to_info[lang]["json_path"]`
    with title, token, category, section and outlink extraction enabled,
    and writes the returned object to `language_to_info[lang]["df_path"]`
    via `to_pickle`. Returns nothing.
    """
    print(lang)

    extraction_options = {
        "extract_title": True,
        "extract_tokens": True,
        "extract_categories": True,
        "extract_section": True,
        "extract_outlinks": True,
        # "debug": True,  # left disabled, as in the original call
    }

    lang_parser = wiki_parser.Parser(lang)
    lang_paths = language_to_info[lang]
    parsed_df = lang_parser.get_wiki_tokenized_dataset(
        lang_paths["json_path"], **extraction_options
    )
    parsed_df.to_pickle(lang_paths["df_path"])

In [None]:
# # Run in parallel
# NOTE(review): presumably left commented out so that the 1-2 hours/language
# parse is not re-run by accident — uncomment to regenerate the `df_path` pickles.

# from joblib import Parallel, delayed

# Parallel(n_jobs=3, verbose=1)(
#     delayed(process_lang)(lang) for  lang in language_to_info.keys()
# )

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.


In [3]:
import pandas as pd

# Reload the processed English DataFrame from disk. The file name matches the
# "df_path" built for "english" in `language_to_info` ("en" + "_df_full.pkl").
# NOTE: read_pickle can execute arbitrary code while unpickling — only load files you produced yourself.
wiki_df = pd.read_pickle(PATH_TO_DATA_FOLDER + "en_df_full.pkl")

In [5]:
wiki_df.head()

Unnamed: 0,QID,title,tokens,mid_level_categories,sections_tokens,raw_outlinks,outlinks
0,Q6199,Anarchism,"[anarchism, anti, authoritarianism, anti, auth...","[History and Society.Politics and government, ...","[etymology, terminology, definition, history, ...","[[[Anti-authoritarianism|anti-authoritarian]],...","[Anti-authoritarianism, Political philosophy, ..."
1,Q38404,Autism,"[autism, developmental, disorder, characterize...","[STEM.STEM*, STEM.Biology, STEM.Medicine & Hea...","[characteristics, social, development, communi...","[[[Psychiatry]], [[Interpersonal relationship|...","[Psychiatry, Interpersonal relationship, commu..."
2,Q101038,Albedo,"[sunlight, relative, various, surface, conditi...","[STEM.STEM*, History and Society.Society, STEM...","[terrestrial, albedo, white, sky, black, sky, ...",[[[File:Albedo-e hg.svg|thumb|upright=1.3|The ...,"[File:Albedo-e hg.svg, diffuse reflection, sun..."
3,Q173,Alabama,"[alabama, state, state, southern, united, stat...","[Geography.Regions.Americas.North America, Geo...","[etymology, history, pre, european, settlement...","[[[Coat of arms of Alabama|Coat of arms]], [[N...","[Coat of arms of Alabama, Northern flicker, Di..."
4,Q41746,Achilles,"[dating, three, zero, zero, bc, achilles, troj...","[History and Society.History, Geography.Region...","[etymology, birth, early, years, names, hidden...",[[[File:Achilles fighting against Memnon Leide...,[File:Achilles fighting against Memnon Leiden ...


## Load list of labels

In [13]:
from baseline.utils import get_classes_list

In [14]:
# Load the label list from classes.txt; it is reused further down as the
# fixed class order handed to MultiLabelBinarizer.
classes = get_classes_list(PATH_TO_DATA_FOLDER + "classes.txt")
# Display how many labels were loaded.
len(classes)

64

## Aligned preprocessing of the en, ru, hi DataFrames

In [8]:
import pickle as pkl
from collections import defaultdict

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from baseline.data_creation.preprocess import (create_lookups_for_vocab, create_vocab_from_tokens,
                          remove_non_common_articles_and_sort_by_QID,
                          remove_rows_with_empty_column)

def get_dict_of_split_sizes_and_QIDs(QIDs, LANGUAGES_LIST, train_size=0.8, val_size=0.1, random_state=None):
    """
    Compute the size of every split and randomly assign QIDs to the splits.

    Splits: "full", "monolingual_train", "multilingual_train", "val", "test".

    - "full" only records len(QIDs).
    - "monolingual_train" receives int(train_size * len(QIDs)) QIDs.
    - "multilingual_train" is a random subset of "monolingual_train" that is
      len(LANGUAGES_LIST) times smaller (integer division).
    - "val" receives int(val_size * len(QIDs)) QIDs.
    - "test" receives everything that is left over.

    Args:
        QIDs: collection of article QIDs to split (passed as-is to
            sklearn's train_test_split).
        LANGUAGES_LIST: list of languages; only its length is used.
        train_size: fraction of QIDs assigned to "monolingual_train".
        val_size: fraction of QIDs assigned to "val".
        random_state: seed forwarded to every train_test_split call. When
            None (default) the module-level SEED is used, which is what the
            function did before this parameter existed.

    Returns:
        dict mapping split name -> {"size": int, "QIDs": ...}; the "full"
        entry has a "size" key only.

    Side effects:
        Prints the computed split sizes, one split per line.
    """
    n_full = len(QIDs)
    n_train = int(train_size * n_full)
    n_val = int(val_size * n_full)

    # Insertion order matters: it is the order in which the sizes are printed.
    SPLIT_DICT = {
        "full": {"size": n_full},
        "monolingual_train": {"size": n_train},
        "multilingual_train": {"size": n_train // len(LANGUAGES_LIST)},
        "val": {"size": n_val},
        # Whatever is not train or val ends up in test, so rounding never loses articles.
        "test": {"size": n_full - n_train - n_val},
    }
    print(*SPLIT_DICT.items(), sep="\n")

    # Fall back to the notebook-level SEED so existing two-argument calls are unchanged.
    seed = SEED if random_state is None else random_state

    # 1) carve the monolingual train set out of all QIDs
    SPLIT_DICT["monolingual_train"]["QIDs"], val_and_test_QIDs = train_test_split(
        QIDs,
        train_size=SPLIT_DICT["monolingual_train"]["size"],
        random_state=seed,
    )
    # 2) subsample the monolingual train set to get the multilingual one
    SPLIT_DICT["multilingual_train"]["QIDs"], _ = train_test_split(
        SPLIT_DICT["monolingual_train"]["QIDs"],
        train_size=SPLIT_DICT["multilingual_train"]["size"],
        random_state=seed,
    )
    # 3) divide the remainder into val and test
    SPLIT_DICT["val"]["QIDs"], SPLIT_DICT["test"]["QIDs"] = train_test_split(
        val_and_test_QIDs,
        train_size=SPLIT_DICT["val"]["size"],
        random_state=seed,
    )
    return SPLIT_DICT

In [9]:
# Load list of classes
classes_list = classes

# SAVE: build the vocabularies and split DataFrames below and write them to disk.
# LOAD: read previously saved vocabularies and split DataFrames from disk.
# The two flags are checked independently (plain `if`, not `if/else`): with both
# False, `vocab` is never assigned and `dict_of_dfs` stays empty, so keep at
# least one of them True.
SAVE = False
LOAD = True

# Seed used as `random_state` for the train/val/test splitting.
SEED = 57

LANGUAGES_LIST = ["english", "russian", "hindi"]
# Per-language state filled in below: file paths, wiki_df, vocab lookups, split dfs.
LANGUAGES_DICT = defaultdict(dict)

for language in LANGUAGES_LIST:
    print(language)
    # Get paths to files
    LANGUAGES_DICT[language]["FILE_NAMES_DICT"] = get_paths(language)

    # Load wiki_df. The context manager closes the file handle as soon as the
    # pickle is read (a bare `open(...)` passed to pkl.load is never closed explicitly).
    # NOTE: unpickling can execute arbitrary code — only load files produced by this project.
    with open(LANGUAGES_DICT[language]["FILE_NAMES_DICT"]["wiki_df"], "rb") as wiki_df_file:
        wiki_df = pkl.load(wiki_df_file)
    LANGUAGES_DICT[language]["wiki_df"] = wiki_df

    # Remove rows with empty labels/tokens
    # (return value is not used — presumably the helper filters the df in place; confirm in preprocess.py)
    remove_rows_with_empty_column(LANGUAGES_DICT[language]["wiki_df"], column="mid_level_categories")
    remove_rows_with_empty_column(LANGUAGES_DICT[language]["wiki_df"], column="tokens")

# This step should be done BEFORE the splits
remove_non_common_articles_and_sort_by_QID(LANGUAGES_DICT)

for cur_dict in LANGUAGES_DICT.values():
    # Binarize labels
    # `classes` must be passed by keyword: MultiLabelBinarizer's parameters are
    # keyword-only in current scikit-learn, so the positional form raises a TypeError there.
    mlb = MultiLabelBinarizer(classes=classes_list)
    cur_dict["wiki_df"]["labels"] =\
        list(mlb.fit_transform(cur_dict["wiki_df"].mid_level_categories))
    # The label columns must follow exactly the order of classes_list.
    assert (mlb.classes_ == classes_list).all()
    
    # Create and save OR load vocabulary
    # (SAVE and LOAD are independent flags; at least one must be True or `vocab` is undefined)
    if SAVE:
        vocab = create_vocab_from_tokens(cur_dict["wiki_df"]["tokens"])
        torch.save(vocab, cur_dict["FILE_NAMES_DICT"]["vocab"])
        print("Saved: ", cur_dict["FILE_NAMES_DICT"]["vocab"])
    if LOAD:
        vocab = torch.load(cur_dict["FILE_NAMES_DICT"]["vocab"])

    index_to_word, word_to_index = create_lookups_for_vocab(vocab)
    cur_dict["index_to_word"], cur_dict["word_to_index"] = index_to_word, word_to_index

# train/val/test sizes and QIDs
splits = ["monolingual_train", "multilingual_train", "val", "test"]
# The English QIDs are used to define the splits for every language
# (presumably all languages share the same QIDs after the intersection step above — confirm).
QIDs = LANGUAGES_DICT["english"]["wiki_df"].QID
SPLIT_DICT = get_dict_of_split_sizes_and_QIDs(QIDs, LANGUAGES_LIST)

# Create and save OR load splitted dfs
for cur_dict in LANGUAGES_DICT.values():
    dict_of_dfs = defaultdict()
    if SAVE:
        # Reuse `splits` instead of re-spelling the list, so the save and load
        # branches cannot drift apart.
        for split in splits:
            dict_of_dfs[split] = cur_dict["wiki_df"][cur_dict["wiki_df"].QID.isin(SPLIT_DICT[split]["QIDs"])]
            # save
            torch.save(dict_of_dfs[split], cur_dict["FILE_NAMES_DICT"][split])
            print("Saved:, ", cur_dict["FILE_NAMES_DICT"][split])
    if LOAD:
        for split in splits:
            dict_of_dfs[split] = torch.load(cur_dict["FILE_NAMES_DICT"][split])

    cur_dict["dict_of_dfs"] = dict_of_dfs

english
Percentage of articles with no mid_level_categories: 0.0 (0 articles)
Percentage of articles with no tokens: 0.0 (0 articles)
russian
Percentage of articles with no mid_level_categories: 0.0 (0 articles)
Percentage of articles with no tokens: 0.0 (0 articles)
hindi
Percentage of articles with no mid_level_categories: 0.0 (0 articles)
Percentage of articles with no tokens: 0.011 (387 articles)
Num of articles initially: 
 dict_keys(['english', 'russian', 'hindi']) 
 [33843, 33843, 33456]
Num of articles after intersection: 
 33456
('full', {'size': 33456})
('monolingual_train', {'size': 26764})
('multilingual_train', {'size': 8921})
('val', {'size': 3345})
('test', {'size': 3347})


In [16]:
# NOTE(review): presumably the standalone-script version of the aligned en/ru/hi
# preprocessing cell above — confirm the two stay in sync.
! python3 ../data_creation/run_aligned_en_ru_hi_df_preprocess.py

['english', 'russian', 'hindi']
english


In [10]:
!ls $PATH_TO_DATA_FOLDER

classes.txt
en_20191201_text_and_topics_common.json
en_20191201_text_and_topics_common.json.bz2
en_df_full.pkl
en_df_wiki_monolingual_train.pt
en_df_wiki_multilingual_train.pt
en_df_wiki_test.pt
en_df_wiki_valid.pt
en_vocab_all.pt
hi_20191201_text_and_topics_common.json
hi_20191201_text_and_topics_common.json.bz2
hi_df_full.pkl
hi_df_wiki_monolingual_train.pt
hi_df_wiki_multilingual_train.pt
hi_df_wiki_test.pt
hi_df_wiki_valid.pt
hi_vocab_all.pt
ru_20191201_text_and_topics_common.json
ru_20191201_text_and_topics_common.json.bz2
ru_df_full.pkl
ru_df_wiki_monolingual_train.pt
ru_df_wiki_multilingual_train.pt
ru_df_wiki_test.pt
ru_df_wiki_valid.pt
ru_vocab_all.pt
vocab_all_en.pt
vocab_all_hi.pt
vocab_all_ru.pt


## Next steps
1. Get the list of labels and save it.  
2. Create the multilingual train, val, and test splits.