In [1]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm, tqdm_notebook
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Import the project-local helper modules and reload them right away, so that
# re-running this cell re-executes their source files in the running kernel.
from importlib import reload

import utils
reload(utils)

import wiki_parser
# Last expression of the cell: the reloaded module object becomes the cell output.
reload(wiki_parser)

<module 'wiki_parser' from '/home/mz2476/topic-modeling/topic-modeling/baseline/wiki_parser.py'>

In [3]:
# Pick the compute device: the first CUDA GPU when one is visible, the CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# Bare expression so the notebook shows which device was chosen.
device

'cuda:0'

In [4]:
# Folder prefix that the cells below concatenate with each data file name.
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/"

# Run-mode switches read by the cells below:
#   SAVE  - when true, artefacts are written to disk (torch.save / pkl.dump).
#   DEBUG - forwarded as `debug=` to the wiki parser.
#   LOAD  - when true, previously saved artefacts are read back with torch.load.
SAVE = False
DEBUG = True
LOAD = True

## Preprocess the data and save it
Output files:
`vocab_train_en.pt`, `wiki_tensor_dataset_en.pt`, `classes_list.pt`

Preprocess the data (the functions are in `preprocess.py`):
<ol>
    <li> Remove rows with missing labels. </li>
    <li> Remove rows with no tokens. </li>
    <li> Create a set of all categories. Binarize the labels. </li>
    <li> Split in train/val/test. </li>
    <li> Build vocabulary for train. </li>
</ol>

Make DataLoader:
<ol>
    <li> Tokenize train/val/test. </li>
    <li> Create batches using collate function that pads the short sentences. </li>
</ol>

Use pretrained embeddings:
<ol>
    <li> Load pretrained embeddings. </li>
    <li> Create the embedding matrix for the given vocabulary. Words that are in the vocabulary but not in the pretrained embeddings get a zero embedding vector. </li>
</ol>

In [5]:
# Flip the run-mode switches for the preprocessing run: nothing is read back
# from disk (LOAD off) and the freshly built artefacts are written out (SAVE on).
LOAD = False
SAVE = True

In [6]:
import pickle as pkl
from collections import defaultdict

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from preprocess import (create_dict_of_tensor_datasets,
                        create_lookups_for_vocab, create_vocab_from_tokens,
                        remove_non_common_articles_and_sort_by_QID,
                        remove_rows_with_empty_column)

# Compute device: first CUDA GPU if available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Folder prefix; every data file name used in this cell is appended to it.
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/"

# Class list that is later handed to MultiLabelBinarizer to fix the label columns.
classes_list = torch.load(f"{PATH_TO_DATA_FOLDER}45_classes_list.pt")

# Run-mode switches for this cell: build and save everything, load nothing.
SAVE, DEBUG, LOAD = True, True, False

# Split sizes. They are passed as `train_size` to train_test_split further down
# and are also embedded in the names of the saved split files.
monolingual_train_size = 30000
multilingual_train_size = 10000
val_size = 1000

# Random state shared by every train_test_split call in this cell.
SEED = 57

# Explicit language -> code table for the data file names.  Slicing the first
# two letters of the language name only happens to give the right code for
# these three languages (it would give "ge" for german, "sp" for spanish), so
# an unknown language now fails loudly with a KeyError instead of silently
# producing a wrong file name.
LANGUAGE_CODES = {"english": "en", "russian": "ru", "hindi": "hi"}

LANGUAGES_LIST = ["english", "russian", "hindi"]
# language name -> dict holding everything collected for that language
# (file names here; data frames, lookups and splits are added later).
LANGUAGES_DICT = defaultdict(dict)

for language in LANGUAGES_LIST:
    language_code = LANGUAGE_CODES[language]
    # File names are relative to PATH_TO_DATA_FOLDER.
    FILE_NAMES_DICT = {
        "json": f"wikitext_topics_{language_code}_filtered.json",
        "wiki_df": f"wikitext_tokenized_text_sections_outlinks_{language_code}.p",
        "vocab": f"data_for_model/vocab_all_{language_code}.pt",
        "monolingual_train": f"data_for_model/df_wiki_monolingual_train_{monolingual_train_size}_{language_code}.pt",
        "multilingual_train": f"data_for_model/df_wiki_multilingual_train_{multilingual_train_size}_{language_code}.pt",
        "val": f"data_for_model/df_wiki_valid_{val_size}_{language_code}.pt",
        "test": f"data_for_model/df_wiki_test_{language_code}.pt",
#         "tensor_dataset": f"wiki_tensor_dataset_{language_code}.pt",
    }
    LANGUAGES_DICT[language]["FILE_NAMES_DICT"] = FILE_NAMES_DICT
    print(language, "\n", FILE_NAMES_DICT)

# Load wiki_df and remove rows with empty labels/tokens
for language in LANGUAGES_DICT.keys():
    wiki_df_path = PATH_TO_DATA_FOLDER + LANGUAGES_DICT[language]["FILE_NAMES_DICT"]["wiki_df"]
    # `with` closes the file handle even if unpickling fails.
    # NOTE(review): pickle can execute arbitrary code while loading; only read
    # files that were produced by this project.
    with open(wiki_df_path, "rb") as wiki_df_file:
        wiki_df = pkl.load(wiki_df_file)
    LANGUAGES_DICT[language]["wiki_df"] = wiki_df

    # The return values are discarded, so these helpers presumably modify the
    # frame in place -- TODO confirm against preprocess.py.
    remove_rows_with_empty_column(LANGUAGES_DICT[language]["wiki_df"], column="mid_level_categories")
    remove_rows_with_empty_column(LANGUAGES_DICT[language]["wiki_df"], column="tokens")

# Operates on the whole dict at once; judging by its name it keeps only the
# articles present in every language and orders them by QID -- TODO confirm.
remove_non_common_articles_and_sort_by_QID(LANGUAGES_DICT)

# Binarize labels, create vocabulary
for cur_dict in LANGUAGES_DICT.values():
    # `classes` has to be passed by keyword: current scikit-learn releases make
    # the constructor arguments keyword-only.
    mlb = MultiLabelBinarizer(classes=classes_list)
    labels = list(mlb.fit_transform(cur_dict["wiki_df"].mid_level_categories))
    # assign() builds a new frame instead of writing a column into what may be
    # a slice of another frame (the recorded output of this cell shows a
    # SettingWithCopyWarning).
    cur_dict["wiki_df"] = cur_dict["wiki_df"].assign(labels=labels)
    # The label columns must line up with the shared class list for every language.
    assert (mlb.classes_ == classes_list).all()

    vocab_path = PATH_TO_DATA_FOLDER + cur_dict["FILE_NAMES_DICT"]["vocab"]
    if LOAD and not SAVE:
        vocab = torch.load(vocab_path)
    else:
        # Also taken when neither flag is set; previously `vocab` was then
        # undefined (or left over from the previous language).  SAVE still
        # wins over LOAD, as before.
        vocab = create_vocab_from_tokens(cur_dict["wiki_df"]["tokens"])
        if SAVE:
            torch.save(vocab, vocab_path)
            print("Saved: ", cur_dict["FILE_NAMES_DICT"]["vocab"])

    index_to_word, word_to_index = create_lookups_for_vocab(vocab)
    cur_dict["index_to_word"], cur_dict["word_to_index"] = index_to_word, word_to_index


# train/val/test split by QID
# The split is made once on the English QIDs and reused for every language.
QIDs = LANGUAGES_DICT["english"]["wiki_df"].QID
monolingual_train_QIDs, val_and_test_QIDs = train_test_split(QIDs, train_size=monolingual_train_size, random_state=SEED)
# The multilingual training set is a subset of the monolingual one.
multilingual_train_QIDs, _ = train_test_split(monolingual_train_QIDs, train_size=multilingual_train_size, random_state=SEED)
# Validation and test must come from the held-out remainder.  Splitting the
# full `QIDs` series here (as before) put training articles into val and test.
val_QIDs, test_QIDs = train_test_split(val_and_test_QIDs, train_size=val_size, random_state=SEED)
test_size = len(test_QIDs)

# Guard against leakage between the splits (assumes QIDs are unique per article).
assert set(monolingual_train_QIDs).isdisjoint(val_and_test_QIDs), "train overlaps val/test"
assert set(val_QIDs).isdisjoint(test_QIDs), "val overlaps test"

# Split name -> QIDs that belong to that split.  The names double as keys of
# FILE_NAMES_DICT, so the same table drives loading, filtering and saving.
SPLIT_QIDS = {
    "monolingual_train": monolingual_train_QIDs,
    "multilingual_train": multilingual_train_QIDs,
    "val": val_QIDs,
    "test": test_QIDs,
}

for cur_dict in LANGUAGES_DICT.values():
    file_names = cur_dict["FILE_NAMES_DICT"]
    # Plain dict: a defaultdict without a factory behaves exactly like one.
    dict_of_dfs = {}

    if LOAD and not SAVE:
        # Read the previously saved split frames back from disk.
        for name in SPLIT_QIDS:
            dict_of_dfs[name] = torch.load(PATH_TO_DATA_FOLDER + file_names[name])
    else:
        # Rebuild the splits from the full frame.  This branch also runs when
        # neither flag is set (previously the dict stayed empty), and SAVE
        # still takes precedence over LOAD as before.
        language_df = cur_dict["wiki_df"]
        for name, split_QIDs in SPLIT_QIDS.items():
            dict_of_dfs[name] = language_df[language_df.QID.isin(split_QIDs)]
        if SAVE:
            for name in dict_of_dfs.keys():
                torch.save(dict_of_dfs[name], PATH_TO_DATA_FOLDER + file_names[name])
                print("Saved:, ", file_names[name])

    cur_dict["dict_of_dfs"] = dict_of_dfs

# # Tokenized datasets
# for cur_dict in LANGUAGES_DICT.values():
#     create_dict_of_tensor_datasets(dict_of_dfs, word_to_index, max_num_tokens=None)
# ADD


english 
 {'json': 'wikitext_topics_en_filtered.json', 'wiki_df': 'wikitext_tokenized_text_sections_outlinks_en.p', 'vocab': 'data_for_model/vocab_all_en.pt', 'monolingual_train': 'data_for_model/df_wiki_monolingual_train_30000_en.pt', 'multilingual_train': 'data_for_model/df_wiki_multilingual_train_10000_en.pt', 'val': 'data_for_model/df_wiki_valid_1000_en.pt', 'test': 'data_for_model/df_wiki_test_en.pt'}
russian 
 {'json': 'wikitext_topics_ru_filtered.json', 'wiki_df': 'wikitext_tokenized_text_sections_outlinks_ru.p', 'vocab': 'data_for_model/vocab_all_ru.pt', 'monolingual_train': 'data_for_model/df_wiki_monolingual_train_30000_ru.pt', 'multilingual_train': 'data_for_model/df_wiki_multilingual_train_10000_ru.pt', 'val': 'data_for_model/df_wiki_valid_1000_ru.pt', 'test': 'data_for_model/df_wiki_test_ru.pt'}
hindi 
 {'json': 'wikitext_topics_hi_filtered.json', 'wiki_df': 'wikitext_tokenized_text_sections_outlinks_hi.p', 'vocab': 'data_for_model/vocab_all_hi.pt', 'monolingual_train': 'd

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cur_dict["wiki_df"].sort_values(by=["QID"], inplace=True)


Saved:  data_for_model/vocab_all_en.pt
Saved:  data_for_model/vocab_all_ru.pt


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Saved:  data_for_model/vocab_all_hi.pt
Saved:,  data_for_model/df_wiki_monolingual_train_30000_en.pt
Saved:,  data_for_model/df_wiki_multilingual_train_10000_en.pt
Saved:,  data_for_model/df_wiki_valid_1000_en.pt
Saved:,  data_for_model/df_wiki_test_en.pt
Saved:,  data_for_model/df_wiki_monolingual_train_30000_ru.pt
Saved:,  data_for_model/df_wiki_multilingual_train_10000_ru.pt
Saved:,  data_for_model/df_wiki_valid_1000_ru.pt
Saved:,  data_for_model/df_wiki_test_ru.pt
Saved:,  data_for_model/df_wiki_monolingual_train_30000_hi.pt
Saved:,  data_for_model/df_wiki_multilingual_train_10000_hi.pt
Saved:,  data_for_model/df_wiki_valid_1000_hi.pt
Saved:,  data_for_model/df_wiki_test_hi.pt


## Save wiki parsed df
If `LOAD = False`, load the JSON file and parse it to build the DataFrame.

In [None]:
# Run-mode switches for the parsing step: parse from JSON and save the result.
SAVE = True
DEBUG = True
LOAD = False

# Explicit language -> code table for the data file names.  Taking the first
# two letters of the language name only happens to work for these three
# languages; an unknown language now raises KeyError instead of silently
# yielding a wrong file name.
LANGUAGE_CODES = {"english": "en", "russian": "ru", "hindi": "hi"}

LANGUAGES_LIST = ["english", "russian", "hindi"]
# language name -> dict with that language's file names.
LANGUAGES_DICT = defaultdict(dict)

for language in LANGUAGES_LIST:
    language_code = LANGUAGE_CODES[language]
    # Input JSON and output pickle, both relative to PATH_TO_DATA_FOLDER.
    FILE_NAMES_DICT = {
        "json": f"wikitext_topics_{language_code}_filtered.json",
        "wiki_df": f"wikitext_tokenized_text_sections_outlinks_{language_code}.p",
    }
    LANGUAGES_DICT[language]["FILE_NAMES_DICT"] = FILE_NAMES_DICT
    print(language, "\n", FILE_NAMES_DICT)


In [None]:
# Parse the JSON file of every configured language into a tokenized frame and,
# when SAVE is set, pickle it under that language's "wiki_df" file name.
# The previous version referenced `LANGUAGE`, which no visible cell defines,
# and the module-level FILE_NAMES_DICT, which only holds the entry of the last
# language of the loop above -- so at most one language was ever processed.
for language in LANGUAGES_DICT.keys():
    FILE_NAMES_DICT = LANGUAGES_DICT[language]["FILE_NAMES_DICT"]
    # NOTE(review): assumes Parser expects the full language name
    # ("english", "russian", ...) -- TODO confirm against wiki_parser.py.
    parser = wiki_parser.Parser(language)
    wiki_df = parser.get_wiki_tokenized_dataset(
        PATH_TO_DATA_FOLDER + FILE_NAMES_DICT["json"],
        extract_section=True, extract_outlinks=True, debug=DEBUG
    )
    if SAVE:
        # `with` guarantees the pickle file is flushed and closed.
        with open(PATH_TO_DATA_FOLDER + FILE_NAMES_DICT["wiki_df"], "wb") as wiki_df_file:
            pkl.dump(wiki_df, wiki_df_file)

In [None]:
# # Binarize the labels
# # labels list: mlb.classes_
# mlb = MultiLabelBinarizer()
# wiki_df["labels"] = list(mlb.fit_transform(wiki_df.mid_level_categories))

# if SAVE:
#     # SAVE classes list
#     torch.save(mlb.classes_, PATH_TO_DATA_FOLDER + 'classes_list.pt')
#     print("Saved.")

# # LOAD
# classes = torch.load(PATH_TO_DATA_FOLDER + 'classes_list.pt')
# mlb = MultiLabelBinarizer(classes)

# print(classes)
# wiki_df.head()

In [18]:
# Tokenize every split and wrap the results in tensor datasets.
# CHANGE: maximum number of tokens per article (forwarded unchanged to tokenize_dataset).
max_num_tokens = None

# Vocabulary used for indexing (word_to_index) -- pick one of the two options:
vocab_name = "vocab_train"
word_to_index = word_to_index_train
# OR
# vocab_name = "vocab_all"
# word_to_index = word_to_index_all

# Split name -> source frame; the order fixes the key order of the dicts below.
frames_by_split = {"train": wiki_train, "val": wiki_valid, "test": wiki_test}

# Token ids first, labels second, so the keys come out as
# X_train, X_val, X_test, y_train, y_val, y_test.
wiki_tokenized_datasets = {}
for split_name, split_df in frames_by_split.items():
    wiki_tokenized_datasets[f"X_{split_name}"] = tokenize_dataset(
        split_df, word_to_index, max_num_tokens=max_num_tokens
    )
for split_name, split_df in frames_by_split.items():
    wiki_tokenized_datasets[f"y_{split_name}"] = list(split_df.labels)

# Pair each split's token ids with its labels.
wiki_tensor_dataset = {
    split_name: TensoredDataset(
        wiki_tokenized_datasets[f"X_{split_name}"],
        wiki_tokenized_datasets[f"y_{split_name}"],
    )
    for split_name in frames_by_split
}

tensor_dataset_path = f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_{vocab_name}_en.pt'

if SAVE:
    # SAVE tensor datasets
    torch.save(wiki_tensor_dataset, tensor_dataset_path)
    print("Saved.")

# LOAD -- runs regardless of SAVE, so the file must already exist when SAVE is off.
wiki_tensor_dataset = torch.load(tensor_dataset_path)

100%|██████████| 79968/79968 [00:09<00:00, 8749.83it/s] 
100%|██████████| 9996/9996 [00:00<00:00, 12383.81it/s]
100%|██████████| 9996/9996 [00:00<00:00, 13668.84it/s]


Saved.


In [41]:
# reload() needs the module object itself.  The visible cells only use
# `from preprocess import ...`, which never binds the name `preprocess`, so
# import the module explicitly before reloading it.
import preprocess
reload(preprocess)
# Re-bind the class so the name refers to the freshly reloaded definition.
from preprocess import TensoredDataset

In [19]:
# Report the vocabulary size, then show one training example as the cell output.
print(f"Vocab size: {len(word_to_index)}")
wiki_tensor_dataset["train"][200]

Vocab size: 595366


(tensor([13030,  8330,  3721,  8330,  3721,   132,  2496, 13031,  4719,  3982,
         13031,  3178,   303,  5510, 13032,  8334,  2496, 13031,  4719,  1828,
          2496,  1985, 13033, 10701, 13034,     7,  5299,  2338,  6948,     5,
             9,     9,     8, 10510,   480, 13035, 13036, 11814, 13035, 13036,
           965,   933,  2789,     5,   223,    10,   933, 13037,  6777,  1646,
          3271, 13038,  2496, 13031,  4719,  1036, 13039,  1985,  2300,  1495,
           601, 13040,  1495,     5,     9,   208,     6,     5,     9,     9,
            11,   568,     5,     9,     9,   208, 13041,  1467,   403, 13042,
          9309,  1065, 13043, 13044, 13043, 13044,  2300,  2189,  1880,  8330,
          4719,   452,    10,     8,     8,     8, 13035, 13036,    21, 13045,
          2300, 13045,  2641,  3721,  4340,  4251, 13043, 13044, 13046,  2496,
         13031,  4719,  4340, 13045, 13047, 13048, 13049, 13050,  5496,  9571,
           648,     5,     9,    10,     8,     5,  

In [18]:
# # Next step after loading tensor dataset -- create dataloader
# wiki_loaders = {}

# batch_size = 32

# for split, wiki_dataset in wiki_tensor_dataset.items():
#     wiki_loaders[split] = DataLoader(
#         wiki_dataset, 
#         batch_size=batch_size, 
#         shuffle=True, 
#         collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
#     )