In [12]:
# import dependencies
import nltk
import json
import io
import gzip
import torch
import string
import random
import jsonlines
import pandas as pd
import pickle as pkl
import numpy as np
from tqdm import tqdm, tqdm_notebook
from functools import partial

import torch
import torch.nn as nn
import torch.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, RandomSampler, SequentialSampler, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer

In [53]:
from importlib import reload

import utils
reload(utils)

import wiki_parser
reload(wiki_parser)

<module 'wiki_parser' from '/home/mz2476/topic-modeling/topic-modeling/baseline/wiki_parser.py'>

In [54]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
device

'cpu'

In [55]:
PATH_TO_DATA_FOLDER = "/scratch/mz2476/wiki/data/aligned_datasets/"

SAVE = False
DEBUG = True
LOAD = False

## Load df
If `LOAD = False`, load json and process it to get df

In [49]:
LANGUAGE = "hindi"
LANGUAGE_CODE = LANGUAGE[:2]
FILE_NAMES_DICT = {
    "classes_list": 'classes_list.pt',
    
    "json": f"wikitext_topics_{LANGUAGE_CODE}_filtered.json",
    "wiki_df": f"wikitext_tokenized_text_sections_outlinks_{LANGUAGE_CODE}.p",
#     "vocab": f"vocab_train_{LANGUAGE_CODE}.pt",
#     "train": f"df_wiki_train_{LANGUAGE_CODE}.pt",
#     "val": f"df_wiki_valid_{LANGUAGE_CODE}.pt",
#     "test": f"df_wiki_test_{LANGUAGE_CODE}.pt",
#     "tensor_dataset": f"wiki_tensor_dataset_{LANGUAGE_CODE}.pt",
}

print(FILE_NAMES_DICT)

{'classes_list': 'classes_list.pt', 'json': 'wikitext_topics_hi_filtered.json', 'wiki_df': 'wikitext_tokenized_text_sections_outlinks_hi.p'}


In [50]:
if LOAD:
    wiki_df = pkl.load(open(PATH_TO_DATA_FOLDER + FILE_NAMES_DICT["wiki_df"], "rb"))

else:
    parser = wiki_parser.Parser(LANGUAGE)
    wiki_df = parser.get_wiki_tokenized_dataset(
        PATH_TO_DATA_FOLDER + FILE_NAMES_DICT["json"],
        extract_section=True, extract_outlinks=True, debug=DEBUG
    )
    if SAVE:
        pkl.dump(wiki_df, open(
            PATH_TO_DATA_FOLDER + FILE_NAMES_DICT["wiki_df"], "wb"))

[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [56]:
parser = wiki_parser.Parser(LANGUAGE)
wiki_df_raw = parser.get_wiki_tokenized_dataset(
    PATH_TO_DATA_FOLDER + FILE_NAMES_DICT["json"],
    extract_section=True, extract_outlinks=True, debug=DEBUG
)

[nltk_data] Downloading package stopwords to /home/mz2476/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))

In [58]:
wiki_df_raw.tokens[3]

'{{देवनागरी साइडबार}}\n\n[[चित्र:rigveda ms2097.jpg|right|thumb|300px|\'\'\'देवनागरी\'\'\' में लिखी [[ऋग्वेद]] की [[पाण्डुलिपि]]]]\n\n\n\'\'\'देवनागरी\'\'\' एक भारतीय [[लिपियों की सूची|लिपि]] है जिसमें अनेक [[भारत|भारतीय]] [[भाषा|भाषाएँ]] तथा कई विदेशी भाषाएँ लिखी जाती हैं। यह बायें से दायें लिखी जाती है। इसकी पहचान एक क्षैतिज रेखा से है जिसे \'शिरिरेखा\' कहते हैं। [[संस्कृत]], [[पालि]], [[हिंदी]], [[मराठी]], [[कोंकणी]], [[सिंधी]], [[कश्मीरी भाषा|कश्मीरी]], [[डोगरी]], [[खस]], [[नेपाल भाषा]] (तथा अन्य नेपाली भाषाएँ), [[तामाङ भाषा]], [[गढ़वाली]], [[बोड़ो भाषा|बोडो]], [[अंगिका]], [[मगही]], [[भोजपुरी]], [[नागपुरी भाषा|नागपुरी]],  [[मैथिली]], [[संथाली]] आदि भाषाएँ देवनागरी में लिखी जाती हैं। इसके अतिरिक्त कुछ स्थितियों में [[गुजराती]], [[पंजाबी]], [[बिष्णुपुरिया मणिपुरी]], [[रोमानी]] और [[उर्दू]] भाषाएँ भी देवनागरी में लिखी जाती हैं। देवनागरी विश्व में सर्वाधिक प्रयुक्त लिपियों में से एक है। \n[[चित्र:marathishilalekhyear1109found atparalmaharashtraindia.jpg|thumb| मुंबई के परल नामक उपनगर म

In [52]:
wiki_df.tokens[3][-20:]

['तीन',
 'अंग्रेजी',
 'देवनागरी',
 'लिखें',
 'देवनागरी',
 'सॉफ्टवेयर',
 'केविन',
 'कार्मोदी',
 'तीन',
 'देवनागरी',
 'लिपि',
 'संगणक',
 'अम्बा',
 'कुलकर्णी',
 'विभागाध्यक्ष',
 'संस्कृत',
 'अध्ययन',
 'विभाग',
 'हैदराबाद',
 'विश्वविद्यालय']

## Preprocess it and save 
Output files:
`vocab_train_en.pt`, `wiki_tensor_dataset_en.pt`, `classes_list.pt`

Preprocess the data (the functions are in `preprocess.py`):
<ol>
    <li> Remove rows with missing labels. </li>
    <li> Remove rows with no tokens. </li>
    <li> Create a set of all categories. Binarize the labels. </li>
    <li> Split in train/val/test. </li>
    <li> Build vocabulary for train. </li>
</ol>

Make DataLoader:
<ol>
    <li> Tokenize train/val/test. </li>
    <li> Create batches using collate function that pads the short sentences. </li>
</ol>

Use pretrained embeddings:
<ol>
    <li> Load pretrained embeddings. </li>
    <li> Create embedding matrix for given vocabulary. Words that are in given vocabualry but not in pretrained embeddings have zero embedding vector. </li>
</ol>

In [22]:
import preprocess
import importlib
importlib.reload(preprocess)

from preprocess import train_validate_test_split
from preprocess import tokenize_dataset, TensoredDataset, pad_collate_fn
from preprocess import create_vocab_from_tokens, create_lookups_for_vocab

In [32]:
#Removing rows with missing labels
mask = wiki_df.mid_level_categories.apply(lambda x: len(x) > 0)
print(f"% of articles with missing labels: {(1 - mask).sum() / mask.shape[0]} ({(1 - mask).sum()} articles)")
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

% of articles with missing labels: 0.0 (0 articles)


(32825, 4)

In [31]:
(1 - mask).sum()

998

In [11]:
#Removing rows with no tokens
mask = wiki_df.tokens.apply(lambda x: len(x) > 0)
wiki_df = wiki_df[mask]
wiki_df = wiki_df.reset_index(drop=True)
wiki_df.shape

(99960, 3)

In [12]:
# Binarize the labels
# labels list: mlb.classes_
mlb = MultiLabelBinarizer()
wiki_df["labels"] = list(mlb.fit_transform(wiki_df.mid_level_categories))

if SAVE:
    # SAVE classes list
    torch.save(mlb.classes_, PATH_TO_DATA_FOLDER + 'classes_list.pt')
    print("Saved.")

# LOAD
classes = torch.load(PATH_TO_DATA_FOLDER + 'classes_list.pt')
mlb = MultiLabelBinarizer(classes)

print(classes)
wiki_df.head()

Saved.
['Culture.Arts' 'Culture.Broadcasting' 'Culture.Crafts and hobbies'
 'Culture.Entertainment' 'Culture.Food and drink' 'Culture.Games and toys'
 'Culture.Internet culture' 'Culture.Language and literature'
 'Culture.Media' 'Culture.Music' 'Culture.Performing arts'
 'Culture.Philosophy and religion' 'Culture.Plastic arts' 'Culture.Sports'
 'Culture.Visual arts' 'Geography.Africa' 'Geography.Americas'
 'Geography.Antarctica' 'Geography.Asia' 'Geography.Bodies of water'
 'Geography.Europe' 'Geography.Landforms' 'Geography.Maps'
 'Geography.Oceania' 'Geography.Parks'
 'History_And_Society.Business and economics'
 'History_And_Society.Education' 'History_And_Society.History and society'
 'History_And_Society.Military and warfare'
 'History_And_Society.Politics and government'
 'History_And_Society.Transportation' 'STEM.Biology' 'STEM.Chemistry'
 'STEM.Engineering' 'STEM.Geosciences' 'STEM.Information science'
 'STEM.Mathematics' 'STEM.Medicine' 'STEM.Meteorology' 'STEM.Physics'
 'STEM

Unnamed: 0,QID,mid_level_categories,tokens,labels
0,Q2000864,[Culture.Philosophy and religion],"[affirming, consequent, sometimes, called, con...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
1,Q1064113,[History_And_Society.Business and economics],"[growth, two, six, two, zero, one, six, zero, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q6941060,[Geography.Europe],"[museum, work, arbetets, museum, swedish, muse...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,Q843920,"[History_And_Society.History and society, STEM...","[like, one, dorset, england, arable, land, lat...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,Q178999,"[STEM.Biology, STEM.Medicine]","[axon, greek, axis, nerve, fiber, long, slende...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [13]:
# train/val/test split
wiki_train, wiki_valid, wiki_test = train_validate_test_split(wiki_df, seed=1)

wiki_train = wiki_train.reset_index(drop=True)
wiki_valid = wiki_valid.reset_index(drop=True)
wiki_test = wiki_test.reset_index(drop=True)

if SAVE:
    # SAVE train/val/test dfs
    torch.save(wiki_train, PATH_TO_DATA_FOLDER + "df_wiki_train_en.pt")
    torch.save(wiki_valid, PATH_TO_DATA_FOLDER + "df_wiki_valid_en.pt")
    torch.save(wiki_test, PATH_TO_DATA_FOLDER + "df_wiki_test_en.pt")
    print("Saved.")

wiki_train = torch.load(PATH_TO_DATA_FOLDER + "df_wiki_train_en.pt")
wiki_valid = torch.load(PATH_TO_DATA_FOLDER + "df_wiki_valid_en.pt")
wiki_test = torch.load(PATH_TO_DATA_FOLDER + "df_wiki_test_en.pt")


Saved.


In [14]:
wiki_train.head()

Unnamed: 0,QID,mid_level_categories,tokens,labels
0,Q5346784,[Culture.Language and literature],"[edwin, romanzo, elmer, one, eight, five, zero...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
1,Q4723109,[Culture.Language and literature],"[alfred, george, fysh, machin, born, one, eigh...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,Q1456016,"[Geography.Americas, Culture.Music]","[late, friends, first, full, length, studio, a...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ..."
3,Q59149462,"[Geography.Americas, Culture.Sports, Culture.L...","[mat, alexis, romero, born, one, february, one...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ..."
4,Q30602920,"[Culture.Plastic arts, Geography.Americas, Cul...","[confederate, memorial, fountain, historic, fo...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, ..."


In [23]:
# Building vocabulary: train and all
vocab_train = create_vocab_from_tokens(wiki_train['tokens'])
vocab_val = create_vocab_from_tokens(wiki_valid['tokens'])
vocab_test = create_vocab_from_tokens(wiki_test['tokens'])

vocab_all = create_vocab_from_tokens([vocab_train, vocab_val, vocab_test])

print("Train vocab size is: {}".format(len(vocab_train)))
print("All vocab size is: {}".format(len(vocab_all)))

index_to_word_train, word_to_index_train = create_lookups_for_vocab(vocab_train, add_tokens_list=["<pad>", "<unk>"])
index_to_word_all, word_to_index_all = create_lookups_for_vocab(vocab_all, add_tokens_list=["<pad>", "<unk>"])

if SAVE:
    # SAVE vocab from train, all
    torch.save(index_to_word_train, PATH_TO_DATA_FOLDER + 'vocab_train_en.pt')
    torch.save(index_to_word_all, PATH_TO_DATA_FOLDER + 'vocab_all_en.pt')
    print("Saved.")

# LOAD
vocab_train = torch.load(PATH_TO_DATA_FOLDER + 'vocab_train_en.pt')
vocab_all = torch.load(PATH_TO_DATA_FOLDER + 'vocab_all_en.pt')

index_to_word_train, word_to_index_train = create_lookups_for_vocab(vocab_train)
index_to_word_all, word_to_index_all = create_lookups_for_vocab(vocab_all)

Train vocab size is: 595364
All vocab size is: 682848


In [18]:
# tokenize datasets
# CHANGE max number of tokens per article
max_num_tokens = None

# # specify vocabulary (word_to_index): 2 options
vocab_name = "vocab_train" 
word_to_index = word_to_index_train
# OR
# vocab_name = "vocab_all"
# word_to_index = word_to_index_all

wiki_tokenized_datasets = {}
wiki_tokenized_datasets['X_train'] = tokenize_dataset(wiki_train, word_to_index, max_num_tokens=max_num_tokens)
wiki_tokenized_datasets['X_val'] = tokenize_dataset(wiki_valid, word_to_index, max_num_tokens=max_num_tokens)
wiki_tokenized_datasets['X_test'] = tokenize_dataset(wiki_test, word_to_index, max_num_tokens=max_num_tokens)

wiki_tokenized_datasets['y_train'] = list(wiki_train.labels)
wiki_tokenized_datasets['y_val'] = list(wiki_valid.labels)
wiki_tokenized_datasets['y_test'] = list(wiki_test.labels)

wiki_tensor_dataset = {}
wiki_tensor_dataset['train'] = TensoredDataset(
    wiki_tokenized_datasets['X_train'], wiki_tokenized_datasets['y_train']
)
wiki_tensor_dataset['val'] = TensoredDataset(
    wiki_tokenized_datasets['X_val'], wiki_tokenized_datasets['y_val']
)
wiki_tensor_dataset['test'] = TensoredDataset(
    wiki_tokenized_datasets['X_test'], wiki_tokenized_datasets['y_test']
)

if SAVE:
    # SAVE tensor datasets
    torch.save(wiki_tensor_dataset, f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_{vocab_name}_en.pt')
    print("Saved.")

# LOAD
wiki_tensor_dataset = torch.load(f'{PATH_TO_DATA_FOLDER}wiki_tensor_dataset_{vocab_name}_en.pt')

100%|██████████| 79968/79968 [00:09<00:00, 8749.83it/s] 
100%|██████████| 9996/9996 [00:00<00:00, 12383.81it/s]
100%|██████████| 9996/9996 [00:00<00:00, 13668.84it/s]


Saved.


In [19]:
print("Vocab size:", len(word_to_index))
wiki_tensor_dataset["train"].__getitem__(200)

Vocab size: 595366


(tensor([13030,  8330,  3721,  8330,  3721,   132,  2496, 13031,  4719,  3982,
         13031,  3178,   303,  5510, 13032,  8334,  2496, 13031,  4719,  1828,
          2496,  1985, 13033, 10701, 13034,     7,  5299,  2338,  6948,     5,
             9,     9,     8, 10510,   480, 13035, 13036, 11814, 13035, 13036,
           965,   933,  2789,     5,   223,    10,   933, 13037,  6777,  1646,
          3271, 13038,  2496, 13031,  4719,  1036, 13039,  1985,  2300,  1495,
           601, 13040,  1495,     5,     9,   208,     6,     5,     9,     9,
            11,   568,     5,     9,     9,   208, 13041,  1467,   403, 13042,
          9309,  1065, 13043, 13044, 13043, 13044,  2300,  2189,  1880,  8330,
          4719,   452,    10,     8,     8,     8, 13035, 13036,    21, 13045,
          2300, 13045,  2641,  3721,  4340,  4251, 13043, 13044, 13046,  2496,
         13031,  4719,  4340, 13045, 13047, 13048, 13049, 13050,  5496,  9571,
           648,     5,     9,    10,     8,     5,  

In [18]:
# # Next step after loading tensor dataset -- create dataloader
# wiki_loaders = {}

# batch_size = 32

# for split, wiki_dataset in wiki_tensor_dataset.items():
#     wiki_loaders[split] = DataLoader(
#         wiki_dataset, 
#         batch_size=batch_size, 
#         shuffle=True, 
#         collate_fn=partial(pad_collate_fn, word_to_index=word_to_index)
#     )