In [2]:
from torch.utils.data import Dataset
import pandas as pd
import os, torch, time, math, sys, re, csv
import numpy as np


from pathlib import Path, PurePath
from zipfile import ZipFile
import urllib.request
import shutil
import requests, zipfile, io

sys.path.append('..' + os.sep )
from src import default
from src.data import download as dl, tokenization as tkn, custom_dataset as cd

In [26]:
class BaseDataset(Dataset):
    """You can choose to set a transform (generally a tokenizer) to transform
    what is returned into another form of data more suitable (generally a 
    tensor of tokens).

    To change what a sample is, need only change the method :
        get_instance_pretransform
    """
    def __init__(self, transform=None):
        self.data = [1,2,3]
        self.transform = transform
        
        if self.transform is None:
            self.get_instance = self.get_instance_raw
        else:
            self.get_instance = self.get_instance_transformed # how to get a sample
                                                        # from the dataset
        return
    
    def __len__(self):
        # gives number of samples in dataset
        return len(self.data)
    
    def __getitem__(self, idx):
        # uses function as defined by self.get_instance
        return self.get_instance(idx)
    
    def set_transform(self, transform):
        # can set transform and will change how we get an instance
        self.transform = transform # transform.encode could be used with rawTokenizer (instead of FastTokenizer)
        self.get_instance = self.get_instance_transformed
    
    def get_instance_raw(self, idx):
        return self.data[idx]
    
    def get_instance_transformed(self, idx):
        """
        Once tranform is defined, can get item already transformed

        Input
            idx (int) : index of sample to fetch and transform
        Return
            transformed sample
        """
        instance = self.get_instance_raw(idx)
        # tokenize on-the-fly
        instance = self.transform( instance, return_tensors='pt')
        return instance['input_ids'][0]
    

class ArxivDataset(BaseDataset):
    """
    This Dataset takes the Arxiv data downloaded into a '.csv' files and, when
    called, returns a sample from a list of samples with data_field.
    """
    def __init__(self, csvfile, transform=None, data_field='summary'):
        """
        Loads all of the data and cleans it up slightly. Might wants to call it
        something else instead of 'raw', but for now will do. Sets a transform
        of the data (which is how data is presented if fetched).

        Input
            csvfile (str)               : csv file containing data
            data_field (str)            : name of the field to be used in train on
            transform (function, opt)   : a transform of the data if one already
                                            exists
        """
        super().__init__()
        self.data = pd.read_csv(csvfile)
        self.data_field = data_field
        # last one r'\s+|\\n' seems to be the only one that works
        remove_lst = ['\r\n','\n','\ n',r'\\n',r'\n',r'\s+|\\n']
        self.data.replace(remove_lst,' ',regex=True, inplace=True)

    def get_instance_raw(self, idx):
        # returns some form of the text which will be our sample
        return self.data[self.data_field][idx]

class WikiTextDataset(BaseDataset):
    def __init__(self, dataname='wikitext-2-raw', bptt=35):
        assert dataname in ['wikitext-2','wikitext-2-raw','wikitext-103','wikitext-103-raw']
        assert isinstance(bptt,int)
        super().__init__()
        self.dirname = dataname
        self.bptt = bptt
        zipfile = f'{self.dirname}-v1.zip'
        url = f'https://s3.amazonaws.com/research.metamind.io/wikitext/{zipfile}'
        
        if not Path(default.RAW_DATA_DIR,self.dirname).exists():
            r = requests.get(url)
            z = ZipFile(io.BytesIO(r.content))
            z.extractall(default.RAW_DATA_DIR)
        
        split = ['wiki.train','wiki.valid','wiki.test']      
        if 'raw' in self.dirname:
            split = [ s+'.raw' for s in split ]
        else:
            split = [ s+'.raw' for s in split ]

        for file in split:
            dataString = ''
            with open(PurePath(default.RAW_DATA_DIR,self.dirname,file), 'r') as f:
                dataString += ' '.join([line.strip() for line in f])
                #dataString += ' '.join([line for line in f])
            #dataString = "".join([s for s in dataString.splitlines(True) if s.strip("\r\n")])
        wordList = dataString.split(' ')
        self.data = [' '.join(wordList[i:i + bptt]) for i in range(0, len(wordList), bptt)]
            

In [27]:
# download data
nbrResults = 10**4 # number of data samples to download
filename = f'arxiv_{nbrResults}'
extension = '.csv'
filename += extension

filepath = default.RAW_DATA_DIR + os.sep + filename

if not os.path.exists(filepath):
    dl.arxiv_api( default.RAW_DATA_DIR, filename, max_results=nbrResults ) # TODO : CHANGE SO THAT NOT CONSTANTLY LOADING DATA
print(f'>> Using {filename} for training <<')

>> Using arxiv_10000.csv for training <<


In [28]:
dataset = ArxivDataset(filepath)

In [29]:
dataset[0]

'The effect of the electron-electron cusp on the convergence of configuration interaction (CI) wave functions is examined. By analogy with the pseudopotential approach for electron-ion interactions, an effective electron-electron interaction is developed which closely reproduces the scattering of the Coulomb interaction but is smooth and finite at zero electron-electron separation. The exact many-electron wave function for this smooth effective interaction has o cusp at zero electron-electron separation. We perform CI and quantum Monte Carlo calculations for He and Be atoms, both with the Coulomb electron-electron interaction and with the smooth effective electron-electron interaction. We find that convergence of the CI expansion of the wave function for the smooth electron-electron interaction is ot significantly improved compared with that for the divergent Coulomb interaction for energy differences on the order of 1 mHartree. This shows that, contrary to popular belief, description 

In [65]:
tknzerType = 'BPE'
tknzrFile = default.TOK_DIR + os.sep + filename + '_' + tknzerType + '.json'

# load PreTrainedTokenizerFast, for __call__. __call__ not implemented in
# the base Tokenizer class... that sounds silly, but it is what it is
tknzr = tkn.load_tokenizer(tknzrFile, **default.special_token_lst)

# set tknzr as the transform
dataset.set_transform( tknzr )

In [30]:
wt = WikiTextDataset()

In [31]:
wt[3]

'actors Mark Strong and Derek Jacobi . He was cast in the 2005 theatre productions of the Philip Ridley play Mercury Fur , which was performed at the Drum Theatre in Plymouth and the Menier'