In [1]:
import re
import os
import random
import tarfile
import urllib
from torchtext import data


class TarDataset(data.Dataset):
    """Defines a Dataset loaded from a downloadable tar archive.

    Attributes:
        url: URL where the tar archive can be downloaded.
        filename: Filename of the downloaded tar archive.
        dirname: Name of the top-level directory within the zip archive that
            contains the data files.
    """

    @classmethod
    def download_or_unzip(cls, root):
        path = os.path.join(root, cls.dirname)
        if not os.path.isdir(path):
            tpath = os.path.join(root, cls.filename)
            if not os.path.isfile(tpath):
                print('downloading')
                urllib.request.urlretrieve(cls.url, tpath)
            with tarfile.open(tpath, 'r') as tfile:
                print('extracting')
                tfile.extractall(root)
        return os.path.join(path, '')


class MR(TarDataset):

    url = 'https://www.cs.cornell.edu/people/pabo/movie-review-data/rt-polaritydata.tar.gz'
    filename = 'rt-polaritydata.tar'
    dirname = 'rt-polaritydata'

    @staticmethod
    def sort_key(ex):
        return len(ex.text)

    def __init__(self, text_field, label_field, path=None, examples=None, **kwargs):
        """Create an MR dataset instance given a path and fields.

        Arguments:
            text_field: The field that will be used for text data.
            label_field: The field that will be used for label data.
            path: Path to the data file.
            examples: The examples contain all the data.
            Remaining keyword arguments: Passed to the constructor of
                data.Dataset.
        """
        def clean_str(string):
            """
            Tokenization/string cleaning for all datasets except for SST.
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

        text_field.preprocessing = data.Pipeline(clean_str)
        fields = [('text', text_field), ('label', label_field)]

        if examples is None:
            path = self.dirname if path is None else path
            examples = []
            with open(os.path.join(path, 'rt-polarity.neg'), errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'negative'], fields) for line in f]
            with open(os.path.join(path, 'rt-polarity.pos'), errors='ignore') as f:
                examples += [
                    data.Example.fromlist([line, 'positive'], fields) for line in f]
        super(MR, self).__init__(examples, fields, **kwargs)

    @classmethod
    def splits(cls, text_field, label_field, dev_ratio=.1, shuffle=True, root='.', **kwargs):
        """Create dataset objects for splits of the MR dataset.

        Arguments:
            text_field: The field that will be used for the sentence.
            label_field: The field that will be used for label data.
            dev_ratio: The ratio that will be used to get split validation dataset.
            shuffle: Whether to shuffle the data before split.
            root: The root directory that the dataset's zip archive will be
                expanded into; therefore the directory in whose trees
                subdirectory the data files will be stored.
            train: The filename of the train data. Default: 'train.txt'.
            Remaining keyword arguments: Passed to the splits method of
                Dataset.
        """
        path = cls.download_or_unzip(root)
        examples = cls(text_field, label_field, path=path, **kwargs).examples
        if shuffle: random.shuffle(examples)
        dev_index = -1 * int(dev_ratio*len(examples))

        return (cls(text_field, label_field, examples=examples[:dev_index]),
                cls(text_field, label_field, examples=examples[dev_index:]))


In [2]:
import pandas as pd
import gzip
from collections import Counter
import re

def load_initial_idata(path):
    df = []
    for line in gzip.open(path, 'rb'):
        df.append(eval(line))
    return pd.DataFrame.from_dict(df)

def clean_str(string):
            """
            Tokenization/string cleaning for all datasets
            Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
            """
            #comment = re.sub(r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", str(comment))
            string = string.lower()
            string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
            string = re.sub(r"\'s", " \'s", string)
            string = re.sub(r"\'ve", " \'ve", string)
            string = re.sub(r"n\'t", " n\'t", string)
            string = re.sub(r"\'re", " \'re", string)
            string = re.sub(r"\'d", " \'d", string)
            string = re.sub(r"\'ll", " \'ll", string)
            string = re.sub(r",", " , ", string)
            string = re.sub(r"!", " ! ", string)
            string = re.sub(r"\(", " \( ", string)
            string = re.sub(r"\)", " \) ", string)
            #string = re.sub(r"\?", " \? ", string)
            string = re.sub(r"\s{2,}", " ", string)
            return string.strip()

In [3]:
#A.head()['questionType']

In [4]:
def add_yesno_questions(dataframe):
    with open('customized_data/yesno_questions.txt', 'a') as fl:
        for i in range(len(dataframe)):
            if dataframe['questionType'][i]=='yes/no':
                fl.write(clean_str(dataframe['question'][i])+'\n')

In [5]:
def add_openend_questions(dataframe):
    with open('customized_data/openend_questions.txt', 'a') as fl:
        for i in range(len(dataframe)):
            if dataframe['questionType'][i]=='open-ended':
                fl.write(clean_str(dataframe['question'][i])+'\n')

In [11]:
import os

for file in os.listdir("amazon"):
    fl = load_initial_data("amazon/"+file)
    add_yesno_questions(fl)
    add_openend_questions(fl)

In [69]:
def load_customized_data(fl):
    df = []
    with open(fl) as data:
        for line in data:
            df.append(line.strip())
    return pd.DataFrame.from_dict(df)
    
d = load_customized_data('customized_data/yesno_questions.txt')
d.preprocessing = data.Pipeline()
#d = list(d[0])
#d[0:10]


#label_field = "yes/no"

#fields = [('question', d), ('type_label', label_field)]
#data.Dataset(text_field, label_field, path=None, examples=None)

In [59]:
a = pd.DataFrame
a.preprocessing = data.Pipeline()
#text_field.preprocessing = data.Pipeline()

In [71]:
#examples = []
#for file in os.listdir("customized_data"):
    #examples+=[data.Example.fromlist([line, file[:-14]], fields) for line in file]

In [None]:
c = "openend_questions.txt"
c[:-14]

In [54]:
class PredictQuestionType(data.Dataset):
    def __init__(self, text_field, label_field, **kwargs):
        fields = [("Question", text_field), ("Type", label_field)]
        examples = []
        files = os.listdir("customized_data")
        labels = [label[:-14] for label in files]
        for label in labels:
            for fname in files:
                with open("customized_data/"+fname, 'r') as f: text = f.readline()
                examples.append(data.Example.fromlist([text, label], fields))
        super().__init__(examples, fields, **kwargs)

    @staticmethod
    def sort_key(ex): return len(ex.Description)
    
    @classmethod
    def splits(cls, text_field, label_field, root='.',
               train='train', test='test', **kwargs):
        return super().splits(
            root, text_field=text_field, label_field=label_field,
            train=train, validation=None, test=test, **kwargs)

In [67]:
custom_data = PredictQuestionType(pd.DataFrame.prepocessing, [])

TypeError: __init__() missing 2 required positional arguments: 'text_field' and 'label_field'