In [1]:
import json
import glob
import random

### Download Stanford Large Movie Review Dataset
This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more details.

In [4]:
!curl -o ./data/aclImdb.tar.gz https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
!tar -xzf ./data/aclImdb.tar.gz -C ./data/
!rm -rf ./data/aclImdb.tar.gz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 80.2M  100 80.2M    0     0  20.3M      0  0:00:03  0:00:03 --:--:-- 20.3M


In [3]:
random.seed(742)
# Textflint uses labels x (text) and y (sentiment) for Sentiment Analysis task
pos = random.sample(glob.glob('./data/aclImdb/test/pos/*.txt'), 50)
neg = random.sample(glob.glob('./data/aclImdb/test/neg/*.txt'), 50)

pos_objects = []
for filename in pos:
    with open(filename, 'r') as file:
        pos_objects.append({
            'x': file.read(),
            'y': 'pos'
        })

neg_objects = []
for filename in neg:
    with open(filename, 'r') as file:
        neg_objects.append({
            'x': file.read(),
            'y': 'neg'
        })

with open('./data/aclImdb.json', 'w') as outfile:
    for object in [*pos_objects, *neg_objects]:
        outfile.write(json.dumps(object))
        outfile.write('\n')

## Download Stanford Natural Language Inference Corpus

In [1]:
!curl -o ./data/SNLI1.0.zip https://nlp.stanford.edu/projects/snli/snli_1.0.zip
!unzip ./data/SNLI1.0.zip -d ./data/
!rm -f ./data/SNLI1.0.zip ./data/_MACOSX

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 90.1M  100 90.0M    0     0  20.6M      0  0:00:04  0:00:03  0:00:01 20.6M.1M    0     0  22.5M      0  0:00:03  0:00:03 --:--:-- 22.5M
Archive:  ./data/SNLI1.0.zip
   creating: ./data/snli_1.0/
  inflating: ./data/snli_1.0/.DS_Store  
   creating: ./data/__MACOSX/
   creating: ./data/__MACOSX/snli_1.0/
  inflating: ./data/__MACOSX/snli_1.0/._.DS_Store  
 extracting: ./data/snli_1.0/Icon    
  inflating: ./data/__MACOSX/snli_1.0/._Icon  
  inflating: ./data/snli_1.0/README.txt  
  inflating: ./data/__MACOSX/snli_1.0/._README.txt  
  inflating: ./data/snli_1.0/snli_1.0_dev.jsonl  
  inflating: ./data/snli_1.0/snli_1.0_dev.txt  
  inflating: ./data/snli_1.0/snli_1.0_test.jsonl  
  inflating: ./data/snli_1.0/snli_1.0_test.txt  
  inflating: ./data/snli_1.0/snli_1.0_train.jsonl  
  inflating: ./data/snli_1.0/snli_1.0_train.txt  
 

In [42]:
random.seed(742)

lines = []
with open('./data/snli_1.0/snli_1.0_test.jsonl') as file:
    for line in file:
        lines.append(line)

with open('./data/snli_mini.json', 'w') as outfile:
    for i, line in enumerate(lines):
        line = json.loads(line)
        if i >= 100:
            break
        elif line['gold_label'] == '-':
            continue
        
        newline = {
            'premise': line['sentence1'],
            'hypothesis': line['sentence2'],
            'y': line['gold_label']
        }
        outfile.write(json.dumps(newline))
        outfile.write('\n')

with open('./data/snli_large.json', 'w') as outfile:
    for i, line in enumerate(lines):
        line = json.loads(line)
        if i >= 9000:
            break
        elif line['gold_label'] == '-':
            continue
        
        newline = {
            'premise': line['sentence1'],
            'hypothesis': line['sentence2'],
            'y': line['gold_label']
        }
        outfile.write(json.dumps(newline))
        outfile.write('\n')

## SQuAD2.0 MRC Dataset

In [51]:
!mkdir ./data/SQuAD
!curl -o ./data/SQuAD/dev-v2.0.json https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json

mkdir: cannot create directory ‘./data/SQuAD’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 4268k  100 4268k    0     0  64.1M      0 --:--:-- --:--:-- --:--:-- 64.1M


In [57]:
random.seed(742)

data = []
with open('./data/SQuAD/dev-v2.0.json') as file:
    data = json.load(file)['data']

samples = []

for obj in data:
    for para in obj['paragraphs']:
        for qa in para['qas']:
            sample = {
                'title': obj['title'],
                'context': para['context'],
                'question': qa['question'],
                'answers': qa['answers'],
                'is_impossible': qa['is_impossible']
            }
            samples.append(sample)

samples = random.sample(samples, 1250)

with open('./data/squad2.0.json', 'w') as outfile:
    for sample in samples:
        outfile.write(json.dumps(sample))
        outfile.write('\n')

# ReClor MRC+ Dataset

In [33]:
!curl -L https://github.com/yuweihao/reclor/releases/download/v1/reclor_data.zip > ./data/reclor_data.zip
!unzip -P for_non-commercial_research_purpose_only -d ./data/reclor_data ./data/reclor_data.zip
!rm -f ./data/reclor_data.zip

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 1928k  100 1928k    0     0  27.2M      0 --:--:-- --:--:-- --:--:-- 27.2M
Archive:  ./data/reclor_data.zip
  inflating: ./data/reclor_data/question_type_names.json  
  inflating: ./data/reclor_data/source_list.txt  
  inflating: ./data/reclor_data/test.json  
  inflating: ./data/reclor_data/train.json  
  inflating: ./data/reclor_data/use_items.txt  
  inflating: ./data/reclor_data/val.json  


In [44]:
random.seed(742)

data = []
with open('./data/reclor_data/train.json', 'r') as file:
    data += json.load(file)
with open('./data/reclor_data/val.json', 'r') as file:
    data += json.load(file)

with open('./data/reclor.json', 'w') as outfile:
    samples = random.sample(data, 2500)

    for sample in samples:
        new_sample = {
            'context': sample['context'],
            'question': sample['question'],
            'answer_choices': sample['answers'],
            'answers': [sample['answers'][sample['label']]], # needed to not throw error with Engine.run()
            'label': sample['label'],
            'title': sample['id_string'],
            'is_impossible': False
        }

        outfile.write(json.dumps(new_sample))
        outfile.write('\n')