In [1]:
import os

# An empty CUDA_VISIBLE_DEVICES is set before the remaining imports run.
# NOTE(review): presumably meant to hide GPUs from CUDA-aware libraries; none
# is imported in the visible cells — confirm whether this is still needed.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

# `json` below is ujson (third-party), not the standard-library json module.
import ujson as json
import pickle
# NOTE(review): `re` and `tqdm` are not referenced in the visible cells.
import re
from tqdm import tqdm

In [2]:
class Pointer:
    """Persistent integer cursor backed by a pickle file.

    The cursor starts at 0 and is written to ``filename`` every time it is
    advanced through :meth:`increment`, so a later session can call
    :meth:`load` to pick up where the previous one stopped.
    """

    def __init__(self, filename):
        """Create a cursor at position 0 that persists to ``filename``.

        Nothing is read from disk here; call :meth:`load` to restore a
        previously saved position.
        """
        self.filename = filename
        self.index = 0

    def _save(self):
        """Write ``self.index`` to ``self.filename`` without risking the old value.

        The value is pickled into a sibling ``<filename>.tmp`` file which is
        then swapped in with ``os.replace``. Opening the real file with 'wb'
        would truncate it first, so a failed or interrupted dump would destroy
        the previous checkpoint.

        Raises:
            Whatever ``open`` / ``pickle.dump`` / ``os.replace`` raise; the
            existing checkpoint file is left untouched in that case.
        """
        tmp_filename = f'{self.filename}.tmp'
        try:
            with open(tmp_filename, 'wb') as fopen:
                pickle.dump(self.index, fopen)
            os.replace(tmp_filename, self.filename)
        finally:
            # After a successful replace the temp file no longer exists; this
            # only cleans up after a failed write.
            if os.path.exists(tmp_filename):
                os.remove(tmp_filename)

    def increment(self):
        """Advance the cursor by one and persist the new value immediately."""
        self.index += 1
        self._save()

    def load(self):
        """Restore ``self.index`` from ``self.filename`` if that file exists.

        When the file is missing the current index is left unchanged.
        """
        if not os.path.exists(self.filename):
            return
        # The checkpoint is produced by _save(); do not point this class at
        # pickle files from untrusted sources (pickle.load can execute code).
        with open(self.filename, 'rb') as fopen:
            self.index = pickle.load(fopen)

In [18]:
# Run configuration for one extraction batch.
filename = 'dialogpt.tsv'
limit = 500_000  # stop after collecting this many records in a single run
index = 0        # line counter used while scanning `filename`

# The resume position lives next to the data file and is restored if present.
pointer = Pointer(f'{filename}.pickle')
pointer.load()

In [19]:
# Parse the next batch of at most `limit` records from `filename`, starting
# after the line position stored in `pointer`.
results = []
# Count lines inside this cell so that re-running it always scans the file
# with a fresh counter instead of whatever value an earlier run left behind.
index = 0
with open(filename) as fopen:
    for line in fopen:
        index += 1  # lines consumed so far, including this one
        if index <= pointer.index:
            continue  # already consumed by a previous run
        splitted = line.split('\t')
        if len(splitted) == 2:
            left = splitted[0].split()
            right = splitted[1].split()
            # The first token of each side is stored as its score and the rest
            # as text; a row with a blank side is skipped like any other
            # malformed row instead of raising IndexError.
            if left and right:
                results.append({
                    'score': left[0],
                    'left': ' '.join(left[1:]),
                    'right_score': right[0],
                    'right': ' '.join(right[1:]),
                })
        # Record the file position (lines consumed), not the number of rows
        # kept: the resume check above compares against line numbers, so
        # counting only kept rows would re-read lines after any skipped row.
        pointer.index = index
        if len(results) == limit:
            break
# Persist the position once per batch rather than once per row.
pointer._save()

In [20]:
# Number of records collected by the parsing cell in this run.
len(results)

500000

In [21]:
# Preview the first ten parsed records (dicts with score/left/right_score/right).
results[:10]

[{'score': '0.0',
  'left': "On the bright side , despite kidnapping and cruelly abandoning him , it doesn't sound like he was tortured ...",
  'right_score': '1.0',
  'right': "We didn't torture somebody ! USA"},
 {'score': '1.0',
  'left': 'will comments dissapear if ranked low enough ? I can just see the pages with 5000 comments now ..',
  'right_score': '1.0',
  'right': "not yet , but we'll play around with it"},
 {'score': '1.0',
  'left': 'choose flat in the upper right ?',
  'right_score': '1.0',
  'right': 'Of course , that makes replies made by users in the nested view look completely out of context .'},
 {'score': '0.0',
  'left': 'You know whats procrastination ? Sitting in front of your machine all day long doing nothing but reading all the hot articles on reddit .',
  'right_score': '1.0',
  'right': 'Amen to that !'},
 {'score': '1.0',
  'left': 'An article on Micro Economics by Joel Spolsky',
  'right_score': '1.0',
  'right': '... from 2002 !'},
 {'score': '0.0',
  'le

In [22]:
# Split this run's `results` into `partition_size` JSON files named
# '<filename>-<n>', where n starts at `start`.
partition_size = 4  # number of output files, not records per file
start = 0           # numeric suffix of the first output file
# Ceiling division so the trailing records are kept when len(results) is not
# a multiple of partition_size (floor division would silently drop them).
# A separate name is used so the batch-size `limit` read by the parsing cell
# is not overwritten.
partition_len = -(-len(results) // partition_size)
for i in range(partition_size):
    # Later partitions may be shorter (or empty) when results run out.
    b = results[i * partition_len: (i + 1) * partition_len]
    with open(f'{filename}-{i + start}', 'w') as fopen:
        json.dump(b, fopen)