# In[7]:
#Done for Paper
import unicodedata


def whitespace_tokenize(text, split):
    """Trim surrounding whitespace from `text` and optionally split it.

    Args:
        text: The string to clean.
        split: When truthy, return the list of whitespace-separated pieces;
            otherwise return the trimmed string itself.

    Returns:
        A list of tokens (empty for blank input) when `split` is truthy,
        else the stripped string (empty for blank input).
    """
    trimmed = text.strip()
    # str.split() with no arguments already yields [] for an empty string,
    # so blank input needs no separate branch.
    return trimmed.split() if split else trimmed

def _is_whitespace(char):
	"""Checks whether `chars` is a whitespace character."""
	# \t, \n, and \r are technically contorl characters but we treat them
	# # as whitespace since they are generally considered as such.
	if char == " " or char == "\t" or char == "\n" or char == "\r":
		return True
	cat = unicodedata.category(char)
	if cat == "Zs":
		return True
	return False


def _is_control(char):
	"""Checks whether `chars` is a control character."""
	# These are technically control characters but we count them as whitespace characters.

	if char == "\t" or char == "\n" or char == "\r":
		return False
	cat = unicodedata.category(char)
	if cat in ("Cc", "Cf"):
		return True
	return False


def _is_punctuation(char):
	"""Checks whether `chars` is a punctuation character."""
	cp = ord(char)
	# We treat all non-letter/number ASCII as punctuation.
	# Characters such as "^", "$", and "`" are not in the Unicode
	# Punctuation class but we treat them as punctuation anyways, for
	# consistency.
	if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or
		(cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)):
		return True
	cat = unicodedata.category(char)
	if cat.startswith("P"):
		return True
	return False

class BasicTokenizer(object):
    """Basic text normalizer: cleanup, optional lower-casing, punctuation splitting."""

    def __init__(self, do_lower_case=True):
        """Create the tokenizer.

        Args:
            do_lower_case: When truthy, each token is lower-cased and has
                its accents stripped before punctuation is split off.
        """
        self.do_lower_case = do_lower_case

    def tokenize(self, text):
        """Normalize `text` and return it as one space-separated string.

        The text is cleaned, split on whitespace, optionally lower-cased
        and accent-stripped, and each piece is split around punctuation.
        The resulting pieces are joined with single spaces, so the return
        value is a string rather than a list of tokens.
        """
        cleaned = self._clean_text(text)
        pieces = []
        for word in whitespace_tokenize(cleaned, split=True):
            if self.do_lower_case:
                # Accent stripping is tied to lower-casing: it only runs
                # when do_lower_case is set.
                word = self._run_strip_accents(word.lower())
            pieces.extend(self._run_split_on_punc(word))
        return whitespace_tokenize(" ".join(pieces), split=False)

    def _run_strip_accents(self, text):
        """Return `text` with combining accent marks removed."""
        # NFD separates base characters from their combining marks, which
        # then show up as characters of category "Mn" and can be dropped.
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(
            ch for ch in decomposed if unicodedata.category(ch) != "Mn"
        )

    def _run_split_on_punc(self, text):
        """Split `text` so that every punctuation character stands alone.

        Returns a list of strings in which each punctuation character is
        its own item and each run of other characters is one item.
        """
        groups = []
        in_word = False
        for ch in text:
            if _is_punctuation(ch):
                groups.append([ch])
                in_word = False
                continue
            if not in_word:
                # First non-punctuation character after punctuation (or at
                # the start) opens a new group.
                groups.append([])
                in_word = True
            groups[-1].append(ch)
        return ["".join(group) for group in groups]

    def _clean_text(self, text):
        """Drop invalid and control characters and normalize whitespace.

        NUL, the replacement character U+FFFD and control characters are
        removed; every whitespace character becomes a plain space.
        """
        kept = []
        for ch in text:
            if ord(ch) in (0, 0xfffd) or _is_control(ch):
                continue
            kept.append(" " if _is_whitespace(ch) else ch)
        return "".join(kept)

# In[8]:
import json

# Tokenize the 'text' field of every record in a JSON file and save the
# results in JSON Lines format (one {"text": ...} object per line).

# The output goes to its own file. Opening the input path with 'w' would
# truncate the source data before a single record had been written, and the
# JSONL written in its place could not be parsed by json.load on a rerun.
input_path = '/Users/balazs/Desktop/debateorg.json'
output_path = '/Users/balazs/Desktop/debateorg_tokenized.jsonl'

# Load the JSON file. `data` stays None when loading fails so the steps
# below can skip cleanly instead of hitting an undefined name.
data = None
try:
    with open(input_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
except (OSError, ValueError) as e:
    # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
    print("Error reading JSON file:", str(e))

# Extract the relevant field that you want to tokenize
text_list = []
if isinstance(data, list):
    text_list = [d['text'] for d in data]
elif data is not None:
    # Only report a shape problem when something was actually loaded; a
    # failed load has already been reported above.
    print("Error: JSON file does not contain a list of dictionaries.")

# Create an instance of the BasicTokenizer class
tokenizer = BasicTokenizer()

# Tokenize the text and write to a new file in the JSONL format.
# Nothing is written when there are no texts, so a failed load never
# leaves an empty output file behind.
if text_list:
    try:
        with open(output_path, 'w', encoding='utf-8') as f:
            for text in text_list:
                tokenized = tokenizer.tokenize(text)
                # tokenize() returns one space-separated string; joining a
                # str with ' ' would put a space between every character.
                # Only join when a sequence of tokens comes back.
                if not isinstance(tokenized, str):
                    tokenized = ' '.join(tokenized)
                json.dump({'text': tokenized}, f)
                f.write('\n')
    except Exception as e:
        print("Error writing JSONL file:", str(e))

: 

: 