# split_data
**Author:** Khoi Nguyen

**Date created:** 03/06/2023

**Last modified:** 04/15/2023

**Description:** This notebook splits data into train and test sets.

In [1]:
# Imports
import json
import random

### Ada
Split data into train and test sets inside the `data/ada` folder.

In [2]:
# randomly choose 1,000 samples from `data/generics.json` to be used for testing. The data is stored as {"sentences" : [{"sentence" : sentence1}, ...]}
# load the data
with open('data/generics.json') as json_file:
    data = json.load(json_file)

# Shuffle the data and split it into training and testing
sentences = data['sentences']
random.shuffle(sentences)
# train = sentences[:1000] # 1,000 sentences for training is not needed
test = sentences[-1000:]

# save the data
# with open('data/1k_ada/train.json', 'w') as outfile:
#     json.dump(train, outfile)

with open('data/ada/test.json', 'w') as outfile:
    json.dump(test, outfile)

In [3]:
# Print the first 10 samples reading from the test file with prettier formatting
with open('data/ada/test.json') as f:
    data = json.load(f)
    print(json.dumps(data[:5], indent=2))

[
  {
    "sentence": "Love is the immortal flow of energy that nourishes and preserves."
  },
  {
    "sentence": "Basic hygiene can help prevent meningococcal disease."
  },
  {
    "sentence": "Most land breezes occur in temperate regions."
  },
  {
    "sentence": "Degradations can have impact."
  },
  {
    "sentence": "Western tanagers are insectivores and catch insects while they are in flight."
  }
]


### 1k Ada
Split data into train and test sets inside the `data/1k_ada` folder.

In [4]:
# randomly choose 1,000 samples from `data/generics.json` to be used for training and testing. The data is stored as {"sentences" : [{"sentence" : sentence1}, ...]}
# load the data
with open('data/generics.json') as json_file:
    data = json.load(json_file)

# Shuffle the data and split it into training and testing
sentences = data['sentences']
random.shuffle(sentences)
train = sentences[:1000]
test = sentences[-1000:]

# save the data
with open('data/1k_ada/train.json', 'w') as outfile:
    json.dump(train, outfile)

with open('data/1k_ada/test.json', 'w') as outfile:
    json.dump(test, outfile)

In [5]:
# Print the first 5 samples reading from the test file with prettier formatting
with open('data/1k_ada/test.json') as f:
    data = json.load(f)
    print(json.dumps(data[:5], indent=2))

[
  {
    "sentence": "Artificial teeth are the single most expensive component of a denture."
  },
  {
    "sentence": "Monocots have one cotyledon and dicots have two cotyledons."
  },
  {
    "sentence": "Utilities use consumption."
  },
  {
    "sentence": "Architecture is engagement with the world."
  },
  {
    "sentence": "Recipes include fresh vegetables."
  }
]


### 10k Ada
Split data into train and test sets inside the `data/10k_ada` folder.

In [6]:
# randomly choose 10k samples from `data/generics.json` to be used for training and testing. The data is stored as {"sentences" : [{"sentence" : sentence1}, ...]}
# load the data
with open('data/generics.json') as json_file:
    data = json.load(json_file)

# Shuffle the data and split it into training and testing
sentences = data['sentences']
random.shuffle(sentences)
train = sentences[:10000]
test = sentences[-1000:]

# save the data
with open('data/10k_ada/train.json', 'w') as outfile:
    json.dump(train, outfile)

with open('data/10k_ada/test.json', 'w') as outfile:
    json.dump(test, outfile)

In [7]:
# Print the first 5 samples reading from the test file with prettier formatting
with open('data/10k_ada/test.json') as f:
    data = json.load(f)
    print(json.dumps(data[:5], indent=2))

[
  {
    "sentence": "Saddle soap is used for cleaning, conditioning, and softening leather."
  },
  {
    "sentence": "Paramecia regulate water by way of contractile vacuoles."
  },
  {
    "sentence": "Fireworks are dangerous because they contain gunpowder."
  },
  {
    "sentence": "Predatory birds hunt the reed beds for mammals and other birds that shelter there."
  },
  {
    "sentence": "Most cold cuts are high in fat and sodium."
  }
]


### 100k Ada
Split data into train and test sets inside the `data/100k_ada` folder.

In [8]:
# randomly choose 100k samples from `data/generics.json` to be used for training and testing. The data is stored as {"sentences" : [{"sentence" : sentence1}, ...]}
# load the data
with open('data/generics.json') as json_file:
    data = json.load(json_file)

# Shuffle the data and split it into training and testing
sentences = data['sentences']
random.shuffle(sentences)
train = sentences[:100000]
test = sentences[-1000:]

# save the data
with open('data/100k_ada/train.json', 'w') as outfile:
    json.dump(train, outfile)

with open('data/100k_ada/test.json', 'w') as outfile:
    json.dump(test, outfile)

In [9]:
# Print the first 5 samples reading from the test file with prettier formatting
with open('data/100k_ada/test.json') as f:
    data = json.load(f)
    print(json.dumps(data[:5], indent=2))

[
  {
    "sentence": "Plant survival depends on change environmental conditions."
  },
  {
    "sentence": "Cassowaries have (part) quill feathers."
  },
  {
    "sentence": "Most mammals belong to raccoon families."
  },
  {
    "sentence": "Qualitative knowledge is considered as a particular approach to knowledge."
  },
  {
    "sentence": "Money is located in floors."
  }
]
