<a href="https://colab.research.google.com/github/parwinderau/DataspaceConnector/blob/main/Simple_JSON_Synthetic_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install faker

Collecting faker
  Downloading Faker-27.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-27.0.0-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m1.4/1.8 MB[0m [31m20.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-27.0.0


In [3]:
import json
import random
from faker import Faker
from copy import deepcopy

fake = Faker()

# Base template for generating synthetic JSON data
base_template = {
    "device_id": None,
    "timestamp": None,
    "measurements": {
        "temperature": None,
        "humidity": None
    }
}

# Function to generate synthetic data
def generate_data(template, structure_variation=True, semantic_variation=True):
    data = deepcopy(template)

    # Semantically same information but structurally different
    if structure_variation and not semantic_variation:
        data = {key: data[key] for key in reversed(list(data.keys()))}  # Reverse the order of keys

    # Structurally same but semantically different
    elif not structure_variation and semantic_variation:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(-10, 50)  # Different but valid temperature range
        data["measurements"]["humidity"] = random.uniform(10, 90)  # Different but valid humidity range

    # Structural and semantically both different
    elif structure_variation and semantic_variation:
        data["sensor_id"] = data.pop("device_id")  # Change the key name
        data["time"] = data.pop("timestamp")  # Change the key name
        data["readings"] = data.pop("measurements")  # Change the key name
        data["readings"]["temp"] = data["readings"].pop("temperature")  # Change the key name
        data["readings"]["humid"] = data["readings"].pop("humidity")  # Change the key name
        data["sensor_id"] = fake.uuid4()
        data["time"] = fake.date_time().isoformat()
        data["readings"]["temp"] = random.uniform(-10, 50)
        data["readings"]["humid"] = random.uniform(10, 90)

    # Structural and semantically both same
    else:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(15, 30)
        data["measurements"]["humidity"] = random.uniform(30, 70)

    return data

# Generate a dataset with varying conditions
def generate_synthetic_dataset(num_samples=10):
    dataset = []

    for _ in range(num_samples):
        condition = random.choice([
            'structure_diff_semantic_same',
            'structure_same_semantic_diff',
            'structure_semantic_both_diff',
            'structure_semantic_both_same'
        ])

        if condition == 'structure_diff_semantic_same':
            data = generate_data(base_template, structure_variation=True, semantic_variation=False)

        elif condition == 'structure_same_semantic_diff':
            data = generate_data(base_template, structure_variation=False, semantic_variation=True)

        elif condition == 'structure_semantic_both_diff':
            data = generate_data(base_template, structure_variation=True, semantic_variation=True)

        else:  # 'structure_semantic_both_same'
            data = generate_data(base_template, structure_variation=False, semantic_variation=False)

        dataset.append(data)

    return dataset

# Generate and display synthetic data
synthetic_data = generate_synthetic_dataset(5)
print(json.dumps(synthetic_data, indent=2))


[
  {
    "device_id": "baeb342d-d9e8-4833-ae1e-d437c9c133ab",
    "timestamp": "2018-10-25T18:49:49.894426",
    "measurements": {
      "temperature": -0.9168523885050668,
      "humidity": 15.577727734748246
    }
  },
  {
    "device_id": "10cab8a2-9ea2-4c8d-9389-e80d8a892fdc",
    "timestamp": "1977-04-29T16:44:09.032658",
    "measurements": {
      "temperature": 17.40642111889551,
      "humidity": 57.87882101785955
    }
  },
  {
    "device_id": "eeccb19f-cec8-4d5b-996b-213b880a08b7",
    "timestamp": "2017-12-12T04:02:19.651435",
    "measurements": {
      "temperature": 23.046873212378458,
      "humidity": 17.822500057269934
    }
  },
  {
    "device_id": "22d7965a-e07a-4faf-9347-363f9c1e7867",
    "timestamp": "1979-10-10T23:26:13.392379",
    "measurements": {
      "temperature": 22.538488845879005,
      "humidity": 42.10645209994575
    }
  },
  {
    "sensor_id": "d34495a3-5a26-4e51-a871-bfd85d206633",
    "time": "2010-04-28T01:57:31.748625",
    "readings": {
   

In [6]:
#This is for NLP based JSON data generator for IoT domain
import json
import random
from faker import Faker
from copy import deepcopy
from nltk.corpus import wordnet
from nltk import download

# Download necessary NLTK data
download('wordnet')
download('omw-1.4')

fake = Faker()

# Function to get synonyms from WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

# Generate variations of terms used in JSON keys
term_variations = {
    "device_id": get_synonyms("device")[:5] + ["sensor_id", "equipment_id", "machine_id"],
    "timestamp": get_synonyms("timestamp")[:5] + ["time", "datetime", "log_time"],
    "measurements": get_synonyms("measurement")[:5] + ["readings", "data", "values"],
    "temperature": get_synonyms("temperature")[:5] + ["temp", "heat", "thermal"],
    "humidity": get_synonyms("humidity")[:5] + ["moisture", "dampness", "humid"],
}

# Base template for generating synthetic JSON data
base_template = {
    "device_id": None,
    "timestamp": None,
    "measurements": {
        "temperature": None,
        "humidity": None
    }
}

# Function to generate synthetic data with NLP variations
def generate_data(template, structure_variation=True, semantic_variation=True):
    data = deepcopy(template)

    # Apply term variations
    def apply_variations(d):
        for key in list(d.keys()):
            if isinstance(d[key], dict):
                apply_variations(d[key])
            new_key = random.choice(term_variations.get(key, [key]))
            if new_key != key:
                d[new_key] = d.pop(key)

    if structure_variation:
        apply_variations(data)

    # Semantically same but structurally different
    if structure_variation and not semantic_variation:
        pass  # Already handled by apply_variations

    # Structurally same but semantically different
    elif not structure_variation and semantic_variation:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(-10, 50)
        data["measurements"]["humidity"] = random.uniform(10, 90)

    # Structural and semantically both different
    elif structure_variation and semantic_variation:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(-10, 50)
        data["measurements"]["humidity"] = random.uniform(10, 90)

    # Structural and semantically both same
    else:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(15, 30)
        data["measurements"]["humidity"] = random.uniform(30, 70)

    return data

# Generate a dataset with varying conditions
def generate_synthetic_dataset(num_samples=10):
    dataset = []

    for _ in range(num_samples):
        condition = random.choice([
            'structure_diff_semantic_same',
            'structure_same_semantic_diff',
            'structure_semantic_both_diff',
            'structure_semantic_both_same'
        ])

        if condition == 'structure_diff_semantic_same':
            data = generate_data(base_template, structure_variation=True, semantic_variation=False)

        elif condition == 'structure_same_semantic_diff':
            data = generate_data(base_template, structure_variation=False, semantic_variation=True)

        elif condition == 'structure_semantic_both_diff':
            data = generate_data(base_template, structure_variation=True, semantic_variation=True)

        else:  # 'structure_semantic_both_same'
            data = generate_data(base_template, structure_variation=False, semantic_variation=False)

        dataset.append(data)

    return dataset

# Generate and display synthetic data
synthetic_data = generate_synthetic_dataset(5)
print(json.dumps(synthetic_data, indent=2))


[
  {
    "device_id": "f70f6a2f-9afb-4e4e-b062-2466476d0105",
    "timestamp": "1995-05-12T08:43:15.351342",
    "measurements": {
      "temperature": 21.205621152364852,
      "humidity": 17.171543068727644
    }
  },
  {
    "device": null,
    "time": null,
    "mensuration": {
      "humidity": null,
      "temp": null
    }
  },
  {
    "device_id": "20cccd6f-5904-4373-8b59-ebc5abc0ec9d",
    "timestamp": "2002-10-23T02:30:10.833039",
    "measurements": {
      "temperature": 24.250107522842594,
      "humidity": 41.437023672510215
    }
  },
  {
    "twist": null,
    "time": null,
    "measure": {
      "temperature": null,
      "humidity": null
    }
  },
  {
    "sensor_id": null,
    "datetime": null,
    "measuring": {
      "temperature": null,
      "humidness": null
    }
  }
]


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [11]:
import json
import random
from faker import Faker
from copy import deepcopy
from nltk.corpus import wordnet
from nltk import download

# Download necessary NLTK data
download('wordnet')
download('omw-1.4')

fake = Faker()

# Function to get synonyms from WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

# Generate variations of terms used in JSON keys
term_variations = {
    "device_id": get_synonyms("device")[:5] + ["sensor_id", "equipment_id", "machine_id"],
    "timestamp": get_synonyms("timestamp")[:5] + ["time", "datetime", "log_time"],
    "measurements": get_synonyms("measurement")[:5] + ["readings", "data", "values"],
    "temperature": get_synonyms("temperature")[:5] + ["temp", "heat", "thermal"],
    "humidity": get_synonyms("humidity")[:5] + ["moisture", "dampness", "humid"],
}

# Base template for generating synthetic JSON data
base_template = {
    "device_id": None,
    "timestamp": None,
    "measurements": {
        "temperature": None,
        "humidity": None
    }
}

# Function to generate synthetic data with NLP variations
def generate_data(template, structural_variation=True, semantic_variation=True):
    data = deepcopy(template)

    # Apply term variations (semantic variation)
    def apply_semantic_variations(d):
        for key in list(d.keys()):
            if isinstance(d[key], dict):
                apply_semantic_variations(d[key])
            new_key = random.choice(term_variations.get(key, [key]))
            if new_key != key:
                d[new_key] = d.pop(key)

    if semantic_variation:
        apply_semantic_variations(data)

    # Apply structural variation
    if structural_variation:
        if random.choice([True, False]):  # Randomly decide to flatten
            flattened_data = {}
            for key, value in data.items():
                if isinstance(value, dict):
                    for subkey, subvalue in value.items():
                        flattened_data[f"{key}_{subkey}"] = subvalue
                else:
                    flattened_data[key] = value
            data = flattened_data

    # Generate actual data values if not already present
    if not semantic_variation:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(-10, 50)
        data["measurements"]["humidity"] = random.uniform(10, 90)

    return data

# Generate a dataset with varying conditions
def generate_synthetic_dataset(num_samples=10):
    dataset = []

    for _ in range(num_samples):
        data = generate_data(base_template,
                            structural_variation=random.choice([True, False]),
                            semantic_variation=random.choice([True, False]))
        dataset.append(data)

    return dataset

# Generate and display synthetic data
synthetic_data = generate_synthetic_dataset(5)
print(json.dumps(synthetic_data, indent=2))


[
  {
    "device_id": "4b8a65b8-7535-4bbd-84c1-0849d9350efc",
    "timestamp": "1995-02-16T03:43:46.261065",
    "measurements": {
      "temperature": 34.061100695140134,
      "humidity": 76.96917312512983
    }
  },
  {
    "gimmick": null,
    "datetime": null,
    "readings": {
      "temp": null,
      "humidness": null
    }
  },
  {
    "device_id": "a2e39263-f663-42a0-8168-6a668ae9c0b5",
    "timestamp": "2003-12-27T17:14:32.167831",
    "measurements": {
      "temperature": -0.8889444020204778,
      "humidity": 21.915546841070856
    }
  },
  {
    "equipment_id": null,
    "log_time": null,
    "data": {
      "temperature": null,
      "dampness": null
    }
  },
  {
    "device_id": "cd0e059d-38bf-4172-8ac0-1efbf9584e9e",
    "timestamp": "2016-01-19T00:42:36.902679",
    "measurements": {
      "temperature": 36.64447775006671,
      "humidity": 33.43464361849385
    }
  }
]


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [25]:
import json
import random
import numpy as np
from nltk.corpus import wordnet
from nltk import download
from transformers import pipeline

# Download necessary NLTK data
download('wordnet')
download('omw-1.4')

# NLP pipeline for entity and attribute generation
nlp = pipeline("text-generation")

def generate_machine_data(machine_type, depth, size):
    # Generate core entities and attributes using NLP
    machine_description = f"A {machine_type} is a machine that"
    machine_attributes = nlp(machine_description, max_length=100, num_return_sequences=5)

    # Extract key entities and attributes
    entities = []
    for text_data in machine_attributes:
        entities.extend(text_data['generated_text'].split(','))
    entities = list(set(entities))

    # Create JSON structure
    data = {}
    for _ in range(random.randint(2, size)):
        root_key = random.choice(entities)
        data[root_key] = {}
        for _ in range(depth - 1):
            sub_key = random.choice(entities)
            data[root_key][sub_key] = {}
        # Populate leaf nodes with random data
        data[root_key][sub_key] = {
            'value': random.uniform(0, 100)
        }

    return data

# Example usage
machine_types = ['boiler', 'compressor', ...]  # Full list of machine types
dataset = []
for machine_type in machine_types:
    dataset.append(generate_machine_data(machine_type, 3, 5))

print(json.dumps(dataset, indent=2))




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos

[
  {
    " combined with additional hydrogen added outside of the boiler": {
      " does not require specialized tools. The heating and cooling system is a simple electrical circuit": {},
      " because of its size": {
        "value": 44.652330094907455
      }
    },
    "A boiler is a machine that has been subjected to continuous firing by the heat of the steam or steam engine for several hours. A steam boiler is a type of boiler that": {
      " boilers can actually provide the power needed when the stove is run. When it goes silent the boiler creates steam when it's turned off. There is an average of 2A/1W at the lower boilers": {},
      "A boiler is a machine that uses electricity to process a mixture of hydrogen (OH) and helium in the air\u2014an important component of a heat exchanger": {
        "value": 36.19726505126067
      }
    },
    " any place you want or want not to have. We have a lot of boiler rooms in Australia now and they're just huge. You could build a hous

In [28]:
import json
import random
import numpy as np
from nltk.corpus import wordnet
from nltk import download
from transformers import pipeline

# Download necessary NLTK data
download('wordnet')
download('omw-1.4')

# NLP pipeline for entity and attribute generation
nlp = pipeline("text-generation")

def shorten_key(key, max_len=3):
  return key[:max_len]

def generate_machine_data(machine_type, depth, size):
  # Generate core entities and attributes using NLP
  machine_description = f"A {machine_type} is a machine that"
  machine_attributes = nlp(machine_description, max_length=100, num_return_sequences=5)

  # Extract key entities and attributes
  entities = []
  for text_data in machine_attributes:
    entities.extend(text_data['generated_text'].split(','))
  entities = list(set(entities))

  # Create JSON structure
  data = {}
  for _ in range(random.randint(2, size)):
    root_key = shorten_key(random.choice(entities))
    data[root_key] = {}
    for _ in range(depth - 1):
      sub_key = shorten_key(random.choice(entities))
      data[root_key][sub_key] = {}
    # Populate leaf nodes with random data
    data[root_key][sub_key] = {
      'value': random.uniform(0, 100)
    }

  return data

# Example usage
machine_types = ['boiler', 'compressor', ...]
# Full list of machine types
dataset = []
for machine_type in machine_types:
  dataset.append(generate_machine_data(machine_type, 3, 5))

print(json.dumps(dataset, indent=2))


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
No model was supplied, defaulted to openai-community/gpt2 and revision 6c0e608 (https://huggingface.co/openai-community/gpt2).
Using a pipeline without specifying a model name and revision in production is not recommended.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos

[
  {
    " or": {
      " re": {},
      " ra": {
        "value": 29.347744512703567
      }
    },
    " ho": {
      " an": {},
      " te": {
        "value": 34.67956618987742
      }
    },
    " an": {
      " te": {},
      " an": {
        "value": 85.54038428135928
      }
    },
    "A b": {
      " an": {},
      " ra": {
        "value": 69.92615774716863
      }
    }
  },
  {
    " th": {
      "A c": {},
      " bu": {
        "value": 54.17633615026849
      }
    },
    " ad": {
      " ad": {},
      "A c": {
        "value": 53.65096891085324
      }
    },
    " bu": {
      "A c": {
        "value": 55.60562708929512
      }
    },
    "A c": {
      " a ": {},
      " th": {
        "value": 86.93883470567309
      }
    }
  },
  {
    "A E": {
      " as": {},
      " or": {
        "value": 44.3266823352529
      }
    },
    " wi": {
      " wi": {},
      "A E": {
        "value": 91.38661968673505
      }
    }
  }
]


In [36]:
import json
import random
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import download
from transformers import pipeline, AutoModel, AutoTokenizer
import spacy

# Download necessary NLTK data
download('stopwords')
download('punkt')

# NLP pipelines
ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")  # Replace with your preferred NER model
model_name = "bert-base-uncased"  # Replace with your preferred BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load the NER model
nlp = spacy.load("en_core_web_sm")  # Replace with your preferred model

def extract_keywords(text):
    # NER for potential keywords
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    keywords = [entity[0] for entity in entities]

    # BERT embeddings for keyword ranking (optional)
    # ...

    return keywords

def generate_machine_data(machine_type, depth, size):
    # Generate core entities and attributes using NLP
    machine_description = f"A {machine_type} is a machine that"
    machine_attributes = nlp(machine_description)

    # Create JSON structure
    data = {}
    for text_data in machine_attributes:
        keywords = extract_keywords(text_data.text)
        for keyword in keywords:
            if keyword not in data:
                data[keyword] = {}
            # Build nested structure based on subsequent keywords and values
            # ...

    return data

# Example usage
machine_types = ['boiler', 'compressor', ...]
# Full list of machine types
dataset = []
for machine_type in machine_types:
    dataset.append(generate_machine_data(machine_type, 3, 5))

print(json.dumps(dataset, indent=2))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[
  {},
  {},
  {}
]


In [40]:
import json
import random
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk import download
from transformers import pipeline, AutoModel, AutoTokenizer
import spacy

# Download necessary NLTK data
download('stopwords')
download('punkt')

# NLP pipelines
ner_model = pipeline("ner", model="dbmdz/bert-large-cased-finetuned-conll03-english")  # Replace with your preferred NER model
model_name = "bert-base-uncased"  # Replace with your preferred BERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Load the NER model
nlp = spacy.load("en_core_web_sm")  # Replace with your preferred model

def extract_keywords(text):
    # NER for potential keywords
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    keywords = [entity[0] for entity in entities]

    # Print keywords for debugging
    print(keywords)  # Check if keywords are extracted correctly

    return keywords

def generate_machine_data(machine_type, depth, size):
    # Generate core entities and attributes using NLP
    machine_description = f"A {machine_type} is a machine that"
    machine_attributes = nlp(machine_description)

    # Create JSON structure
    data = {}
    for text_data in machine_attributes:
        keywords = extract_keywords(text_data.text)
        for keyword in keywords:
            if keyword not in data:
                data[keyword] = {}
            # Build nested structure based on subsequent keywords and values
            # ...

    return data

# Example usage
machine_types = ['boiler', 'compressor', ...]
# Full list of machine types
dataset = []
for machine_type in machine_types:
    dataset.append(generate_machine_data(machine_type, 3, 5))

print(json.dumps(dataset, indent=2))



[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[
  {},
  {},
  {}
]
