<a href="https://colab.research.google.com/github/parwinderau/DataspaceConnector/blob/main/Simple_JSON_Synthetic_Generator.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
pip install faker

Collecting faker
  Downloading Faker-27.0.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-27.0.0-py3-none-any.whl (1.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.8 MB[0m [31m5.6 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━[0m [32m1.4/1.8 MB[0m [31m20.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m18.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-27.0.0


In [3]:
import json
import random
from faker import Faker
from copy import deepcopy

fake = Faker()

# Base template for generating synthetic JSON data
base_template = {
    "device_id": None,
    "timestamp": None,
    "measurements": {
        "temperature": None,
        "humidity": None
    }
}

# Function to generate synthetic data
def generate_data(template, structure_variation=True, semantic_variation=True):
    data = deepcopy(template)

    # Semantically same information but structurally different
    if structure_variation and not semantic_variation:
        data = {key: data[key] for key in reversed(list(data.keys()))}  # Reverse the order of keys

    # Structurally same but semantically different
    elif not structure_variation and semantic_variation:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(-10, 50)  # Different but valid temperature range
        data["measurements"]["humidity"] = random.uniform(10, 90)  # Different but valid humidity range

    # Structural and semantically both different
    elif structure_variation and semantic_variation:
        data["sensor_id"] = data.pop("device_id")  # Change the key name
        data["time"] = data.pop("timestamp")  # Change the key name
        data["readings"] = data.pop("measurements")  # Change the key name
        data["readings"]["temp"] = data["readings"].pop("temperature")  # Change the key name
        data["readings"]["humid"] = data["readings"].pop("humidity")  # Change the key name
        data["sensor_id"] = fake.uuid4()
        data["time"] = fake.date_time().isoformat()
        data["readings"]["temp"] = random.uniform(-10, 50)
        data["readings"]["humid"] = random.uniform(10, 90)

    # Structural and semantically both same
    else:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(15, 30)
        data["measurements"]["humidity"] = random.uniform(30, 70)

    return data

# Generate a dataset with varying conditions
def generate_synthetic_dataset(num_samples=10):
    dataset = []

    for _ in range(num_samples):
        condition = random.choice([
            'structure_diff_semantic_same',
            'structure_same_semantic_diff',
            'structure_semantic_both_diff',
            'structure_semantic_both_same'
        ])

        if condition == 'structure_diff_semantic_same':
            data = generate_data(base_template, structure_variation=True, semantic_variation=False)

        elif condition == 'structure_same_semantic_diff':
            data = generate_data(base_template, structure_variation=False, semantic_variation=True)

        elif condition == 'structure_semantic_both_diff':
            data = generate_data(base_template, structure_variation=True, semantic_variation=True)

        else:  # 'structure_semantic_both_same'
            data = generate_data(base_template, structure_variation=False, semantic_variation=False)

        dataset.append(data)

    return dataset

# Generate and display synthetic data
synthetic_data = generate_synthetic_dataset(5)
print(json.dumps(synthetic_data, indent=2))


[
  {
    "device_id": "baeb342d-d9e8-4833-ae1e-d437c9c133ab",
    "timestamp": "2018-10-25T18:49:49.894426",
    "measurements": {
      "temperature": -0.9168523885050668,
      "humidity": 15.577727734748246
    }
  },
  {
    "device_id": "10cab8a2-9ea2-4c8d-9389-e80d8a892fdc",
    "timestamp": "1977-04-29T16:44:09.032658",
    "measurements": {
      "temperature": 17.40642111889551,
      "humidity": 57.87882101785955
    }
  },
  {
    "device_id": "eeccb19f-cec8-4d5b-996b-213b880a08b7",
    "timestamp": "2017-12-12T04:02:19.651435",
    "measurements": {
      "temperature": 23.046873212378458,
      "humidity": 17.822500057269934
    }
  },
  {
    "device_id": "22d7965a-e07a-4faf-9347-363f9c1e7867",
    "timestamp": "1979-10-10T23:26:13.392379",
    "measurements": {
      "temperature": 22.538488845879005,
      "humidity": 42.10645209994575
    }
  },
  {
    "sensor_id": "d34495a3-5a26-4e51-a871-bfd85d206633",
    "time": "2010-04-28T01:57:31.748625",
    "readings": {
   

In [6]:
#This is for NLP based JSON data generator for IoT domain
import json
import random
from faker import Faker
from copy import deepcopy
from nltk.corpus import wordnet
from nltk import download

# Download necessary NLTK data
download('wordnet')
download('omw-1.4')

fake = Faker()

# Function to get synonyms from WordNet
def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

# Generate variations of terms used in JSON keys
term_variations = {
    "device_id": get_synonyms("device")[:5] + ["sensor_id", "equipment_id", "machine_id"],
    "timestamp": get_synonyms("timestamp")[:5] + ["time", "datetime", "log_time"],
    "measurements": get_synonyms("measurement")[:5] + ["readings", "data", "values"],
    "temperature": get_synonyms("temperature")[:5] + ["temp", "heat", "thermal"],
    "humidity": get_synonyms("humidity")[:5] + ["moisture", "dampness", "humid"],
}

# Base template for generating synthetic JSON data
base_template = {
    "device_id": None,
    "timestamp": None,
    "measurements": {
        "temperature": None,
        "humidity": None
    }
}

# Function to generate synthetic data with NLP variations
def generate_data(template, structure_variation=True, semantic_variation=True):
    data = deepcopy(template)

    # Apply term variations
    def apply_variations(d):
        for key in list(d.keys()):
            if isinstance(d[key], dict):
                apply_variations(d[key])
            new_key = random.choice(term_variations.get(key, [key]))
            if new_key != key:
                d[new_key] = d.pop(key)

    if structure_variation:
        apply_variations(data)

    # Semantically same but structurally different
    if structure_variation and not semantic_variation:
        pass  # Already handled by apply_variations

    # Structurally same but semantically different
    elif not structure_variation and semantic_variation:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(-10, 50)
        data["measurements"]["humidity"] = random.uniform(10, 90)

    # Structural and semantically both different
    elif structure_variation and semantic_variation:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(-10, 50)
        data["measurements"]["humidity"] = random.uniform(10, 90)

    # Structural and semantically both same
    else:
        data["device_id"] = fake.uuid4()
        data["timestamp"] = fake.date_time().isoformat()
        data["measurements"]["temperature"] = random.uniform(15, 30)
        data["measurements"]["humidity"] = random.uniform(30, 70)

    return data

# Generate a dataset with varying conditions
def generate_synthetic_dataset(num_samples=10):
    dataset = []

    for _ in range(num_samples):
        condition = random.choice([
            'structure_diff_semantic_same',
            'structure_same_semantic_diff',
            'structure_semantic_both_diff',
            'structure_semantic_both_same'
        ])

        if condition == 'structure_diff_semantic_same':
            data = generate_data(base_template, structure_variation=True, semantic_variation=False)

        elif condition == 'structure_same_semantic_diff':
            data = generate_data(base_template, structure_variation=False, semantic_variation=True)

        elif condition == 'structure_semantic_both_diff':
            data = generate_data(base_template, structure_variation=True, semantic_variation=True)

        else:  # 'structure_semantic_both_same'
            data = generate_data(base_template, structure_variation=False, semantic_variation=False)

        dataset.append(data)

    return dataset

# Generate and display synthetic data
synthetic_data = generate_synthetic_dataset(5)
print(json.dumps(synthetic_data, indent=2))


[
  {
    "device_id": "f70f6a2f-9afb-4e4e-b062-2466476d0105",
    "timestamp": "1995-05-12T08:43:15.351342",
    "measurements": {
      "temperature": 21.205621152364852,
      "humidity": 17.171543068727644
    }
  },
  {
    "device": null,
    "time": null,
    "mensuration": {
      "humidity": null,
      "temp": null
    }
  },
  {
    "device_id": "20cccd6f-5904-4373-8b59-ebc5abc0ec9d",
    "timestamp": "2002-10-23T02:30:10.833039",
    "measurements": {
      "temperature": 24.250107522842594,
      "humidity": 41.437023672510215
    }
  },
  {
    "twist": null,
    "time": null,
    "measure": {
      "temperature": null,
      "humidity": null
    }
  },
  {
    "sensor_id": null,
    "datetime": null,
    "measuring": {
      "temperature": null,
      "humidness": null
    }
  }
]


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
