<a href="https://colab.research.google.com/github/parwinderau/DataspaceConnector/blob/main/DataHeterogenity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import json
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

class CustomTokenizer:
    def __init__(self):
        self.tokenizer = Tokenizer(char_level=True)
        self.special_tokens = ['[', ']', '{', '}', ':', ',', '"']

    def fit_on_texts(self, texts):
        json_strings = [json.dumps(text) for text in texts]
        self.tokenizer.fit_on_texts(json_strings)

    def texts_to_sequences(self, texts):
        json_strings = [json.dumps(text) for text in texts]
        return self.tokenizer.texts_to_sequences(json_strings)

    def sequences_to_texts(self, sequences):
        return self.tokenizer.sequences_to_texts(sequences)

# Sample JSON data
data = [
    {'factory': {'name': 'Alice', 'product': 'car'}}
]

# Function to convert JSON to a sequence of tokens
def json_to_sequence(data):
    sequences = []
    for item in data:
        json_str = json.dumps(item)
        sequence = list(json_str)
        sequences.append(sequence)
    return sequences

# Tokenization
custom_tokenizer = CustomTokenizer()
custom_tokenizer.fit_on_texts(data)

# Convert JSON to sequences of integers
input_sequences = custom_tokenizer.texts_to_sequences(data)

# Pad sequences to a fixed length
max_length = max(len(seq) for seq in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='post')

# Create a simple sequence-to-sequence model
model = Sequential([
    Embedding(input_dim=len(custom_tokenizer.tokenizer.word_index) + 1, output_dim=64),
    LSTM(128, return_sequences=True),
    Dense(len(custom_tokenizer.tokenizer.word_index) + 1, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training (replace with your training data)
input_sequences = np.array(input_sequences)
output_sequences = np.expand_dims(input_sequences, -1)  # Preparing output for sparse_categorical_crossentropy
model.fit(input_sequences, output_sequences, epochs=10, batch_size=32)

# Generate output (replace with your input JSON)
input_json = {'factory': {'name': 'Charlie', 'product': 'computer'}}
input_seq = custom_tokenizer.texts_to_sequences([input_json])[0]
input_seq = pad_sequences([input_seq], maxlen=max_length, padding='post')
predicted_sequence = model.predict(input_seq)[0]
predicted_text = custom_tokenizer.sequences_to_texts([np.argmax(predicted_sequence, axis=-1).tolist()])[0]

print(predicted_text)


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.0625 - loss: 3.0905
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 182ms/step - accuracy: 0.2292 - loss: 3.0816
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.2083 - loss: 3.0722
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 136ms/step - accuracy: 0.2083 - loss: 3.0618
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - accuracy: 0.2083 - loss: 3.0494
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 126ms/step - accuracy: 0.2083 - loss: 3.0340
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.2083 - loss: 3.0138
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - accuracy: 0.2083 - loss: 2.9859
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[