<a href="https://colab.research.google.com/github/parwinderau/DataspaceConnector/blob/main/Sequential_DNN_Model_JSON.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Import necessary libraries
import json
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Step 1: Load and Flatten JSON Data
def flatten_json(y):
    out = {}
    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x
    flatten(y)
    return out

# Example JSON Data
json_data = '''
[
  {
    "device_id": "sensor1",
    "timestamp": "2024-08-13T10:00:00Z",
    "measurements": {
      "temperature": 23.5,
      "humidity": 56.2
    }
  },
  {
    "device_id": "sensor2",
    "timestamp": "2024-08-13T10:01:00Z",
    "measurements": {
      "temperature": 22.8
    }
  }
]
'''

# Convert JSON to DataFrame
data = json.loads(json_data)
flat_data = [flatten_json(record) for record in data]
df = pd.DataFrame(flat_data)

# Step 2: Separate Numeric and Non-Numeric Data
numeric_cols = df.select_dtypes(include=[np.number]).columns
non_numeric_cols = df.select_dtypes(exclude=[np.number]).columns

# Handle Missing Data (Impute only numeric columns)
imputer = SimpleImputer(strategy='mean')
df_imputed_numeric = pd.DataFrame(imputer.fit_transform(df[numeric_cols]), columns=numeric_cols)

# Non-numeric data remains unchanged
df_imputed_non_numeric = df[non_numeric_cols]

# Concatenate the numeric and non-numeric data
df_imputed = pd.concat([df_imputed_numeric, df_imputed_non_numeric], axis=1)

# Step 3: Normalize Numeric Data
scaler = MinMaxScaler()
df_normalized_numeric = pd.DataFrame(scaler.fit_transform(df_imputed_numeric), columns=df_imputed_numeric.columns)

# Concatenate normalized numeric data with non-numeric data
df_normalized = pd.concat([df_normalized_numeric, df_imputed_non_numeric], axis=1)

# Step 4: Build a DNN Model for Schema Transformation
def build_dnn_model(input_dim):
    model = Sequential()
    model.add(Dense(128, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(64, activation='relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(10, activation='softmax'))  # Example output layer for classification
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Example: Assume the harmonized schema requires a 10-class output
input_dim = df_normalized_numeric.shape[1]
dnn_model = build_dnn_model(input_dim)

# Step 5: Train the DNN Model
# Generating synthetic labels for demonstration (Replace with actual labels)
labels = np.random.randint(0, 10, size=(df_normalized_numeric.shape[0], 1))
labels = tf.keras.utils.to_categorical(labels, num_classes=10)

# Training the model
dnn_model.fit(df_normalized_numeric, labels, epochs=10, batch_size=2)

# Step 6: Inference
# Example inference using the model (for the first data point)
prediction = dnn_model.predict(df_normalized_numeric.iloc[0:1])
print(f"Predicted schema category: {np.argmax(prediction)}")


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - accuracy: 0.0000e+00 - loss: 2.2928
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.0000e+00 - loss: 2.2910
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step - accuracy: 0.5000 - loss: 2.2720
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - accuracy: 0.5000 - loss: 2.2228
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.5000 - loss: 2.2421
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.5000 - loss: 2.2153
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 61ms/step - accuracy: 0.5000 - loss: 2.1957
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.5000 - loss: 2.2163
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s