In [13]:
# Clips sampled for rows labelled as emergency vehicles. The traffic clip in
# this pool is picked up by the dataset-building loop (it checks for a leading
# 't') and is loaded from the traffic folder with a siren label of 0.
siren_sounds = [
    'siren_sound_1.wav',
    'siren_sound_2.wav',
    'siren_sound_3.wav',
    'traffic_sound_1.wav',
]

# Clips sampled for rows labelled as non-emergency vehicles.
traffic_sounds = [
    'traffic_sound_1.wav',
    'traffic_sound_2.wav',
]

In [4]:
import pathlib
import time
import wave
from random import choice

import cv2
import librosa
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Concatenate, Conv1D, MaxPooling1D
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array




Algorithm
1) Start
2) Detect the sound of Emergency Vehicle
3) If the detected frequency matches a siren, activate the camera
4) Capture the selected road image with vehicles
5) Compare the image with the dataset.
6) If a vehicle on the road matches an emergency vehicle, send a message to the DSS
7) After receiving the message from smart object, DSS checks for the sender address.
8) DSS takes the appropriate decision by clearing the lane traffic of the requesting smart object.
9) If a new message arrives from the same or another smart object, go to step 7.
Once the emergency vehicle has passed through the central junction (intersection), the data is added to the cloud and the normal
congestion-control routine resumes.


In [5]:
# Load the training labels: one row per image file name with its
# emergency_or_not flag.
train_csv_path = "Emergency_Vehicles/train.csv"
data_train = pd.read_csv(train_csv_path)
data_train.head()

Unnamed: 0,image_names,emergency_or_not
0,1503.jpg,0
1,1420.jpg,0
2,1764.jpg,0
3,1356.jpg,0
4,1117.jpg,0


In [10]:
#Preprocessing images  - Image to array
def preprocessing_img(file_path, image_dir="Emergency_Vehicles/train", target_size=(224, 224)):
    """Load one training image and return it as a scaled float array.

    Parameters
    ----------
    file_path : str
        Image file name relative to ``image_dir`` (e.g. ``"1503.jpg"``).
    image_dir : str, optional
        Folder that holds the training images. The default uses the same
        ``Emergency_Vehicles`` spelling as the CSV path; the previous
        ``Emergency_vehicles`` spelling only resolves on case-insensitive
        filesystems.
        NOTE(review): confirm the on-disk folder name really is
        ``Emergency_Vehicles``.
    target_size : tuple of int, optional
        ``(height, width)`` the image is resized to while loading.

    Returns
    -------
    numpy.ndarray
        Array of shape ``target_size + (channels,)`` with every value
        divided by 255.
    """
    img = load_img(f"{image_dir}/{file_path}", target_size=target_size)

    img_array = img_to_array(img)
    # Scale pixel values down; presumably the inputs are 8-bit images, so
    # this maps them into [0, 1].
    img_array /= 255.0
    return img_array

# Decode every listed image once and store the arrays next to their labels.
image_arrays = data_train["image_names"].map(preprocessing_img)
data_train["img_array"] = image_arrays
data_train.head()

Unnamed: 0,image_names,emergency_or_not,img_array
0,1503.jpg,0,"[[[0.36862746, 0.3764706, 0.2901961], [0.38431..."
1,1420.jpg,0,"[[[0.84705883, 0.9137255, 0.9764706], [0.85098..."
2,1764.jpg,0,"[[[0.24313726, 0.2509804, 0.2], [0.22352941, 0..."
3,1356.jpg,0,"[[[0.7647059, 0.84705883, 0.7176471], [0.62352..."
4,1117.jpg,0,"[[[0.0, 0.03137255, 0.0], [0.1254902, 0.160784..."


In [11]:
# Preprocessing sound - audio file to mel spectrogram in decibels
def preprocessing_sound(file_path,type):
    """Load an audio clip and return its mel spectrogram in dB.

    Parameters
    ----------
    file_path : str
        File name of the clip inside its sound folder.
    type : str
        ``'siren'`` reads from ``SirenSounds/``; any other value reads from
        ``TrafficSounds/``.

    Returns
    -------
    numpy.ndarray
        Mel power spectrogram converted to decibels, using the
        spectrogram's own maximum as the reference level.
    """
    # Only the exact string 'siren' selects the siren folder.
    if type == 'siren':
        source_path = f'SirenSounds/{file_path}'
    else:
        source_path = f'TrafficSounds/{file_path}'

    # Load at a fixed 44.1 kHz rate so every clip shares one sample rate.
    waveform, sample_rate = librosa.load(source_path, sr = 44100)

    # Generate spectrogram
    mel_power = librosa.feature.melspectrogram(y=waveform, sr=sample_rate)
    return librosa.power_to_db(mel_power, ref=np.max)

In [14]:
#Dataset development
# Pair every image row with a randomly drawn sound clip and derive the
# siren label for that pairing:
#   * emergency rows draw from siren_sounds; a drawn file whose name starts
#     with 't' is a traffic clip, so it is loaded as traffic and labelled 0
#   * non-emergency rows draw from traffic_sounds and are labelled 0
# Only a handful of distinct wav files exist, so each spectrogram is computed
# once and reused instead of being decoded again for every row. Rows that
# drew the same file therefore share one array object; downstream code
# copies the data when it stacks the column into a training tensor.
# NOTE(review): the draws are unseeded, so the pairing differs between runs.
sound_column=[]
new_column = []
spectrogram_cache = {}
for is_emergency in data_train['emergency_or_not']:
    if is_emergency == 1:
        file_path = choice(siren_sounds)
        is_siren = not file_path.startswith('t')
    else:
        file_path = choice(traffic_sounds)
        is_siren = False

    sound_type = 'siren' if is_siren else 'traffic'
    cache_key = (sound_type, file_path)
    if cache_key not in spectrogram_cache:
        spectrogram_cache[cache_key] = preprocessing_sound(file_path=file_path,type=sound_type)

    sound_column.append(spectrogram_cache[cache_key])
    new_column.append(1 if is_siren else 0)

data_train['siren_or_not'] = new_column
data_train['sound'] = sound_column

In [15]:
# Preview the frame to confirm the 'siren_or_not' and 'sound' columns were added.
data_train.head()

Unnamed: 0,image_names,emergency_or_not,img_array,siren_or_not,sound
0,1503.jpg,0,"[[[0.36862746, 0.3764706, 0.2901961], [0.38431...",0,"[[-14.50043, -11.244556, -11.512085, -10.55725..."
1,1420.jpg,0,"[[[0.84705883, 0.9137255, 0.9764706], [0.85098...",0,"[[-14.50043, -11.244556, -11.512085, -10.55725..."
2,1764.jpg,0,"[[[0.24313726, 0.2509804, 0.2], [0.22352941, 0...",0,"[[-14.50043, -11.244556, -11.512085, -10.55725..."
3,1356.jpg,0,"[[[0.7647059, 0.84705883, 0.7176471], [0.62352...",0,"[[-14.50043, -11.244556, -11.512085, -10.55725..."
4,1117.jpg,0,"[[[0.0, 0.03137255, 0.0], [0.1254902, 0.160784...",0,"[[-28.686245, -27.850946, -19.184986, -15.9539..."


In [25]:
# Binary classifier that predicts siren / no siren from a spectrogram.
# Conv1D reads input_shape as (steps, channels): the 128-long axis is the one
# convolved over and the 259-long axis is treated as channels.
# NOTE(review): if the spectrograms are laid out as (mel bands, time frames),
# this convolves across mel bands rather than across time - confirm intended.
sound_model = Sequential([
    Conv1D(32, 3, activation='relu', input_shape=(128, 259)),
    MaxPooling1D(2),
    Flatten(),
    Dense(128, activation='relu'),
    # Single sigmoid unit: probability that the clip contains a siren.
    Dense(1, activation='sigmoid'),
])

# sound_model = Sequential()
# sound_model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)))
# sound_model.add(MaxPooling2D((2, 2)))
# sound_model.add(Conv2D(64, (3, 3), activation='relu'))
# sound_model.add(MaxPooling2D((2, 2)))
# sound_model.add(Conv2D(128, (3, 3), activation='relu'))
# sound_model.add(MaxPooling2D((2, 2)))
# sound_model.add(Flatten())
# sound_model.add(Dense(128, activation='relu'))
# sound_model.add(Dense(1, activation='sigmoid'))

In [26]:
#Compiling the sound model
sound_model.compile(optimizer = "adam",loss="binary_crossentropy",metrics=['accuracy'])
#Fitting the sound model
# Stack the per-row spectrograms and labels into dense training arrays.
x_train_sound = np.array(data_train['sound'].tolist())
y_train_sound = np.array(data_train['siren_or_not'].tolist())
# Enforce the (samples, 128, 259) layout declared in the model's input_shape.
# The reshaped array was previously computed but never used; training on it
# means a spectrogram size that does not fit is caught here.
x_train_reshaped = x_train_sound.reshape(-1, 128, 259)
sound_model.fit(x_train_reshaped,y_train_sound,epochs=10, batch_size=32)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1f04a306450>

In [27]:
# CNN that predicts emergency / non-emergency vehicle from a 224x224 RGB image:
# three conv + max-pool stages with 32, 64 and 128 filters, then a dense head.
img_model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(128, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    # Single sigmoid unit: probability that the image shows an emergency vehicle.
    Dense(1, activation='sigmoid'),
])

In [28]:
# Compile the image model for binary classification.
img_model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)
# Stack the per-row image arrays and labels into dense training arrays.
x_train_img = np.array(list(data_train['img_array']))
y_train_img = np.array(list(data_train['emergency_or_not']))
# Train on the image data; the History object is shown as the cell output.
img_model.fit(x=x_train_img, y=y_train_img, batch_size=32, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1f065cadf90>

In [29]:
# Fuse the two trained networks into one classifier that sees both the
# spectrogram and the image. Model, Concatenate and Dense come from the
# notebook's import cell.

# Extract feature vectors from both models
sound_features = sound_model.layers[-2].output  # Last layer before sigmoid
img_features = img_model.layers[-2].output

# Target for the fused model: the siren label, which the dataset-building
# loop sets to 1 only for emergency-vehicle rows paired with a siren clip.
y_train = np.array(data_train['siren_or_not'].tolist())

# Concatenate the feature vectors
merged_features = Concatenate()([sound_features, img_features])

# Add additional layers for joint representation
combined_features = Dense(128, activation='relu')(merged_features)
combined_output = Dense(1, activation='sigmoid')(combined_features)

# Create the combined model. It reuses the layers of sound_model and
# img_model, so training it also updates the weights of those two models.
combined_model = Model(inputs=[sound_model.input, img_model.input], outputs=combined_output)

# Compile the model
combined_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the combined model using both sound and image data
combined_model.fit([x_train_sound, x_train_img], y_train, epochs=10, batch_size=32)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x1f0f232c510>