In [80]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from sklearn.preprocessing import StandardScaler
import os

In [81]:
# Read the full table, then keep a reproducible 10% random sample
# (fixed seed) with a fresh 0..n-1 index.
data = (
    pd.read_csv("dataset.csv")
    .sample(frac=0.1, random_state=152)
    .reset_index(drop=True)
)
data.head()

Unnamed: 0,IMAGE,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,...,mfcc_97,mfcc_98,mfcc_99,mfcc_100,mfcc_101,mfcc_102,mfcc_103,mfcc_104,CLASS1,CLASS2
0,images/restaurant8/restaurant8-100.png,15.165948,-2.141235,9.28642,1.276252,1.978727,-7.14175,-7.757782,-22.555491,-22.95111,...,-16.540592,9.08415,-32.186303,6.222683,-20.351334,4.762919,-34.971174,-7.625437,INDOORS,RESTAURANT
1,images/forest3/forest356.png,17.045248,-6.22993,-18.036703,-51.227805,-1.43546,3.107496,34.402218,-15.338986,-11.933055,...,-42.132071,26.142573,-12.519227,8.086807,-1.649185,-11.968917,-0.545824,-5.667114,OUTDOORS,FOREST
2,images/store4/store4-324.png,17.652627,-2.283363,-5.452459,0.594263,1.907081,5.639656,7.771414,4.310419,-2.162805,...,-38.625909,28.622475,-19.566231,-7.562156,-2.760357,-8.09156,10.170529,0.616509,INDOORS,GROCERY-STORE
3,images/jungle/jungle401.png,19.991208,0.009975,-15.222428,-40.892043,23.677698,-16.791392,6.900601,-14.606536,17.603168,...,-49.188256,7.746468,-18.921268,23.668902,-26.629859,-26.41588,30.286166,-22.516286,OUTDOORS,JUNGLE
4,images/guangzhou/guangzhou-263.png,21.205595,0.406901,-22.180693,-64.185971,-13.192887,-19.114017,43.447216,-5.498353,16.93292,...,-15.448265,38.640644,-14.117458,9.2459,-3.85471,6.79236,-12.728162,0.508508,OUTDOORS,CITY


In [82]:
# Every column whose name contains 'mfcc' is treated as an audio feature.
mfcc_columns = [name for name in data.columns if 'mfcc' in name]

# Remove any character that is not a digit, '.' or '-' from string cells,
# then cast the whole block to float and pull out the raw array.
audio_features = (
    data[mfcc_columns]
    .replace(to_replace='[^.0-9-]', value='', regex=True)
    .astype(float)
    .values
)

# VGG16 with ImageNet weights and without its classification head
# (include_top=False), taking 224x224 RGB inputs.
vgg_model = VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

In [83]:
# Shared scaler object; here it is fitted on the audio features and
# immediately used to standardise them.
scaler = StandardScaler()
audio_features_scaled = scaler.fit_transform(X=audio_features)

In [84]:
def train_and_evaluate(X, y, label, random_state=42):
    """Train a decision tree on an 80/20 split and print its test accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Target labels, aligned with the rows of ``X``.
    label : str
        Prefix used in the printed ``"<label> Accuracy: <value>"`` line.
    random_state : int, default 42
        Seed used for both the train/test split and the tree itself, so that
        re-running the cell prints the same accuracy.

    Returns
    -------
    None
        The accuracy is printed, not returned.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )
    # Without a seed the tree can differ between runs (features are randomly
    # permuted at each split), which makes the reported numbers irreproducible.
    model = DecisionTreeClassifier(random_state=random_state)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"{label} Accuracy:", accuracy_score(y_test, y_pred))

## Sound

In [85]:
# Audio-only models: one per label column.
train_and_evaluate(
    X=audio_features_scaled,
    y=data['CLASS1'],
    label="Audio CLASS1 (Outdoors/Indoors)",
)
train_and_evaluate(
    X=audio_features_scaled,
    y=data['CLASS2'],
    label="Audio CLASS2 (FOREST, CITY, etc.)",
)

Audio CLASS1 (Outdoors/Indoors) Accuracy: 0.7333333333333333
Audio CLASS2 (FOREST, CITY, etc.) Accuracy: 0.5884057971014492


## Images

In [86]:
def extract_image_features_batch(image_paths, batch_size=32):
    """Extract flattened VGG16 feature vectors for a sequence of image files.

    Each image that exists on disk is loaded at 224x224, passed through
    ``preprocess_input`` and sent to the module-level ``vgg_model`` in batches.
    Row ``i`` of the result always corresponds to the ``i``-th entry of
    ``image_paths``; entries that do not point to an existing file get an
    all-zero row.

    Parameters
    ----------
    image_paths : iterable of str
        Image file paths (list, pandas Series, generator, ...). Only the
        iteration order is used.
    batch_size : int, default 32
        Number of images accumulated before calling ``vgg_model.predict``.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per input path. When no image could be loaded
        the row width falls back to ``7 * 7 * 512``.
    """
    paths = list(image_paths)
    # One slot per input path: features are written back by position so that
    # missing files can never shift rows relative to the labels.
    rows = [None] * len(paths)
    pending_images = []
    pending_slots = []

    def flush():
        # Skip the model call when there is nothing queued (e.g. all recent
        # paths were missing); predicting on an empty batch would fail.
        if not pending_images:
            return
        # verbose=0 avoids one progress bar per batch in the cell output.
        batch_features = vgg_model.predict(np.array(pending_images), verbose=0)
        flat = batch_features.reshape(batch_features.shape[0], -1)
        for slot, vector in zip(pending_slots, flat):
            rows[slot] = vector
        pending_images.clear()
        pending_slots.clear()

    for slot, path in enumerate(paths):
        if not os.path.isfile(path):
            continue  # slot stays None and becomes a zero row below
        img = load_img(path, target_size=(224, 224))
        pending_images.append(preprocess_input(img_to_array(img)))
        pending_slots.append(slot)
        if len(pending_images) >= batch_size:
            flush()
    flush()  # remaining partial batch

    # Use the width of a real feature vector when available; otherwise the
    # flattened VGG16 (include_top=False) output size for 224x224 inputs.
    feature_dim = next(
        (row.shape[0] for row in rows if row is not None),
        7 * 7 * 512,
    )
    if not rows:
        return np.zeros((0, feature_dim))

    zero_row = np.zeros((feature_dim,))
    return np.array([zero_row if row is None else row for row in rows])

# Run every image listed in the IMAGE column through the VGG16 extractor.
image_features = extract_image_features_batch(
    image_paths=data['IMAGE'],
    batch_size=32,
)
# NOTE: this refits the shared `scaler`, replacing the statistics it learned
# from the audio features earlier.
image_features_scaled = scaler.fit_transform(X=image_features)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 6s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step
[1m1/1[0m [32m━━━

In [87]:
# Image-only models: one per label column.
train_and_evaluate(
    X=image_features_scaled,
    y=data['CLASS1'],
    label="Image CLASS1 (Outdoors/Indoors)",
)
train_and_evaluate(
    X=image_features_scaled,
    y=data['CLASS2'],
    label="Image CLASS2 (FOREST, CITY, etc.)",
)

Image CLASS1 (Outdoors/Indoors) Accuracy: 0.8840579710144928
Image CLASS2 (FOREST, CITY, etc.) Accuracy: 0.7101449275362319


## Combined

In [88]:
# Place image and audio features side by side (column-wise), once for the
# raw arrays and once for the standardised ones.
combined_features = np.concatenate((image_features, audio_features), axis=1)
combined_features_scaled = np.concatenate(
    (image_features_scaled, audio_features_scaled),
    axis=1,
)

In [89]:
# Image + audio models: one per label column.
train_and_evaluate(
    X=combined_features_scaled,
    y=data['CLASS1'],
    label="Combined CLASS1 (Outdoors/Indoors)",
)
train_and_evaluate(
    X=combined_features_scaled,
    y=data['CLASS2'],
    label="Combined CLASS2 (FOREST, CITY, etc.)",
)

Combined CLASS1 (Outdoors/Indoors) Accuracy: 0.8898550724637682
Combined CLASS2 (FOREST, CITY, etc.) Accuracy: 0.7159420289855073
