<a href="https://colab.research.google.com/github/mariampinel/Deception-Detector/blob/main/Untitled4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
# Import necessary packages
!pip install pydub
! pip install numpy librosa matplotlib
! pip install tensorflow

from google.colab import drive

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os, sys, re, pickle, glob
import urllib.request
import zipfile

import IPython.display as ipd
from tqdm import tqdm
import librosa
import librosa.display
import math

from pydub import AudioSegment
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout




In [9]:
#Install library - make sure you have version 1.0.0.4

!pip install mlend==1.0.0.4



In [10]:
#Import library and functions

import mlend
from mlend import download_deception_small, deception_small_load



In [11]:

#Download small data
datadir = download_deception_small(save_to='MLEnd', subset={}, verbose=1, overwrite=False)


Downloading 100 stories (audio files) from https://github.com/MLEndDatasets/Deception
  1%|[92m                                                  [0m|100\1|00001.wav



100%|[92m▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓▓[0m|100\100|00100.wav
Done!
Total 100 found in MLEnd/deception/MLEndDD_stories_small/


In [19]:
#Read file paths
TrainSet, TestSet, MAPs = deception_small_load(datadir_main=datadir, train_test_split=0.8, verbose=1, encode_labels=True)
# Extract file paths (X) and labels (y) from TrainSet
X_train_full = np.array(TrainSet['X_paths'])  # Full training file paths
y_train_full = np.array(TrainSet['Y_encoded'])  # Corresponding labels

X_test = np.array(TestSet['X_paths'])
y_test = np.array(TestSet['Y_encoded'])



Total 100 found in MLEnd/deception/MLEndDD_stories_small/


In [21]:
# Split into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    test_size=0.25,  # 25% of TrainSet goes to Validation
    stratify=y_train_full,
    random_state=42
)

In [70]:
# Break down files into 30s files- 1) For training set, 2) for validation set 3) for test set

def split_audio(file_path, segment_length=30*1000):
    audio = AudioSegment.from_file(file_path)
    total_length = len(audio)
    num_segments = math.ceil(total_length  / segment_length )
    segment_paths = []

    for i in range(num_segments):
        start_time = max(0, i * segment_length )
        end_time = min(total_length, start_time + segment_length)
        segment = audio[start_time:end_time]

        output_file = f"{file_path[:-4]}_part{i+1}.wav"
        segment.export(output_file, format="wav")
        segment_paths.append(output_file)

    return segment_paths
# Step 2: Generate 30-second segments for each dataset
def generate_segments_and_labels(file_paths, labels):
    segment_paths = []
    segment_labels = []

    for file_path, label in zip(file_paths, labels):
        # Split audio into segments
        segments = split_audio(file_path, segment_length=30*1000)
        # Append segments and their corresponding labels
        segment_paths.extend(segments)
        segment_labels.extend([label] * len(segments))
    # Check if the number of labels matches the number of segments
    if len(segment_paths) != len(segment_labels):
        raise ValueError(f"Mismatch: {len(segment_paths)} segments and {len(segment_labels)} labels")

    return segment_paths, segment_labels

In [71]:
# Generate 30-second segments for each dataset - PUT THIS PART AT THE END. JUST BEFRO RUNNING MODEL
X_train_segments, y_train_segments = generate_segments_and_labels(X_train, y_train)
X_valid_segments, y_valid_segments = generate_segments_and_labels(X_valid, y_valid)
X_test_segments, y_test_segments = generate_segments_and_labels(X_test, y_test)




In [72]:
# Maria's extraction function

def extract_features(file_path, scale_audio=False, sr=22050):
    # Load the audio file (30 seconds duration)
    x, fs = librosa.load(file_path, sr=sr, duration=30.0)

    if scale_audio:
        x = x / np.max(np.abs(x))  # Normalize audio
    # Extract MFCCs from the full signal
    mfccs = librosa.feature.mfcc(y=x, sr=fs, n_mfcc=40)
    mfccs_first_5 = mfccs[:5, :]  # Slicing to keep the first 5 MFCC coefficients
    mfccs_mean = np.mean(mfccs_first_5, axis=1)  # Mean for each MFCC
    mfccs_std = np.std(mfccs_first_5, axis=1)

    # # Harmonics Plus Noise Model (HNM) - Using HPSS as an approximation
    # harmonic, noise = librosa.effects.hpss(x)

    # # Extract MFCCs from the harmonic and noise components
    # harmonic_mfccs = librosa.feature.mfcc(y=harmonic, sr=fs, n_mfcc=5)
    # harmonic_mfccs_mean = np.mean(harmonic_mfccs, axis=1)
    # harmonic_mfccs_std = np.std(harmonic_mfccs, axis=1)

    # noise_mfccs = librosa.feature.mfcc(y=noise, sr=fs, n_mfcc=5)
    # noise_mfccs_mean = np.mean(noise_mfccs, axis=1)
    # noise_mfccs_std = np.std(noise_mfccs, axis=1)

    # Combine all the features into one array
    features = np.hstack([ mfccs_mean, mfccs_std
      # Adding HNR to the feature vector
    ])
    return features



In [74]:
# OR THIS ONE:
def segment_feature_extraction(segment_paths):
    features = []
    for segment_path in segment_paths:
        features.append(extract_features(segment_path))
    return features


In [76]:

 #Extract features for each dataset - Put at the end
X_train_features = segment_feature_extraction(X_train_segments)
X_valid_features = segment_feature_extraction(X_valid_segments)
X_test_features = segment_feature_extraction(X_test_segments)

In [82]:
# Reshape features to add a channel dimension if using CNN
X_train_features = np.array(X_train_features)
X_valid_features =np.array(X_train_features)
X_test_features = np.array(X_test_features)
# X_train_features = X_train_features[...
# Convert the list to a NumPy array
y_train_segments = np.array(y_train_segments)
y_valid_segments = np.array(y_valid_segments)
y_test_segments = np.array(y_test_segments)

print(y_train_segments.shape[0], X_train_features.shape)

318 (318, 10)




# Model

In [93]:
# Reshape the input features to match the CNN input
X_train_features = X_train_features.reshape(X_train_features.shape[0], X_train_features.shape[1], 1)  # Shape becomes (318, 10, 1)
X_valid_features = X_valid_features.reshape(X_valid_features.shape[0], X_valid_features.shape[1], 1)  # Shape becomes (X_valid_features.shape[0], 10, 1)
X_test_features = X_test_features.reshape(X_test_features.shape[0], X_test_features.shape[1], 1)  # Shape becomes (X_test_features.shape[0], 10, 1)


In [97]:
from tensorflow.keras.utils import to_categorical

# Ensure y_train_segments contains the original labels before one-hot encoding
y_train_segments = to_categorical(y_train_segments, num_classes=2)

# Now y_train_segments should have shape (318, 2), which is correct for binary classification
print("Shape of y_train_segments:", y_train_segments.shape)  # Should be (318, 2)

y_valid_segments = to_categorical(y_valid_segments, num_classes=2)


Shape of y_train_segments: (318, 2, 2, 2)


In [94]:
# Check the shapes of your data
print("Shape of X_train_features:", X_train_features.shape)  # Should be (318, 10, 1)
print("Shape of y_train_segments:", y_train_segments.shape)  # Should be (318, 2)



Shape of X_train_features: (318, 10, 1)
Shape of y_train_segments: (318, 2, 2)


In [96]:
num_features= 9
#Build sequential CNN
CNN_model = Sequential()

# Add Conv1D layers (since you're working with 1D features)
CNN_model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=( 10, 1)))  # 10 features per sample, 1 channel
CNN_model.add(MaxPooling1D(pool_size=2))
CNN_model.add(Flatten())  # Flatten the output for the dense layer
CNN_model.add(Dense(2, activation='softmax'))

# CNN_model.add(Dense(1, activation='sigmoid'))  # Assuming binary classification (2 classes)



In [88]:
# #Compile the model
# CNN_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Compile the model
CNN_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [92]:

# Fit the model
cnn_results = CNN_model.fit(X_train_features, y_train_segments,
                            batch_size=64,
                            epochs=25,
                            verbose=1,
                            validation_data=(X_valid_features, y_valid_segments))

Epoch 1/25


ValueError: Cannot take the length of shape with unknown rank.

In [30]:
# Compile the model with the desired loss function, optimizer, and metric to optimize
CNN_model.compile(loss = 'binary_crossentropy',
                  optimizer = 'Adam',
                  metrics = ['accuracy'])


In [41]:
# Scale data- to prevent model to fit volume level of recordings
from sklearn.preprocessing import StandardScaler
#Normalize the data
scaler = StandardScaler()
scaler.fit(X_train_features)
X_train_scalled = scaler.transform(X_train_features)
X_test_scalled = scaler.transform(X_test_features)

In [42]:
print(f"X_train_features shape: {X_train_features.shape}")
print(f"y_train_segments shape: {y_train_segments.shape}")

AttributeError: 'list' object has no attribute 'shape'

In [40]:
#Model fit
cnn_results = CNN_model.fit(X_train_features, y_train_segments,
              batch_size = 64,
              epochs = 25,
              verbose = 1,
              validation_data = (X_valid_features, y_valid_segments))

y_pred= CNN_model.predict(X_test_features)
y_pred_clases=np.argmax(y_pred,axis=1)

ValueError: Data cardinality is ambiguous. Make sure all arrays contain the same number of samples.'x' sizes: 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10
'y' sizes: 318
