In [2]:
from google.colab import drive
import pandas as pd

# Mount Google Drive into the Colab filesystem at /content/drive.
drive.mount('/content/drive')
# List the top level of the mounted Drive (IPython shell escape, not Python).
!ls "/content/drive/My Drive"

# Read the train and test CSV files from Drive into DataFrames.
# NOTE(review): train.csv is read again by the training cell further down, and
# test_labels_df is not referenced anywhere else in the visible notebook --
# confirm whether these two reads are still needed.
# NOTE(review): the two files live under different folders
# ('audio datasets2' vs 'audio datasets1') -- confirm this is intentional.
train_labels_df = pd.read_csv('/content/drive/My Drive/urbansoundclassification/audio datasets2/train.csv')
test_labels_df = pd.read_csv('/content/drive/My Drive/urbansoundclassification/audio datasets1/test.csv')

Mounted at /content/drive
'Colab Notebooks'  'document '	 urbansoundclassification


In [1]:
import pandas as pd
import soundfile as sf
import librosa
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, cross_val_score
import joblib
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Function to extract features from an audio file
def extract_features(file_name):
    """Compute a fixed-length MFCC summary vector for one audio file.

    The file is decoded with ``soundfile``. Audio with more than one
    dimension is averaged along axis 1 to obtain a single channel, 40 MFCCs
    are computed with ``librosa`` and the resulting matrix is averaged along
    axis 1.

    Args:
        file_name: Path of the audio file to read.

    Returns:
        The per-coefficient mean of the MFCC matrix, or ``None`` if any step
        raises (the error is printed rather than propagated).
    """
    try:
        signal, rate = sf.read(file_name)
        # A second dimension means several channels: collapse them to mono.
        mono = signal.mean(axis=1) if signal.ndim > 1 else signal
        coefficients = librosa.feature.mfcc(y=mono, sr=rate, n_mfcc=40)
        return coefficients.mean(axis=1)
    except Exception as err:
        # Best effort: report the failing file and let the caller skip it.
        print(f"Error processing {file_name}: {err}")
        return None

# Function to process audio directory and extract features
def process_audio_directory(directory):
    """Extract features for every ``.wav`` file directly inside ``directory``.

    Files are visited in sorted name order and the extension is matched
    case-insensitively. Files for which ``extract_features`` returns ``None``
    are skipped.

    Args:
        directory: Path of the folder holding the audio files.

    Returns:
        A DataFrame with one row per successfully processed file: the feature
        values in integer-named columns plus a ``'file_name'`` column.
        ``None`` (after printing a message) when ``directory`` is not an
        existing directory or no file produced features.
    """
    # isdir (rather than exists) also rejects a path that points at a regular
    # file, which os.listdir would fail on with NotADirectoryError.
    if not os.path.isdir(directory):
        print(f"Error: Directory '{directory}' not found.")
        return None

    features = []
    file_names = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded downstream split) reproducible.
    for file_name in sorted(os.listdir(directory)):
        # Case-insensitive match so files named '*.WAV' are not skipped.
        if not file_name.lower().endswith('.wav'):
            continue
        feature_vector = extract_features(os.path.join(directory, file_name))
        if feature_vector is not None:
            features.append(feature_vector)
            file_names.append(file_name)

    if not features:
        print(f"No .wav files found in '{directory}' or all files caused errors.")
        return None

    features_df = pd.DataFrame(features)
    features_df['file_name'] = file_names
    return features_df

# Load labels
def load_labels(file_path):
    """Read a labels CSV into a DataFrame.

    Args:
        file_path: Path of the CSV file.

    Returns:
        The parsed DataFrame, or ``None`` (after printing a message) when the
        file does not exist. Other read errors propagate to the caller.
    """
    labels = None
    try:
        labels = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    return labels

# Function to extract ID from file name
def extract_id_from_filename(file_name):
    """Parse the integer ID that precedes the first ``'.'`` in a file name.

    Args:
        file_name: A name such as ``'123.wav'``.

    Returns:
        The integer prefix (``123`` for ``'123.wav'``), or ``None`` when
        ``file_name`` is not a string or the text before the first dot is not
        a valid integer.
    """
    # Non-string input (e.g. None or NaN from a DataFrame column) carries no
    # ID; return None like every other unparseable value instead of raising.
    if not isinstance(file_name, str):
        return None

    # partition always yields a first element, so no IndexError is possible.
    stem = file_name.partition('.')[0]
    try:
        return int(stem)
    except ValueError:
        return None

# Optional plotting function
def plot_confusion_matrix(conf_matrix, y_labels):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        conf_matrix: Matrix of counts; each cell is annotated using the
            integer format ``'d'``.
        y_labels: Tick labels applied to both the x and the y axis.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    heatmap_options = dict(
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=y_labels,
        yticklabels=y_labels,
    )
    sns.heatmap(conf_matrix, ax=ax, **heatmap_options)
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted')
    ax.set_ylabel('Actual')
    plt.show()

def plot_feature_importance(importances):
    """Show a bar chart with one bar per entry of ``importances``.

    Args:
        importances: Sequence of importance values; the bar position is the
            index of the value in the sequence.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    positions = range(len(importances))
    ax.bar(positions, importances)
    ax.set_title('Feature Importance')
    ax.set_xlabel('Feature Index')
    ax.set_ylabel('Importance')
    plt.show()

def plot_predicted_class_distribution(predictions):
    """Show how often each class occurs in the predictions as a bar chart.

    Args:
        predictions: DataFrame with a ``'predicted_class'`` column.
    """
    class_counts = predictions['predicted_class'].value_counts()
    ax = class_counts.plot(kind='bar', figsize=(10, 6))
    ax.set_title('Predicted Class Distribution on New Test Set')
    ax.set_xlabel('Class')
    ax.set_ylabel('Frequency')
    plt.show()

# Set this flag to True to enable plotting
ENABLE_PLOTTING = True

# One seed shared by the hold-out split and the forest, so that re-running the
# notebook reproduces the same model and metrics.
RANDOM_STATE = 42

# Process train dataset
train_audio_directory = '/content/drive/My Drive/urbansoundclassification/audio datasets2/Train'
train_features_df = process_audio_directory(train_audio_directory)

if train_features_df is not None:
    # Load train labels
    train_labels_df = load_labels('/content/drive/My Drive/urbansoundclassification/audio datasets2/train.csv')

    if train_labels_df is not None:
        # Derive the integer ID from each file name and drop rows without one.
        # The explicit .copy() makes the filtered frame independent, so the
        # column assignment that follows cannot raise SettingWithCopyWarning.
        train_features_df['ID'] = train_features_df['file_name'].apply(extract_id_from_filename)
        train_features_df = train_features_df.dropna(subset=['ID']).copy()
        train_features_df['ID'] = train_features_df['ID'].astype(int)
        train_labels_df['ID'] = train_labels_df['ID'].astype(int)

        # Merge features with labels (pd.merge defaults to an inner join, so
        # files without a label and labels without a file are discarded)
        merged_df = pd.merge(train_features_df, train_labels_df, on='ID')

        # Separate features and labels
        X = merged_df.drop(columns=['file_name', 'ID', 'Class'])  # Drop non-numeric columns
        y = merged_df['Class']

        # Sorted list of every class in the labelled data. It is used both to
        # build and to annotate the confusion matrix so rows, columns and tick
        # labels always line up, even if a class is absent from the hold-out.
        class_labels = np.unique(y)

        # Split the data into 70% training and 30% testing
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=RANDOM_STATE
        )

        # Scale features (the scaler is fitted on the training part only)
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # RandomForestClassifier with 100 trees; seeded for reproducibility
        model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

        # Perform 5-fold cross-validation on the training set
        cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5)

        # Print cross-validation results
        print(f"Cross-validation accuracy scores: {cv_scores}")
        print(f"Average cross-validation accuracy: {cv_scores.mean():.4f}")

        # Train the final model on the entire training set
        model.fit(X_train_scaled, y_train)

        # Predict on the test set
        y_test_pred = model.predict(X_test_scaled)

        # Calculate test accuracy
        test_accuracy = accuracy_score(y_test, y_test_pred)

        # Print test set accuracy
        print(f"Test Set Accuracy: {test_accuracy:.4f}")

        # Confusion Matrix Plot
        if ENABLE_PLOTTING:
            conf_matrix = confusion_matrix(y_test, y_test_pred, labels=class_labels)
            plot_confusion_matrix(conf_matrix, class_labels)

        # Classification Report
        print("\nClassification Report:\n", classification_report(y_test, y_test_pred))

        # Feature Importance Plot
        if ENABLE_PLOTTING:
            feature_importances = model.feature_importances_
            plot_feature_importance(feature_importances)

        # Save the trained model and scaler
        model_path = '/content/drive/My Drive/urbansoundclassification/random_forest_model.pkl'
        scaler_path = '/content/drive/My Drive/urbansoundclassification/scaler.pkl'
        joblib.dump(model, model_path)
        joblib.dump(scaler, scaler_path)

        # Reload the artifacts that were just written, so the predictions below
        # come from exactly what is stored on disk. joblib.load unpickles the
        # file, so only load artifacts you wrote yourself.
        model = joblib.load(model_path)
        scaler = joblib.load(scaler_path)

        # Process the test dataset for new predictions
        test_audio_directory = '/content/drive/My Drive/urbansoundclassification/audio datasets1/Test'
        test_features_df = process_audio_directory(test_audio_directory)

        if test_features_df is not None:
            # Scale new test features with the scaler fitted on the training data
            X_new_test = test_features_df.drop(columns=['file_name'])
            X_new_test_scaled = scaler.transform(X_new_test)

            # Predict on the new test set
            test_predictions = model.predict(X_new_test_scaled)

            # Create a DataFrame for the predictions
            output_df = pd.DataFrame({
                'file_name': test_features_df['file_name'],
                'predicted_class': test_predictions
            })

            # Save predictions to a CSV file
            output_csv_path = '/content/drive/My Drive/urbansoundclassification/predictions.csv'
            output_df.to_csv(output_csv_path, index=False)
            print(f"Predictions saved to '{output_csv_path}'")

            # Optional: Plot predicted class distribution
            if ENABLE_PLOTTING:
                plot_predicted_class_distribution(output_df)

else:
    print("Error: No valid features extracted from the training dataset.")


Error: Directory '/content/drive/My Drive/urbansoundclassification/audio datasets2/Train' not found.
Error: No valid features extracted from the training dataset.
