In [1]:
import numpy as np
import librosa
import os
import pickle
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Input/output locations.
# NOTE: these are absolute, machine-specific Windows paths (written with forward
# slashes); they must be edited before running on another machine or OS.
data_directory = "C:/Users/HomePC/Desktop/RainfallDetector/data/raw_audio"  # root folder scanned for audio
model_save_path = "C:/Users/HomePC/Desktop/RainfallDetector/app/model.pkl"  # destination of the pickled model

# Feature extraction function using MFCC
def extract_mfcc_features(file_path, n_mfcc=40):
    """
    Extracts MFCC (Mel-frequency cepstral coefficients) features from an audio file.

    The file is decoded with librosa using the faster 'kaiser_fast' resampler,
    its MFCC matrix is computed, and every coefficient is averaged over all
    frames. Each file is therefore summarised by a single fixed-length vector,
    whatever its duration.

    Args:
        file_path: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 40, the
            value this function previously hard-coded.

    Returns:
        A 1-D NumPy array of length ``n_mfcc`` holding the per-coefficient
        mean over time.
    """
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    # librosa returns the coefficients as rows and the frames as columns;
    # transposing lets the mean over axis 0 collapse the time dimension.
    mfcc_matrix = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    mfccs = np.mean(mfcc_matrix.T, axis=0)
    return mfccs

# Function to load audio files and their corresponding labels
def load_audio_data(data_folder):
    """
    Loads audio file paths and their associated labels from the given folder.

    The folder is searched recursively. Every file whose name ends in ".wav"
    (compared case-insensitively, so ".WAV" is accepted too) is collected, and
    its label is the name of the directory that directly contains it. A file
    sitting directly in ``data_folder`` is labelled with the name of
    ``data_folder`` itself.

    Directories and files are visited in sorted order, so the returned lists
    are identical from run to run and from one filesystem to another. That
    keeps a seeded train/test split on the result reproducible.

    Args:
        data_folder: Root directory to search.

    Returns:
        A tuple ``(audio_files, labels)`` of two equally long lists, where
        ``labels[i]`` belongs to ``audio_files[i]``. Both lists are empty when
        the folder does not exist or contains no matching files.
    """
    audio_files = []
    labels = []

    for root, dirs, files in os.walk(data_folder):
        # Sorting in place controls the order in which os.walk descends into
        # sub-directories; the raw order is filesystem-dependent.
        dirs.sort()
        for file in sorted(files):
            # Case-insensitive match so recordings saved as ".WAV" are not skipped.
            if file.lower().endswith('.wav'):
                audio_files.append(os.path.join(root, file))
                labels.append(os.path.basename(root))  # Label is the folder name

    return audio_files, labels

# Load audio files and labels
audio_files, labels = load_audio_data(data_directory)

# Nothing to train on if the directory holds no audio files
if not audio_files:
    print("No audio files found in the directory.")
else:
    # Extract one feature vector per audio file. X and y therefore always have
    # exactly one entry per discovered file, so no separate emptiness check is
    # needed once audio_files is known to be non-empty.
    X = [extract_mfcc_features(file) for file in audio_files]
    y = labels

    # Split the data into training and testing sets (80% training, 20% testing)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize and train the RandomForest model
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = rf_model.predict(X_test)

    # Evaluate the model's accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Model Accuracy: {accuracy:.4f}")

    # Make sure the destination folder exists so a missing directory cannot
    # throw away the freshly trained model.
    model_dir = os.path.dirname(model_save_path)
    if model_dir:
        os.makedirs(model_dir, exist_ok=True)

    # Save the trained model to a file
    with open(model_save_path, 'wb') as model_file:
        pickle.dump(rf_model, model_file)

    print("Model saved successfully!")


Model Accuracy: 0.7143
Model saved successfully!
