### Load the dataset from a CSV file and Create DataFrame
Print the first few rows of the DataFrame to quickly inspect the data

In [1]:
import pandas as pd

# Read the audio-feature table from the CSV file in the working directory.
data = pd.read_csv(filepath_or_buffer='final_Audio_dataset.csv')
print("Loaded Data:")
# Trailing expression: the notebook renders the first five rows.
data.head(n=5)

Initially, the values in the RMS_Energy column are stored as strings, and they contain square brackets. This format is not suitable for numerical analysis, and hence, we need to clean and convert the data.

In [2]:
# Remove both square-bracket characters from the text form of RMS_Energy and
# parse the remainder as float. Casting to str first keeps the cell re-runnable
# after the column has already been converted.
data['RMS_Energy'] = (
    data['RMS_Energy']
    .astype(str)
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
    .astype(float)
)

# Utils
Required Functions


---



###   **K_fold_Crossvalidation Function**:  

---



  *  This function provides an efficient way to compare classification models and evaluate various features and feature reduction techniques.

  *   Using the KFold class from sklearn.model_selection, the data is split into k folds (default is 5). This ensures that each fold is used as a test set once, and as part of the training set k-1 times.

  *   Standard scaling is applied to the feature sets using StandardScaler to normalize the data
  *   If a feature reduction function is provided, we fit it using the training data and then apply it to the test data. This step ensures that the feature reduction technique is appropriately trained on the training set before being applied to unseen test data.
  *  The average accuracy score across all folds is calculated to provide an overall performance metric. Additionally, the average confusion matrix is plotted to visualize the performance.

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

def k_fold_cross_validation(df, feature_names, target_name, model, feature_reduction_func=None, k=5):
    """
    Evaluate a classifier with shuffled k-fold cross-validation.

    For every fold the features are standardised with statistics taken from
    the training split only, optionally passed through a feature-reduction
    transformer fitted on the training split, and the model is re-fitted and
    scored on the held-out split. The mean of the per-fold confusion matrices
    is drawn as a heatmap before the function returns.

    Parameters:
    df (pd.DataFrame): The input data frame.
    feature_names (list): List of feature column names.
    target_name (str): The name of the target column.
    model: Classifier exposing fit(X, y) and predict(X). The same object is
        re-fitted on every fold.
    feature_reduction_func: Transformer exposing fit_transform(X, y) and
        transform(X), or None (any falsy value) to skip feature reduction.
    k (int): The number of folds for cross-validation (default is 5).

    Returns:
    tuple[float, float]: Mean and standard deviation of the per-fold
        accuracies, both expressed in percent.
    """
    X = df[feature_names]
    y = df[target_name]

    # Fix the label set once so that every fold produces a confusion matrix of
    # the same shape, even when a class is absent from a fold's test split.
    # Without this the per-fold matrices can differ in size and cannot be averaged.
    class_labels = np.unique(y)

    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    accuracies = []
    confusion_matrices = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Feature scaling: fit on the training split only to avoid leaking
        # test-split statistics into the model.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Feature reduction: fitted on the training split, applied to both.
        if feature_reduction_func:
            X_train_scaled = feature_reduction_func.fit_transform(X_train_scaled, y_train)
            X_test_scaled = feature_reduction_func.transform(X_test_scaled)

        model.fit(X_train_scaled, y_train)

        # Make predictions
        y_pred = model.predict(X_test_scaled)

        # Calculate accuracy
        accuracies.append(accuracy_score(y_test, y_pred))

        # Generate confusion matrix over the full, fixed label set
        confusion_matrices.append(confusion_matrix(y_test, y_pred, labels=class_labels))

    # Calculate average accuracy and its spread across folds (in percent)
    avg_accuracy = np.mean(accuracies) * 100
    std_accuracy = np.std(accuracies) * 100

    # Plot the average confusion matrix
    avg_conf_matrix = np.mean(confusion_matrices, axis=0)
    plt.figure(figsize=(5, 3))
    sns.heatmap(avg_conf_matrix, annot=True, fmt='.2f', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Average Confusion Matrix')
    plt.show()

    return avg_accuracy, std_accuracy


### **Visualization of Cross-Validation Results**
### **Plot_Results_Barchart**

---


To effectively compare the performance of various classification models and feature reduction techniques, we utilize a bar chart visualization. The Plot_Results_Barchart function is designed to generate and display these bar charts, providing clear and intuitive insights into the cross-validation results.

For each feature reduction method, the function filters the results and plots a bar chart showing the mean accuracy of different models for each feature set. The maximum accuracy for each feature reduction method is printed, and bars with the highest accuracy are highlighted with red edges.


In [4]:
def Plot_Results_Barchart(results_df):
    """
    Draw one grouped bar chart of mean accuracy per feature-reduction method.

    For each distinct value in the 'Feature Reduction' column the matching
    rows are plotted with 'Feature Set' on the x-axis and one bar per 'Model'.
    The highest 'Mean Accuracy' among those rows is printed, and every bar
    whose height equals it is outlined in red; all other bars get a thin
    black outline.

    Parameters:
    results_df (pd.DataFrame): Must provide the columns 'Feature Reduction',
        'Feature Set', 'Model' and 'Mean Accuracy'.

    Returns:
    None. Each figure is displayed with plt.show().
    """
    # Note: this changes seaborn's default palette for the rest of the session.
    sns.set_palette("Set2")

    for reduction in results_df['Feature Reduction'].unique():
        # Rows belonging to the current feature reduction method
        subset = results_df[results_df['Feature Reduction'] == reduction]
        max_value = subset['Mean Accuracy'].max()
        print(f"Maximum accuracy if feature reduction({reduction}): {max_value:.2f}%")

        plt.figure(figsize=(7, 5))
        ax = sns.barplot(data=subset, x='Feature Set', y='Mean Accuracy', hue='Model', errorbar=None, width=0.4)

        # Outline every bar; the best-scoring bar(s) get a thicker red edge.
        for bar in ax.patches:
            is_best = bar.get_height() == max_value
            bar.set_linewidth(2 if is_best else 0.5)
            bar.set_edgecolor('red' if is_best else 'black')

        # Titles, labels and axis cosmetics
        ax.set_title(f'Cross-Validation Mean Accuracy for Different Models ({reduction})', fontsize=10)
        ax.set_xlabel('Feature Set', fontsize=8)
        ax.set_ylabel('Mean Accuracy', fontsize=8)
        ax.set_ylim(0, 100)
        plt.xticks(fontsize=8)
        plt.yticks(fontsize=8)
        # Legend sits below the axes so it does not cover the bars.
        ax.legend(title='Model', loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=3, fontsize=8)

        plt.tight_layout()
        plt.show()


### **Classification Using Different Models, Feature Reduction, and Feature Sets**
### **Run_Classification**

---
The Run_Classification function is a comprehensive approach designed to evaluate the performance of various classification models using different feature sets and feature reduction techniques. This function leverages *k-fold cross-validation*(**k=4**) to provide robust performance metrics for each combination of model, feature set, and feature reduction method.
*   Various classification models are defined for evaluation:
  *   K-Nearest Neighbors (KNN)
  *   Logistic Regression
  *   Support Vector Machine (SVM)


*   Feature Reduction Techniques:
  *   Linear Discriminant Analysis (LDA)
  *   Principal Component Analysis (PCA)
  *   No feature reduction (None)

*   Defining Feature Sets:
  *   mfcc_features: Mel-Frequency Cepstral Coefficients (MFCC) features.
  *   spectral_contrast_features: Spectral Contrast features.
  *   combined_features: A combination of MFCC and Spectral Contrast features.
  *   time_features: Time-domain features such as Zero Crossing Rate and RMS Energy.
  *   Spec_cent_Bw: Spectral Centroid and Spectral Bandwidth features.
  *   all_features: A combination of all the above feature sets.

The function returns a results DataFrame containing the performance metrics (mean and standard deviation of accuracy) for each combination.


In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

def Run_Classification(balanced_dataFrame, k=4):
    """
    Cross-validate every model / feature-set / feature-reduction combination.

    Three classifiers (KNN, Logistic Regression, linear SVM) are evaluated on
    six feature sets, each with LDA, PCA and no feature reduction, by calling
    k_fold_cross_validation for every combination. That helper also plots an
    average confusion matrix per combination.

    Parameters:
    balanced_dataFrame (pd.DataFrame): Must contain the columns Mfcc_1..Mfcc_13,
        Spectral_Contrast_1..Spectral_Contrast_7, Zero_Crossing_Rate,
        RMS_Energy, Spectral_Centroid, Spectral_Bandwidth and the target
        column 'class'. The frame is not modified.
    k (int): Number of cross-validation folds (default is 4).

    Returns:
    pd.DataFrame: One row per combination with the columns 'Model',
        'Feature Set', 'Feature Reduction', 'Mean Accuracy' and 'Std Accuracy'
        (accuracies in percent).
    """
    # Define the feature column groups
    mfcc_features = [f'Mfcc_{i+1}' for i in range(13)]
    spectral_contrast_features = [f'Spectral_Contrast_{i+1}' for i in range(7)]
    combined_features = mfcc_features + spectral_contrast_features
    time_features = ['Zero_Crossing_Rate', 'RMS_Energy']
    Spec_cent_Bw = ['Spectral_Centroid', 'Spectral_Bandwidth']
    all_features = combined_features + time_features + Spec_cent_Bw

    # Parse RMS_Energy (possibly stored as bracketed text) into floats on a new
    # frame, so the caller's DataFrame is left untouched.
    frame = balanced_dataFrame.assign(
        RMS_Energy=(
            balanced_dataFrame['RMS_Energy']
            .astype(str)
            .str.replace('[', '', regex=False)
            .str.replace(']', '', regex=False)
            .astype(float)
        )
    )

    # Define the target column
    target_column = 'class'

    # Define the models
    models = {
        'KNN': KNeighborsClassifier(n_neighbors=5),
        'Logistic Regression': LogisticRegression(max_iter=200, random_state=42),
        'SVM': SVC(kernel='linear', random_state=42)
    }

    # Map display names to the feature column groups
    feature_sets = {
        'time_domain_features': time_features,
        'MFCC': mfcc_features,
        'Spectral Contrast': spectral_contrast_features,
        'MFCC&Sp_Contrast': combined_features,
        'Spec_Cent&BW': Spec_cent_Bw,
        'all_features': all_features
    }

    # NOTE(review): PCA() is built with default arguments, i.e. without an
    # explicit n_components -- confirm that keeping the default is intended.
    feature_reductions = {
        'LDA': LinearDiscriminantAnalysis(),
        'PCA': PCA(),
        'None': None
    }

    # Run cross-validation and store results
    results = []
    for model_name, model in models.items():
        for feature_set_name, feature_columns in feature_sets.items():
            for reduction_name, reduction_func in feature_reductions.items():
                print(f'\nModel({model_name}) using feature({feature_set_name}) and feature reduction({reduction_name})\n')
                mean_accuracy, std_accuracy = k_fold_cross_validation(frame, feature_columns, target_column, model, reduction_func, k=k)
                results.append({
                    'Model': model_name,
                    'Feature Set': feature_set_name,
                    'Feature Reduction': reduction_name,
                    'Mean Accuracy': mean_accuracy,
                    'Std Accuracy': std_accuracy
                })

    results_df = pd.DataFrame(results)
    return results_df

# Random Student Set1
Select 6 random students, ensure each student has an equal number of samples, and visualize some features using pairplots.

**Random Student Selection**: We begin by randomly selecting 6 students from the dataset. This ensures that our analysis is not biased towards any specific subset of students.

In [6]:
import random

# Seed Python's global RNG so the draw below is repeatable.
random.seed(42)

# Distinct student IDs present in the dataset
student_ids = data['student_id'].unique()

# Draw six distinct students without replacement
selected_students = random.sample(list(student_ids), k=6)
print(f"Selected Students: {selected_students}")

**Equal Number of Samples**: For each of the selected students, we ensure that an equal number of samples are used for further analysis. This step is crucial to maintain consistency and fairness in our analysis.

In [7]:
from sklearn.utils import shuffle

# Keep only the rows that belong to the selected students.
filtered_data = data[data['student_id'].isin(selected_students)]

# Ensure each student has an equal number of samples: down-sample every student
# to the size of the smallest group. The groups are iterated and concatenated
# explicitly instead of using groupby.apply, whose handling of the grouping
# column is deprecated since pandas 2.2; this keeps 'student_id' in the result
# and selects the same rows (same per-group seed, same group order).
min_samples = filtered_data['student_id'].value_counts().min()
balanced_data = pd.concat(
    [
        student_rows.sample(n=min_samples, random_state=42)
        for _, student_rows in filtered_data.groupby('student_id')
    ],
    ignore_index=True,
)
# Shuffle so rows are no longer ordered by student, then renumber the index.
balanced_data = shuffle(balanced_data, random_state=42).reset_index(drop=True)

print("Number of samples for each student in the balanced DataFrame:")
print(balanced_data['student_id'].value_counts())

### **Pairplot Visualization**:
Pairplots are used to visualize the relationships between different features for the selected students. This provides an intuitive and visual understanding of the data distribution and correlations.

**MFCC features**

In [8]:
import seaborn as sns


# Names of the 13 MFCC columns
mfcc_features = [f'Mfcc_{i+1}' for i in range(13)]

# One tab10 colour per distinct student in the balanced frame
n_students = balanced_data['student_id'].nunique()
palette = sns.color_palette("tab10", n_colors=n_students)

# Pairwise scatter matrix of the MFCC columns, coloured by student
sns.pairplot(data=balanced_data, vars=mfcc_features, hue='student_id', palette=palette)

# Display the plot
plt.show()


**Spectral_Contrast features**

In [9]:
import seaborn as sns

# Names of the 7 spectral-contrast columns
spectral_contrast_features = [f'Spectral_Contrast_{i+1}' for i in range(7)]

# One tab10 colour per distinct student in the balanced frame
n_students = balanced_data['student_id'].nunique()
palette = sns.color_palette("tab10", n_colors=n_students)

# Pairwise scatter matrix of the spectral-contrast columns, coloured by student
sns.pairplot(data=balanced_data, vars=spectral_contrast_features, hue='student_id', palette=palette)

# Display the plot
plt.show()

**Time domain features**: Zero Crossing Rate and RMS Energy

In [10]:
import seaborn as sns

# The two time-domain columns
time_features = ['Zero_Crossing_Rate', 'RMS_Energy']

# One tab10 colour per distinct student in the balanced frame
n_students = balanced_data['student_id'].nunique()
palette = sns.color_palette("tab10", n_colors=n_students)

# Pairwise scatter matrix of the time-domain columns, coloured by student
sns.pairplot(data=balanced_data, vars=time_features, hue='student_id', palette=palette)

# Display the plot
plt.show()

## **Classification and Analysis for Student Set1**

**Mapping Student IDs to Class Labels**: We create a mapping from student IDs to class labels. Each selected student is assigned a unique class label, starting from 0 up to the number of selected students minus one.

In [11]:
# Give each selected student a label equal to its position in
# selected_students (0, 1, 2, ...).
class_mapping = dict(zip(selected_students, range(len(selected_students))))
# Translate every row's student_id into that integer label.
balanced_data['class'] = balanced_data['student_id'].map(class_mapping)

print("Balanced DataFrame with Class Column:")
# Trailing expression: the notebook renders the last five rows.
balanced_data.tail(n=5)

### Classification Process and Average Confusion Matrix

#### **Calling the Classification Function**
The *Run_Classification* function is called with the balanced dataset (balanced_data) as its input. This function performs k-fold cross-validation for various combinations of classification models, feature sets, and feature reduction techniques.
The *Result_DF* DataFrame now contains detailed performance metrics for various combinations of models, feature sets, and feature reduction techniques. This structured summary facilitates easy comparison and selection of the most effective approaches for the classification task.

In [12]:
# Cross-validate every model / feature-set / feature-reduction combination on
# the balanced frame; the result holds one row per combination.
Result_DF = Run_Classification(balanced_data)

#### **Sorting and Displaying Top Results**
By sorting the results DataFrame by mean accuracy and displaying the top rows, we efficiently identify the best-performing combinations of models, feature sets, and feature reduction techniques.

In [13]:
# Rank all combinations from highest to lowest mean accuracy and show the top five.
df_sorted = Result_DF.sort_values('Mean Accuracy', ascending=False)
df_sorted.head(n=5)

#### **Plotting Results**
The line Plot_Results_Barchart(Result_DF) calls the Plot_Results_Barchart function and passes the Result_DF DataFrame as an argument. This function generates and displays bar charts that visually compare the performance of various classification models and feature reduction techniques based on the cross-validation results stored in Result_DF.

In [14]:
# One bar chart per feature-reduction method, comparing mean accuracy across
# models and feature sets.
Plot_Results_Barchart(Result_DF)

# Random Student Set2

In [15]:
import random

# Seed Python's global RNG so the draw below is repeatable.
random.seed(8)

# Distinct student IDs present in the dataset
student_ids = data['student_id'].unique()

# Draw six distinct students without replacement
selected_students = random.sample(list(student_ids), k=6)
print(f"Selected Students: {selected_students}")

In [16]:
from sklearn.utils import shuffle

# Keep only the rows that belong to the selected students.
filtered_data = data[data['student_id'].isin(selected_students)]

# Ensure each student has an equal number of samples: down-sample every student
# to the size of the smallest group. The groups are iterated and concatenated
# explicitly instead of using groupby.apply, whose handling of the grouping
# column is deprecated since pandas 2.2; this keeps 'student_id' in the result
# and selects the same rows (same per-group seed, same group order).
min_samples = filtered_data['student_id'].value_counts().min()
balanced_data = pd.concat(
    [
        student_rows.sample(n=min_samples, random_state=42)
        for _, student_rows in filtered_data.groupby('student_id')
    ],
    ignore_index=True,
)
# Shuffle so rows are no longer ordered by student, then renumber the index.
balanced_data = shuffle(balanced_data, random_state=42).reset_index(drop=True)

print("Number of samples for each student in the balanced DataFrame:")
print(balanced_data['student_id'].value_counts())

## Classification and Analysis for Student Set2

In [17]:
# Give each selected student a label equal to its position in
# selected_students (0, 1, 2, ...).
class_mapping = dict(zip(selected_students, range(len(selected_students))))
# Translate every row's student_id into that integer label.
balanced_data['class'] = balanced_data['student_id'].map(class_mapping)

print("Balanced DataFrame with Class Column:")
# Trailing expression: the notebook renders the last five rows.
balanced_data.tail(n=5)

### Classification Process and Average Confusion Matrix

#### **Calling the Classification Function**

In [18]:
# Cross-validate every model / feature-set / feature-reduction combination on
# the balanced frame; the result holds one row per combination.
Result_DF = Run_Classification(balanced_data)

#### **Sorting and displaying Top Results**

In [19]:
# Rank all combinations from highest to lowest mean accuracy and show the top five.
df_sorted = Result_DF.sort_values('Mean Accuracy', ascending=False)
df_sorted.head(n=5)

#### **Plotting Results**

In [20]:
# One bar chart per feature-reduction method, comparing mean accuracy across
# models and feature sets.
Plot_Results_Barchart(Result_DF)

# Random Student Set3

In [21]:
import random

# Seed Python's global RNG so the draw below is repeatable.
random.seed(1403)

# Distinct student IDs present in the dataset
student_ids = data['student_id'].unique()

# Draw six distinct students without replacement
selected_students = random.sample(list(student_ids), k=6)
print(f"Selected Students: {selected_students}")

In [22]:
from sklearn.utils import shuffle

# Keep only the rows that belong to the selected students.
filtered_data = data[data['student_id'].isin(selected_students)]

# Ensure each student has an equal number of samples: down-sample every student
# to the size of the smallest group. The groups are iterated and concatenated
# explicitly instead of using groupby.apply, whose handling of the grouping
# column is deprecated since pandas 2.2; this keeps 'student_id' in the result
# and selects the same rows (same per-group seed, same group order).
min_samples = filtered_data['student_id'].value_counts().min()
balanced_data = pd.concat(
    [
        student_rows.sample(n=min_samples, random_state=42)
        for _, student_rows in filtered_data.groupby('student_id')
    ],
    ignore_index=True,
)
# Shuffle so rows are no longer ordered by student, then renumber the index.
balanced_data = shuffle(balanced_data, random_state=42).reset_index(drop=True)

print("Number of samples for each student in the balanced DataFrame:")
print(balanced_data['student_id'].value_counts())

## Classification and Analysis for Student Set3

In [23]:
# Give each selected student a label equal to its position in
# selected_students (0, 1, 2, ...).
class_mapping = dict(zip(selected_students, range(len(selected_students))))
# Translate every row's student_id into that integer label.
balanced_data['class'] = balanced_data['student_id'].map(class_mapping)

print("Balanced DataFrame with Class Column:")
# Trailing expression: the notebook renders the last five rows.
balanced_data.tail(n=5)

### Classification Process and Average Confusion Matrix

#### **Calling the Classification Function**

In [24]:
# Cross-validate every model / feature-set / feature-reduction combination on
# the balanced frame; the result holds one row per combination.
Result_DF = Run_Classification(balanced_data)

#### **Sorting and displaying Top Results**

In [25]:
# Rank all combinations from highest to lowest mean accuracy and show the top five.
df_sorted = Result_DF.sort_values('Mean Accuracy', ascending=False)
df_sorted.head(n=5)

#### **Plotting Results**

In [26]:
# One bar chart per feature-reduction method, comparing mean accuracy across
# models and feature sets.
Plot_Results_Barchart(Result_DF)