In [None]:
# Drive connection: install google-drive-ocamlfuse and mount Google Drive
# at "/content/gdrive/My Drive".
# Register the PPA (the piped newline answers the confirmation prompt),
# refresh the package index and install google-drive-ocamlfuse.
!sudo echo -ne '\n' | sudo add-apt-repository ppa:alessandro-strada/ppa >/dev/null 2>&1 # note: >/dev/null 2>&1 is used to supress printing
!sudo apt update >/dev/null 2>&1
!sudo apt install google-drive-ocamlfuse >/dev/null 2>&1
# First invocation, without a mount point.
# NOTE(review): the recorded cell output ends with "Cannot retrieve auth tokens"
# after xdg-open found no browser (w3m included) -- presumably from this call,
# which runs before w3m is installed on the next line; confirm the intended order.
!google-drive-ocamlfuse
!sudo apt-get install w3m >/dev/null 2>&1 # to act as web browser
!xdg-settings set default-web-browser w3m.desktop >/dev/null 2>&1 # to set default browser
# Create the mount point under /content/gdrive and mount the drive on it.
# NOTE(review): plain `mkdir` (no -p) reports an error if the directory already
# exists, e.g. when this cell is re-run in the same session.
%cd /content
!mkdir gdrive
%cd gdrive
!mkdir "My Drive"
!google-drive-ocamlfuse "/content/gdrive/My Drive"

/usr/bin/xdg-open: 869: www-browser: not found
/usr/bin/xdg-open: 869: links2: not found
/usr/bin/xdg-open: 869: elinks: not found
/usr/bin/xdg-open: 869: links: not found
/usr/bin/xdg-open: 869: lynx: not found
/usr/bin/xdg-open: 869: w3m: not found
xdg-open: no method available for opening 'https://accounts.google.com/o/oauth2/auth?client_id=564921029129.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fgd-ocaml-auth.appspot.com%2Foauth2callback&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force&state=JZQmjTF7B5aIGAq6imzcZ5KOwc1lTwX-e27y-fSOms4'
/bin/sh: 1: firefox: not found
/bin/sh: 1: google-chrome: not found
/bin/sh: 1: chromium-browser: not found
/bin/sh: 1: open: not found
Cannot retrieve auth tokens.
Failure("Error opening URL:https://accounts.google.com/o/oauth2/auth?client_id=564921029129.apps.googleusercontent.com&redirect_uri=https%3A%2F%2Fgd-ocaml-auth.appspot.com%2Foauth2callback&scope=https%3A%2F%2Fwww.

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import f1_score, make_scorer, balanced_accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import pickle
#ignore the warnings
import warnings
warnings.filterwarnings('ignore')

In [None]:


# Output folder where the pickled meta-models are written
DIR = "/content/gdrive/My Drive/Meta_Asr/Models/"

# Registry of the candidate statistical ML classifiers, keyed by display name.
# Each estimator is created once here with its default hyper-parameters.
Statistical_ML_Models = dict(
    DecisionTree=DecisionTreeClassifier(),
    RandomForest=RandomForestClassifier(),
    Knn=KNeighborsClassifier(),
)

# Candidate pre-trained models: integer class id -> model name
CANDIDATE_PRE_TRAINED_MODELS = dict(enumerate((
    "QuartZnet_Model",
    "Wav2Vec50_Model",
    "Wav2Vec960_Model",
    "speech2Text_Model",
)))
# Config setup
def setup(filename):
  try:
    os.makedirs(filename)
  except OSError:
    pass


# Load data
def load_data(path):
  """Read the CSV file located at ``path`` and return it as a DataFrame."""
  return pd.read_csv(path)


# Data preparation
def data_preparation(df):
  """Encode the target, split into train/test and oversample the train split.

  The ``cluster`` column is dropped, ``selection_model_y`` is encoded into an
  integer ``Target`` column using the ids of ``CANDIDATE_PRE_TRAINED_MODELS``,
  and the remaining columns (everything except ``Audio``, ``Target`` and
  ``selection_model_y``) are used as features.

  Args:
    df: DataFrame containing at least the columns ``cluster``, ``Audio`` and
      ``selection_model_y`` plus the feature columns. It is not modified.

  Returns:
    Tuple ``(X, y, X_resampled, y_resampled, X_test, y_test)`` where ``X``/``y``
    are the full features/targets, ``X_resampled``/``y_resampled`` are the
    randomly oversampled 80% train split and ``X_test``/``y_test`` are the
    untouched 20% test split.

  Raises:
    ValueError: If ``selection_model_y`` contains a value that is not one of
      the names in ``CANDIDATE_PRE_TRAINED_MODELS`` (missing values included).
  """
  # Drop the unnecessary columns
  df1 = df.drop('cluster', axis=1)
  # Invert the id -> name table so the encoding always agrees with it
  label_to_id = {name: idx for idx, name in CANDIDATE_PRE_TRAINED_MODELS.items()}
  # Vectorised encoding; works for any index, not only the default 0..n-1
  encoded = df1['selection_model_y'].map(label_to_id)
  unmapped = encoded.isna()
  if unmapped.any():
    unknown = df1.loc[unmapped, 'selection_model_y'].unique().tolist()
    raise ValueError(f"Unknown selection_model_y label(s): {unknown}")
  df1['Target'] = encoded.astype(int)
  # Define X and Y of the data
  # Dropping the target and selection_model since we only need the measurements
  X = df1.drop(['Audio', 'Target', 'selection_model_y'], axis=1)
  y = df1.Target
  # Splitting into train and test sets (80% for the train set and 20% for the test set)
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
  # Fix the imbalanced data: oversample the train split only, so the test
  # split keeps its original class distribution
  ros = RandomOverSampler(random_state=42)
  X_resampled, y_resampled = ros.fit_resample(X_train, y_train)
  return X, y, X_resampled, y_resampled, X_test, y_test





def modeling(df, X_resampled, y_resampled, X_test, y_test, model):
  """Fit ``model`` on the resampled train data and report test-set metrics.

  Prints the classification report, the F1-score and the balanced accuracy
  computed on ``X_test``/``y_test``.

  Args:
    df: Original DataFrame; only the number of distinct values in its
      ``selection_model_y`` column is used, to pick the F1 averaging mode.
    X_resampled: Training features.
    y_resampled: Training targets.
    X_test: Test features.
    y_test: Test targets.
    model: Estimator exposing ``fit`` and ``predict``; it is fitted in place.

  Returns:
    The F1-score on the test set ("macro" averaged when ``selection_model_y``
    has more than two distinct values, "binary" otherwise).
  """
  model.fit(X_resampled, y_resampled)
  y_pred = model.predict(X_test)
  # Score the predictions already computed above instead of wrapping f1_score
  # in a scorer, which would run model.predict(X_test) a second time.
  averaging = "macro" if len(df.selection_model_y.unique()) > 2 else "binary"
  F1_score = f1_score(y_test, y_pred, average=averaging)
  balanced_acc = balanced_accuracy_score(y_test, y_pred)
  report = classification_report(y_test, y_pred)
  print(f"Classification report:\n{report}")
  print(f"F1-score: {F1_score:.4f}")
  print(f"Balanced accuracy: {balanced_acc:.4f}")
  return F1_score


def select_best_model(df, X_resampled, y_resampled, X_test, y_test):
  """Evaluate every registered model and return the name of the best one.

  Each estimator in ``Statistical_ML_Models`` is trained and scored through
  ``modeling`` (the estimators in the registry are fitted in place). The
  winner is announced once, after all candidates have been evaluated; on a
  tie the candidate registered first wins.

  Args:
    df: Original DataFrame, forwarded to ``modeling``.
    X_resampled: Training features.
    y_resampled: Training targets.
    X_test: Test features.
    y_test: Test targets.

  Returns:
    The key in ``Statistical_ML_Models`` of the model with the highest F1-score.

  Raises:
    ValueError: If ``Statistical_ML_Models`` is empty.
  """
  if not Statistical_ML_Models:
    raise ValueError("Statistical_ML_Models is empty; there is no model to evaluate.")
  list_F1_auc_score = []
  for model_name, model in Statistical_ML_Models.items():
    # Local name deliberately differs from the imported f1_score metric
    model_f1 = modeling(df, X_resampled, y_resampled, X_test, y_test, model)
    list_F1_auc_score.append((model_name, model_f1))
  # Pick and report the winner only once every candidate has a score
  best_model = max(list_F1_auc_score, key=lambda entry: entry[1])[0]
  print(f"The best ML model is: {best_model}")
  return best_model


def save_best_model(best_model, model_name):
  """Pickle the registered estimator ``best_model`` to ``<DIR><model_name>.pkl``.

  Args:
    best_model: Key of the estimator inside ``Statistical_ML_Models``.
    model_name: File name (without extension) of the pickle to write.
  """
  target_path = f"{DIR}{model_name}.pkl"
  with open(target_path, "wb") as sink:
    # Serialise whatever state the registry entry currently holds
    estimator = Statistical_ML_Models[best_model]
    pickle.dump(estimator, sink)


def general_meta_model_selection(data_path, model_name):
  """Run the full pipeline: load, prepare, select the best model and save it.

  Args:
    data_path: Path of the CSV file holding the meta data.
    model_name: File name (without extension) used for the saved pickle.

  Returns:
    The name of the selected model, as returned by ``select_best_model``.
  """
  # Ensure the output directory is there before anything is written
  setup(DIR)
  frame = load_data(data_path)
  # The full X / y are returned by data_preparation but not needed here
  _, _, X_balanced, y_balanced, X_holdout, y_holdout = data_preparation(frame)
  winner = select_best_model(frame, X_balanced, y_balanced, X_holdout, y_holdout)
  save_best_model(winner, model_name)
  print("Model is saved.")
  return winner


In [None]:
if __name__ == "__main__":
  # Locations of the raw and the pre-processed meta data
  rawDataPath = "/content/gdrive/My Drive/Meta_Asr/Main_Dataset/Main_RawMetaData.csv"
  path = "/content/gdrive/My Drive/Meta_Asr/Main_Dataset/Main_PreProcessedMetaData.csv"
  # One entry per meta-model to build: (banner, pickle name, data path)
  runs = (
      ("**** The Meta Model for Raw Data****", 'Raw_MetaModel', rawDataPath),
      ("**** The Meta Model for PreProcessed Data****", 'PreProcess_MetaModel', path),
  )
  for run_index, (banner, model_name, data_path) in enumerate(runs):
    # Visual separator between consecutive runs (not before the first one)
    if run_index > 0:
      print("-------------------------------------------------------------------------------------")
    print(banner)
    # MetaLearning selection
    best_model = general_meta_model_selection(data_path, model_name)

**** The Meta Model for Raw Data****
Classification report:
              precision    recall  f1-score   support

           0       0.79      0.65      0.71        17
           1       0.92      0.92      0.92       215
           2       0.76      0.79      0.77        70

    accuracy                           0.87       302
   macro avg       0.82      0.78      0.80       302
weighted avg       0.87      0.87      0.87       302

F1-score: 0.8010
Balanced accuracy: 0.7846
The best ML model is: DecisionTree
Classification report:
              precision    recall  f1-score   support

           0       0.92      0.71      0.80        17
           1       0.93      0.97      0.95       215
           2       0.88      0.81      0.84        70

    accuracy                           0.92       302
   macro avg       0.91      0.83      0.86       302
weighted avg       0.92      0.92      0.92       302

F1-score: 0.8640
Balanced accuracy: 0.8292
The best ML model is: RandomForest