In [14]:
# Mount Google Drive at /content/drive; the path constants defined later in this
# notebook all point below this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [15]:
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer, RobertaForSequenceClassification, RobertaTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, Subset
from transformers import AdamW
from sklearn.model_selection import KFold
import pandas as pd

In [16]:
# Root of the project folder on the mounted Google Drive; the other two
# locations are derived from it.
CORE_PATH = '/content/drive/MyDrive/core/'
# Directory containing one sub-folder of fine-tuned checkpoints per model family.
MODEL_PATH = f'{CORE_PATH}models/'
# CSV file with the texts and their ground-truth labels.
DATA_PATH = f'{CORE_PATH}shuffled_data.csv'

In [None]:
def _load_checkpoints(model_cls, tokenizer_cls, family, stem, count=5):
    """Load ``count`` saved model/tokenizer pairs of one model family.

    Checkpoint ``idx`` is read from ``MODEL_PATH + family + '/' + stem + '_' + idx``.
    Returns a list of ``{'model': ..., 'tokenizer': ...}`` dicts, in index order.
    """
    loaded = []
    for idx in range(count):
        checkpoint_dir = MODEL_PATH + family + '/' + f'{stem}_{idx}'
        loaded.append({
            'model': model_cls.from_pretrained(checkpoint_dir),
            'tokenizer': tokenizer_cls.from_pretrained(checkpoint_dir),
        })
    return loaded


roberta_models = _load_checkpoints(
    RobertaForSequenceClassification, RobertaTokenizer, 'roberta', 'roberta-base')

bert_models = _load_checkpoints(
    BertForSequenceClassification, BertTokenizer, 'bert', 'bert-base-cased')

# NOTE(review): the DistilBERT checkpoints are loaded with the BERT classes, not
# DistilBERT-specific ones - confirm this matches how they were saved.
distil_models = _load_checkpoints(
    BertForSequenceClassification, BertTokenizer, 'distilbert', 'distilbert-base-cased')

# Ensemble k groups the k-th checkpoint of each of the three families.
ensembles = [
    {'roberta': roberta_entry, 'bert': bert_entry, 'distil': distil_entry}
    for roberta_entry, bert_entry, distil_entry
    in zip(roberta_models, bert_models, distil_models)
]

In [None]:
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer, RobertaForSequenceClassification, RobertaTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, Subset
from transformers import AdamW
from sklearn.model_selection import KFold
import pandas as pd
import os
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

def plot_confusion(conf_matrix, labels, model_name):
    """Draw a confusion matrix as an annotated heatmap and save it as a PNG.

    Args:
        conf_matrix: Matrix of integer counts (cells are formatted with ``'d'``).
        labels: Tick labels used for both the x and the y axis.
        model_name: Used as the figure title and as the PNG file name (without
            extension) inside the ``plots`` directory under ``CORE_PATH``.
    """
    title = f"{model_name}"
    plots_dir = os.path.join(CORE_PATH, "plots")
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(plots_dir, exist_ok=True)
    path = os.path.join(plots_dir, f"{model_name}.png")
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.savefig(path)

def extract_logits(model, tokenizer, dataset):
    """Run ``model`` on every text of ``dataset`` and stack the resulting logits.

    Args:
        model: Callable model returning an object with a ``logits`` tensor; it is
            switched to eval mode before inference.
        tokenizer: Callable turning one text into a mapping of PyTorch tensors.
        dataset: Iterable of texts; each text is tokenized and scored on its own.

    Returns:
        A NumPy array made by vertically stacking the logits of every text.

    Raises:
        ValueError: If ``dataset`` yields no texts.
    """
    model.eval()
    # Hugging Face models expose ``.device``; when the attribute is missing the
    # tensors are left where the tokenizer created them.
    device = getattr(model, "device", None)
    logits = []
    with torch.no_grad():
        for text in dataset:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
            if device is not None:
                # Inputs must live on the same device as the model weights.
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # .cpu() is required before .numpy() when the model runs on an accelerator.
            logits.append(outputs.logits.detach().cpu().numpy())
    if not logits:
        raise ValueError("extract_logits received an empty dataset; there are no logits to stack")
    return np.vstack(logits)

# Load the corpus: the `text` column holds the inputs, `gt` the ground-truth labels.
df = pd.read_csv(DATA_PATH)
dataset = df['text'].tolist()
labels = df['gt'].tolist()

# Hold out 25% of the samples for evaluation; the fixed seed keeps the split stable.
train_texts, test_texts, train_labels, test_labels = train_test_split(dataset, labels, test_size=0.25, random_state=42)

trained_models = []
individual_accuracies = []
# Raw test-set predictions of every stacker, each computed on the features of the
# ensemble that stacker was trained on (needed for a consistent majority vote).
per_ensemble_predictions = []

for i, ensemble in enumerate(ensembles):
    # Stacking features: the logits of the three fine-tuned models, side by side.
    roberta_train_logits = extract_logits(ensemble['roberta']['model'], ensemble['roberta']['tokenizer'], train_texts)
    bert_train_logits = extract_logits(ensemble['bert']['model'], ensemble['bert']['tokenizer'], train_texts)
    distil_train_logits = extract_logits(ensemble['distil']['model'], ensemble['distil']['tokenizer'], train_texts)

    concatenated_train_logits = np.concatenate((roberta_train_logits, bert_train_logits, distil_train_logits), axis=1)

    bagging_regressor = BaggingRegressor(n_estimators=10, random_state=42)
    bagging_regressor.fit(concatenated_train_logits, train_labels)
    trained_models.append(bagging_regressor)

    roberta_test_logits = extract_logits(ensemble['roberta']['model'], ensemble['roberta']['tokenizer'], test_texts)
    bert_test_logits = extract_logits(ensemble['bert']['model'], ensemble['bert']['tokenizer'], test_texts)
    distil_test_logits = extract_logits(ensemble['distil']['model'], ensemble['distil']['tokenizer'], test_texts)

    concatenated_test_logits = np.concatenate((roberta_test_logits, bert_test_logits, distil_test_logits), axis=1)

    raw_test_predictions = bagging_regressor.predict(concatenated_test_logits)
    per_ensemble_predictions.append(raw_test_predictions)
    # The regressor outputs continuous values; rounding turns them into hard labels.
    test_predictions = np.round(raw_test_predictions)

    individual_accuracy = accuracy_score(test_labels, test_predictions)
    precision = precision_score(test_labels, test_predictions)
    recall = recall_score(test_labels, test_predictions)
    f1 = f1_score(test_labels, test_predictions)
    individual_accuracies.append(individual_accuracy)
    matrix = confusion_matrix(test_labels, test_predictions)
    print(f"Ensemble {i} Bagging Regressor Accuracy: {individual_accuracy}")
    print(f"                               Recall: {recall}")
    print(f"                               f1: {f1}")
    print(f"                               Precision: {precision}")
    plot_confusion(matrix, ['Non-Dementia', 'Dementia'], f"Ensemble_{i}")

# Vote over the predictions collected inside the loop. Re-predicting here with
# `concatenated_test_logits` would feed every stacker the features of the LAST
# ensemble only, which do not match what stackers 0..n-2 were trained on.
all_predictions = np.array(per_ensemble_predictions)
rounded_predictions = np.round(all_predictions).astype(int)
# Column-wise mode: for every test sample take the label most stackers agreed on.
majority_vote_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=rounded_predictions)

majority_vote_accuracy = accuracy_score(test_labels, majority_vote_predictions)
precision = precision_score(test_labels, majority_vote_predictions)
recall = recall_score(test_labels, majority_vote_predictions)
f1 = f1_score(test_labels, majority_vote_predictions)
# The majority-vote accuracy is stored as the final entry, after the per-ensemble ones.
individual_accuracies.append(majority_vote_accuracy)
matrix = confusion_matrix(test_labels, majority_vote_predictions)
print(f"Majority Voting Accuracy: {majority_vote_accuracy}")
print(f"                               Recall: {recall}")
print(f"                               f1: {f1}")
print(f"                               Precision: {precision}")
plot_confusion(matrix, ['Non-Dementia', 'Dementia'], f"Majority_Voting")

In [None]:
import torch
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from transformers import BertForSequenceClassification, BertTokenizer, RobertaForSequenceClassification, RobertaTokenizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from torch.utils.data import DataLoader, RandomSampler, TensorDataset, Subset
from transformers import AdamW
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
import pandas as pd
import os
from sklearn.ensemble import BaggingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

def plot_confusion(conf_matrix, labels, model_name):
    """Draw a confusion matrix as an annotated heatmap and save it as a PNG.

    Args:
        conf_matrix: Matrix of integer counts (cells are formatted with ``'d'``).
        labels: Tick labels used for both the x and the y axis.
        model_name: Used as the figure title and as the PNG file name (without
            extension) inside the ``plots`` directory under ``CORE_PATH``.
    """
    title = f"{model_name}"
    plots_dir = os.path.join(CORE_PATH, "plots")
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(plots_dir, exist_ok=True)
    path = os.path.join(plots_dir, f"{model_name}.png")
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=labels, yticklabels=labels)
    plt.title(title)
    plt.savefig(path)

def extract_logits(model, tokenizer, dataset):
    """Run ``model`` on every text of ``dataset`` and stack the resulting logits.

    Args:
        model: Callable model returning an object with a ``logits`` tensor; it is
            switched to eval mode before inference.
        tokenizer: Callable turning one text into a mapping of PyTorch tensors.
        dataset: Iterable of texts; each text is tokenized and scored on its own.

    Returns:
        A NumPy array made by vertically stacking the logits of every text.

    Raises:
        ValueError: If ``dataset`` yields no texts.
    """
    model.eval()
    # Hugging Face models expose ``.device``; when the attribute is missing the
    # tensors are left where the tokenizer created them.
    device = getattr(model, "device", None)
    logits = []
    with torch.no_grad():
        for text in dataset:
            inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
            if device is not None:
                # Inputs must live on the same device as the model weights.
                inputs = {name: tensor.to(device) for name, tensor in inputs.items()}
            outputs = model(**inputs)
            # .cpu() is required before .numpy() when the model runs on an accelerator.
            logits.append(outputs.logits.detach().cpu().numpy())
    if not logits:
        raise ValueError("extract_logits received an empty dataset; there are no logits to stack")
    return np.vstack(logits)

def train_and_evaluate_model(classifier, train_features, train_labels, test_features, test_labels, model_name):
    """Fit an estimator, report its test metrics and save its confusion matrix.

    The estimator is fitted in place on the training split. Its rounded test
    predictions are scored with accuracy, precision, recall and F1, the scores
    are printed, and the confusion matrix is handed to ``plot_confusion`` under
    ``model_name``. Nothing is returned.
    """
    classifier.fit(train_features, train_labels)
    # Rounding maps continuous regressor outputs onto hard labels.
    predicted = np.round(classifier.predict(test_features))

    accuracy, precision, recall, f1 = (
        metric(test_labels, predicted)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )

    report_lines = (
        f"{model_name} Accuracy: {accuracy}",
        f"             Recall: {recall}",
        f"             F1: {f1}",
        f"             Precision: {precision}",
    )
    for line in report_lines:
        print(line)

    plot_confusion(
        confusion_matrix(test_labels, predicted),
        ['Non-Dementia', 'Dementia'],
        model_name,
    )

# Load the corpus: the `text` column holds the inputs, `gt` the ground-truth labels.
df = pd.read_csv(DATA_PATH)
dataset = df['text'].tolist()
labels = df['gt'].tolist()

# Hold out 25% of the samples for evaluation; the fixed seed keeps the split stable.
train_texts, test_texts, train_labels, test_labels = train_test_split(dataset, labels, test_size=0.25, random_state=42)

trained_models = []
individual_accuracies = []

# Candidate stacking estimators, each paired with the hyper-parameter grid to search.
classifiers = {
    'BaggingRegressor': (BaggingRegressor(random_state=42),
                         {'n_estimators': [10, 50, 100, 200],
                          'max_samples': [0.5, 0.7, 1.0],
                          'max_features': [0.5, 0.7, 1.0]}),

    'RandomForestClassifier': (RandomForestClassifier(random_state=42),
                               {'n_estimators': [50, 100, 200, 500],
                                'max_depth': [None, 10, 20, 30, 50],
                                'min_samples_split': [2, 5, 10],
                                'min_samples_leaf': [1, 2, 4]}),

    'GradientBoostingClassifier': (GradientBoostingClassifier(random_state=42),
                                   {'n_estimators': [50, 100, 200, 500],
                                    'learning_rate': [0.01, 0.05, 0.1, 0.2],
                                    'max_depth': [3, 5, 7, 10],
                                    'subsample': [0.7, 0.8, 0.9, 1.0]}),

    'SVC': (SVC(random_state=42, probability=True),
            {'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
             'C': [0.1, 1, 10, 100],
             'gamma': ['scale', 'auto']}),

    'KNeighborsClassifier': (KNeighborsClassifier(),
                             {'n_neighbors': [3, 5, 7, 9],
                              'weights': ['uniform', 'distance'],
                              'metric': ['euclidean', 'manhattan', 'minkowski']}),

    # NOTE(review): several penalty/solver pairs in this grid are not valid
    # (e.g. 'l1' with 'lbfgs', 'elasticnet' without l1_ratio, and the string
    # 'none' on recent scikit-learn). Those candidates are expected to fail to
    # fit and be scored as NaN rather than abort the search - confirm against
    # the installed scikit-learn version and prune the grid accordingly.
    'LogisticRegression': (LogisticRegression(max_iter=1000, random_state=42),
                           {'C': [0.01, 0.1, 1, 10, 100],
                            'penalty': ['l1', 'l2', 'elasticnet', 'none'],
                            'solver': ['lbfgs', 'saga']}),

    'DecisionTreeClassifier': (DecisionTreeClassifier(random_state=42),
                               {'max_depth': [None, 10, 20, 30, 50],
                                'min_samples_split': [2, 5, 10],
                                'min_samples_leaf': [1, 2, 4],
                                'criterion': ['gini', 'entropy']})
}


def _rounded_accuracy(estimator, features, targets):
    """Accuracy of ``estimator`` after rounding its predictions to hard labels.

    The plain 'accuracy' scorer cannot compare continuous regressor outputs with
    class labels; rounding first lets the BaggingRegressor be ranked like the
    classifiers (whose label predictions are unaffected by the rounding).
    """
    return accuracy_score(targets, np.round(estimator.predict(features)))


# Train AND test features must come from the same fine-tuned models, otherwise
# the stackers are evaluated on a feature space they were never fitted on.
base_ensemble = ensembles[0]

roberta_train_logits = extract_logits(base_ensemble['roberta']['model'], base_ensemble['roberta']['tokenizer'], train_texts)
bert_train_logits = extract_logits(base_ensemble['bert']['model'], base_ensemble['bert']['tokenizer'], train_texts)
distil_train_logits = extract_logits(base_ensemble['distil']['model'], base_ensemble['distil']['tokenizer'], train_texts)

concatenated_train_logits = np.concatenate((roberta_train_logits, bert_train_logits, distil_train_logits), axis=1)

roberta_test_logits = extract_logits(base_ensemble['roberta']['model'], base_ensemble['roberta']['tokenizer'], test_texts)
bert_test_logits = extract_logits(base_ensemble['bert']['model'], base_ensemble['bert']['tokenizer'], test_texts)
distil_test_logits = extract_logits(base_ensemble['distil']['model'], base_ensemble['distil']['tokenizer'], test_texts)

concatenated_test_logits = np.concatenate((roberta_test_logits, bert_test_logits, distil_test_logits), axis=1)

for i, (classifier_name, (classifier, param_grid)) in enumerate(classifiers.items()):
    grid_search = GridSearchCV(classifier, param_grid, cv=5, scoring=_rounded_accuracy, n_jobs=-1)
    grid_search.fit(concatenated_train_logits, train_labels)
    best_classifier = grid_search.best_estimator_

    # `i` is the position of the estimator in `classifiers`, not an ensemble index.
    model_name = f"Ensemble_{i}_{classifier_name}"
    train_and_evaluate_model(best_classifier, concatenated_train_logits, train_labels, concatenated_test_logits, test_labels, model_name)
    print(f"Best parameters for {classifier_name}: {grid_search.best_params_}")
    trained_models.append(best_classifier)

all_predictions = np.array([classifier.predict(concatenated_test_logits) for classifier in trained_models])
rounded_predictions = np.round(all_predictions).astype(int)

# Column-wise mode: for every test sample take the label most estimators agreed on.
majority_vote_predictions = np.apply_along_axis(lambda x: np.bincount(x).argmax() if len(x) > 0 else 0, axis=0, arr=rounded_predictions)

majority_vote_accuracy = accuracy_score(test_labels, majority_vote_predictions)
precision = precision_score(test_labels, majority_vote_predictions)
recall = recall_score(test_labels, majority_vote_predictions)
f1 = f1_score(test_labels, majority_vote_predictions)

print(f"Majority Voting Accuracy: {majority_vote_accuracy}")
print(f"             Recall: {recall}")
print(f"             F1: {f1}")
print(f"             Precision: {precision}")

matrix = confusion_matrix(test_labels, majority_vote_predictions)
plot_confusion(matrix, ['Non-Dementia', 'Dementia'], "Majority_Voting")


In [None]:
!pip install joblib

In [21]:
import joblib

In [26]:
# Persist every fitted estimator as "<position in trained_models>.joblib"
# inside the models directory.
for index, fitted_model in enumerate(trained_models):
    joblib.dump(fitted_model, f"{MODEL_PATH}{index}.joblib")