In [None]:
from google.colab import drive
import sys
import os
import random
import numpy as np
import cv2

# Mount Google Drive
drive.mount('/content/drive/')

wd = '/content/drive/My Drive/Colab Notebooks/Capstone/'

Mounted at /content/drive/


In [None]:
!pip install datasets
!pip install transformers
!pip install sentencepiece

Collecting datasets
  Downloading datasets-2.0.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 7.3 MB/s 
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 3.5 MB/s 
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
[K     |████████████████████████████████| 212 kB 43.2 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 55.6 MB/s 
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.05.0
  Downloading fsspec-2022.3.0-py3-none-any.whl (136 kB)
[K     |████████████████████████████████| 136 kB 56.7 MB/s 
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25

In [None]:
#Non-Specific Imports
import os
import re
import pandas as pd
import numpy as np
import time
import concurrent.futures as cf
from tqdm.notebook import tqdm
import math
import csv



#NLP Imports
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


#Modeling Imports
import torch
from sklearn.model_selection import train_test_split
from datasets import load_dataset
from datasets import load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW



In [None]:
torch.cuda.is_available()

True

# Electra Predictions

In [None]:
#Load Electra - Model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")
model = AutoModelForSequenceClassification.from_pretrained(wd+"/Electra_epoch_2_reordered_pandemic_features_model_save")

#Load Data
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'test': wd+'New_Samples_pandemic_reordered_3000_test.csv'})

###  Set Seeds  ###
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


###  Tokenize  ###
def tokenize_function(example):
    return tokenizer(example["Review_Body"],truncation=True)


#Prepare the data
tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#Remove raw text - already tokenized and masked
tokenized_datasets = tokenized_datasets.remove_columns(["Review_Body"])

#Note: Removing all non-text columns?
tokenized_datasets = tokenized_datasets.remove_columns(['Unnamed: 0'])

#Rename and reformat columns
tokenized_datasets = tokenized_datasets.rename_column("Review_rating", "labels")
tokenized_datasets.set_format("torch", columns=tokenized_datasets["test"].column_names)


###  Dataloaders  ###
test_dataloader = DataLoader(tokenized_datasets["test"],
                              batch_size=12, collate_fn=data_collator)



for batch in test_dataloader:
    break


device = torch.device("cuda")
model.to(device)
print(device)



###  Set-up  ###
num_training_steps = len(test_dataloader)
progress_bar = tqdm(range(num_training_steps))
pred_test = []

### Test Evaluation  ###
## For use in reporting
metric = load_metric("accuracy")
#model.eval()
for batch in test_dataloader:
    progress_bar.update(1)
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    #metric.add_batch(predictions=predictions, references=batch["labels"])
    #print(predictions.cpu().numpy())

    pred_test.append(predictions.cpu().numpy())

with open(wd+'Electra_test_pandemic_predictions.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(pred_test) 


print('\nTesting Accuracy:')
#print(metric.compute())

Downloading:   0%|          | 0.00/27.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/666 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Using custom data configuration default-4cbc911317aa8208


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4cbc911317aa8208/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4cbc911317aa8208/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

cuda


  0%|          | 0/2441 [00:00<?, ?it/s]

Downloading builder script:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

100%|██████████| 2441/2441 [04:15<00:00,  8.89it/s]


Testing Accuracy:


# Ernie Predictions

In [None]:
#Load Ernie - Model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-en")
model = AutoModelForSequenceClassification.from_pretrained(wd+"/BIG_ERNIE_epoch_2_reordered_pandemic_features_model_save_ERNIE")

#Load Data
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'test': wd+'New_Samples_pandemic_reordered_3000_test.csv'})

###  Set Seeds  ###
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


###  Tokenize  ###
def tokenize_function(example):
    return tokenizer(example["Review_Body"],truncation=True,max_length=512, padding=True, )


#Prepare the data
tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#Remove raw text - already tokenized and masked
tokenized_datasets = tokenized_datasets.remove_columns(["Review_Body"])

#Note: Removing all non-text columns?
tokenized_datasets = tokenized_datasets.remove_columns(['Unnamed: 0'])

#Rename and reformat columns
tokenized_datasets = tokenized_datasets.rename_column("Review_rating", "labels")
tokenized_datasets.set_format("torch", columns=tokenized_datasets["test"].column_names)


###  Dataloaders  ###
test_dataloader = DataLoader(tokenized_datasets["test"],
                              batch_size=12, collate_fn=data_collator)


for batch in test_dataloader:
    break


device = torch.device("cuda")
model.to(device)
print(device)


###  Set-up  ###
num_training_steps = len(test_dataloader)
progress_bar = tqdm(range(num_training_steps))
pred_test = []

### Test Evaluation  ###
## For use in reporting
#metric = load_metric("accuracy")
#model.eval()
for batch in test_dataloader:
    progress_bar.update(1)
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    #metric.add_batch(predictions=predictions, references=batch["labels"])
    #print(predictions.cpu().numpy())

    pred_test.append(predictions.cpu().numpy())

with open(wd+'Ernie_test_pandemic_predictions.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(pred_test) 


print('\nTesting Accuracy:')
#print(metric.compute())

print('\n\n\n   Ernie Test Complete\n-----------\n\n')

Using custom data configuration default-4cbc911317aa8208
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-4cbc911317aa8208/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/30 [00:00<?, ?ba/s]

cuda


  0%|          | 0/2441 [00:00<?, ?it/s]


Testing Accuracy:



   Ernie Test Complete
-----------




# DeBERTa Predictions

In [None]:
torch.cuda.is_available()

True

In [None]:
#Load DeBERTa - Model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
model = AutoModelForSequenceClassification.from_pretrained(wd+'/DebertaV3Small_epoch_2_reordered_pandemic_features_model_save')

#Load Data
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'test': wd+'New_Samples_pandemic_reordered_3000_test.csv'})

###  Set Seeds  ###
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)


###  Tokenize  ###
def tokenize_function(example):
    return tokenizer(example["Review_Body"],padding=True )


#Prepare the data
tokenized_datasets = dataset.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

#Remove raw text - already tokenized and masked
tokenized_datasets = tokenized_datasets.remove_columns(["Review_Body"])

#Note: Removing all non-text columns?
tokenized_datasets = tokenized_datasets.remove_columns(['Unnamed: 0'])

#Rename and reformat columns
tokenized_datasets = tokenized_datasets.rename_column("Review_rating", "labels")
tokenized_datasets.set_format("torch", columns=tokenized_datasets["test"].column_names)


###  Dataloaders  ###
test_dataloader = DataLoader(tokenized_datasets["test"],
                              batch_size=6, collate_fn=data_collator)



for batch in test_dataloader:
    break



device = torch.device("cuda")
model.to(device)
print(device)


###  Set-up  ###
num_training_steps = len(test_dataloader)
progress_bar = tqdm(range(num_training_steps))
pred_test = []

### Test Evaluation  ###
## For use in reporting
#metric = load_metric("accuracy")
#model.eval()
for batch in test_dataloader:
    progress_bar.update(1)
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    #metric.add_batch(predictions=predictions, references=batch["labels"])
    #print(predictions.cpu().numpy())

    pred_test.append(predictions.cpu().numpy())

with open(wd+'DeBERTa_test_pandemic_predictions.csv', 'w') as f: 
    write = csv.writer(f) 
    write.writerows(pred_test) 


print('\nTesting Accuracy:')
#print(metric.compute())

print('\n\n\n   DeBERTa Test Complete\n-----------\n\n')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using custom data configuration default-4cbc911317aa8208
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-4cbc911317aa8208/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /root/.cache/huggingface/datasets/csv/default-4cbc911317aa8208/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-44b1cdea051f916b.arrow


cuda


  0%|          | 0/4882 [00:00<?, ?it/s]

 24%|██▍       | 1168/4882 [02:18<07:19,  8.44it/s]



Testing Accuracy:



   DeBERTa Test Complete
-----------




# Ensembling Section
---------------------

In [None]:
def load_predictions(file_in, col_name=''):
  predictions = pd.read_csv(wd+file_in, header=None)

  #Clean the predictions
  pred_melt = predictions.T.melt()
  pred_melt = pred_melt.drop('variable', axis=1)
  pred_melt = pred_melt.rename({'value':'Prediction'+col_name}, axis=1)
  pred_melt['Prediction'+col_name] = pred_melt['Prediction'+col_name]#+1
  pred_melt = pred_melt.dropna(axis=0)

  return pred_melt


electra_pred = load_predictions('Electra_reordered_val_pandemic_predictions.csv', col_name ='_Electra')
ernie_pred = load_predictions('Ernie_reordered_val_pandemic_predictions.csv',col_name ='_Ernie')
deberta_pred = load_predictions('Deberta_reordered_val_pandemic_predictions.csv',col_name ='_DeBERTa')

electra_test = load_predictions('Electra_test_pandemic_predictions.csv', col_name ='_Electra')
ernie_test = load_predictions('Ernie_test_pandemic_predictions.csv',col_name ='_Ernie')
deberta_test = load_predictions('DeBERTa_test_pandemic_predictions.csv',col_name ='_DeBERTa')

In [None]:

#Load the true labels
labels = pd.read_csv(wd+'New_Samples_pandemic_reordered_3000_validation.csv')
labels=labels["Review_rating"]
labels.head(10)


#Load the test labels
labels_test = pd.read_csv(wd+'New_Samples_pandemic_reordered_3000_test.csv')
labels_test=labels_test["Review_rating"]


In [None]:
training_df = pd.concat([electra_pred, ernie_pred, deberta_pred], axis=1)
print(training_df.head(15))
print(training_df.tail(15))
test_df = pd.concat([electra_test, ernie_test, deberta_test], axis=1)
#print(test_df.tail(15))

    Prediction_Electra  Prediction_Ernie  Prediction_DeBERTa
0                  2.0               3.0                 2.0
1                  4.0               4.0                 4.0
2                  0.0               0.0                 0.0
3                  0.0               0.0                 0.0
4                  0.0               0.0                 0.0
5                  3.0               4.0                 3.0
6                  4.0               4.0                 4.0
7                  3.0               3.0                 3.0
8                  3.0               4.0                 3.0
9                  0.0               0.0                 0.0
10                 4.0               4.0                 4.0
11                 4.0               4.0                 4.0
12                 2.0               1.0                 2.0
13                 3.0               3.0                 3.0
14                 2.0               1.0                 2.0
       Prediction_Electr

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import classification_report
from scipy.stats import pearsonr
from scipy.stats import spearmanr

import joblib

In [None]:
import sklearn
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [9]:

model = GradientBoostingClassifier()
distributions = {'min_samples_split':[2,3,4],
                 'min_weight_fraction_leaf':[0,0.05, 0.03],
                 'max_depth':[2,3,4],
                 'learning_rate':[0.1,0.05,0.01,0.001],
                "n_estimators":[100,300,500]
                 }
clf = GridSearchCV(model, distributions, cv=5, scoring='neg_mean_squared_error',)# verbose=10)
search = clf.fit(training_df, labels)


###
#Save the Random Forest
joblib.dump(search, wd+"ReorderedPandemicErnieElectraDeBertaGradBoost_ensemble2.joblib")



#{'cv': 5, 'error_score': nan, 'estimator__ccp_alpha': 0.0, 'estimator__criterion': 'friedman_mse', 'estimator__init': None, 'estimator__learning_rate': 0.1, 
#'estimator__loss': 'deviance', 'estimator__max_depth': 3, 'estimator__max_features': None, 'estimator__max_leaf_nodes': None, 
#'estimator__min_impurity_decrease': 0.0, 'estimator__min_samples_leaf': 1, 'estimator__min_samples_split': 2, 'estimator__min_weight_fraction_leaf': 0.0, 
#'estimator__n_estimators': 100, 'estimator__n_iter_no_change': None, 'estimator__random_state': None, 'estimator__subsample': 1.0, 'estimator__tol': 0.0001,
# 'estimator__validation_fraction': 0.1, 'estimator__verbose': 0, 'estimator__warm_start': False, 'estimator': GradientBoostingClassifier(), 
#'n_jobs': None, 

['/content/drive/My Drive/Colab Notebooks/Capstone/ReorderedPandemicErnieElectraDeBertaGradBoost_ensemble2.joblib']

In [10]:
# Predict the values
y_pred = search.predict(test_df)


#Check Accuracy
ensemble_score = accuracy_score(labels_test, y_pred)
print(ensemble_score)


0.7758126195028681


In [11]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(labels_test, y_pred))

print(mean_squared_error(labels_test, electra_test))
print(mean_squared_error(labels_test, ernie_test))
print(mean_squared_error(labels_test, deberta_test))


0.27349084949467356
0.2820950559956296
0.29049440043703906
0.29343075662387325


In [12]:
print(accuracy_score(labels_test, y_pred))

print(accuracy_score(labels_test, electra_test))
print(accuracy_score(labels_test, ernie_test))
print(accuracy_score(labels_test, deberta_test))

0.7758126195028681
0.7705544933078394
0.7658085222616772
0.7672425566785032


# Using the Ensemble
---------------------------

In [None]:
def model_feature_predict(model_in):
    """
    Run the accuracy test  - how well does the model perfom on the test data
    Prints the accuracy score, also returns pred and true values.

    Parameters
    ----------
    save_name : str
        the name of the save directory for the model - will also load the test data.

    Returns
    -------
    pred
        the predicted values for the test set.
    true
        the true values for the test set.

    """
    all_models = {
        'electra':["google/electra-base-discriminator", wd+"/OopsElectra_epoch_2_reordered_features_model_save"],
        'ernie':["nghuyong/ernie-2.0-en", wd+'/BIG_ERNIE_epoch_2_reordered_features_model_save_ERNIE'],
        'deberta':["microsoft/deberta-v3-small", wd+'/DebertaV3Small_epoch_2_reordered_features_model_save']
    }

    #Load Basic Electra - Model and tokenizer
    checkpoint = all_models[model_in]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint[0])
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint[1])


    #Load Data
    dataset = load_dataset('csv', data_files={'test': wd+'reorder_feature_sentence_matrix.csv'})
    
    
    ###  Tokenize  ###
    if model_in == 'electra':
      def tokenize_function(example):
          return tokenizer(example["Sentence"],truncation=True)
    elif model_in == 'ernie':
      def tokenize_function(example):
          return tokenizer(example["Sentence"],truncation=True,max_length=512, padding=True,)
    elif model_in == 'deberta':
      def tokenize_function(example):
          return tokenizer(example["Sentence"],padding=True,)
    else:
      raise ValueError('The model in is undefined')

    
        #Prepare the data
    #dataset = dataset.rename_column("Unnamed: 0", "labels")
    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    
    #Remove raw text - already tokenized and masked
    tokenized_datasets = tokenized_datasets.remove_columns(["Sentence", "Unnamed: 0"])
    
    #Removing all non-text columns
    tokenized_datasets = tokenized_datasets.remove_columns(['Feature', 'Word', 'Sentiment', 'State', 'Pandemic_Timing'])
    

    
    #Rename and reformat columns
    #tokenized_datasets = tokenized_datasets.rename_column("Review_rating", "labels")
    tokenized_datasets.set_format("torch", columns=tokenized_datasets["test"].column_names)
    tokenized_datasets["test"].column_names
    
    device = torch.device("cuda")
    model.to(device)
    device
    
    ###  Dataloader  ###

    eval_dataloader = DataLoader(tokenized_datasets["test"],
                                 batch_size=8, collate_fn=data_collator)
    
    
    for batch in eval_dataloader:
        break
    
    
    ###  Set-up  ###
    num_training_steps = len(eval_dataloader)
    progress_bar = tqdm(range(num_training_steps))
    
    
    pred = []
    index_val = []
    
    ###  Evaluations  ###
    
    #model.eval()
    for batch in eval_dataloader:
        progress_bar.update(1)
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)
            
    
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        #print(predictions.cpu().numpy())

        pred.append(predictions.cpu().numpy())
        #index_val.append(batch["labels"])
    


    return pred

In [None]:
#electra_pred = model_feature_predict(model_in = 'electra')
#pd.DataFrame(electra_pred).to_csv(wd+'electra_reordered_feature_matrix_pred.csv')

#ernie_pred = model_feature_predict(model_in = 'ernie')
#pd.DataFrame(ernie_pred).to_csv(wd+'ernie_reordered_feature_matrix_pred.csv')

deberta_pred = model_feature_predict(model_in = 'deberta')
pd.DataFrame(deberta_pred).to_csv(wd+'deberta_reordered_feature_matrix_pred.csv')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Using custom data configuration default-0015f5a8b41d6b9e
Reusing dataset csv (/root/.cache/huggingface/datasets/csv/default-0015f5a8b41d6b9e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1926 [00:00<?, ?ba/s]

100%|██████████| 240720/240720 [57:38<00:00, 69.61it/s]


In [None]:
def load_predictions(col_name=''):
  predictions = pd.read_csv(wd+col_name.lower()[1::]+'_reordered_feature_matrix_pred.csv', index_col=0)

  #Clean the predictions
  pred_melt = predictions.T.melt()
  pred_melt = pred_melt.drop('variable', axis=1)
  pred_melt = pred_melt.rename({'value':'Prediction'+col_name}, axis=1)
  pred_melt['Prediction'+col_name] = pred_melt['Prediction'+col_name]#+1

  return pred_melt


electra_tidy = load_predictions(col_name ='_Electra')
ernie_tidy = load_predictions(col_name ='_Ernie')
deberta_tidy = load_predictions(col_name ='_DeBERTa')


all_pred_df = pd.concat([electra_tidy, ernie_tidy, deberta_tidy], axis=1)
print(all_pred_df.head(15))
print(all_pred_df.tail(15))

all_pred_df.to_csv(wd+'reordered_model_predictions_features.csv')

    Prediction_Electra  Prediction_Ernie  Prediction_DeBERTa
0                    4                 3                   3
1                    4                 4                   3
2                    4                 4                   4
3                    3                 4                   3
4                    3                 3                   3
5                    4                 4                   4
6                    4                 4                   3
7                    4                 1                   4
8                    4                 4                   3
9                    4                 3                   3
10                   4                 4                   3
11                   3                 3                   3
12                   4                 4                   4
13                   4                 4                   4
14                   4                 4                   4
         Prediction_Elec

In [None]:
# Load the fit Gradient boost model
import joblib

loaded_gb = joblib.load(wd+'ReorderedErnieElectraDeBertaGradBoost_ensemble.joblib')
features_array = pd.read_csv(wd+'reordered_model_predictions_features.csv', index_col = 0)

features_array.head()


Unnamed: 0,Prediction_Electra,Prediction_Ernie,Prediction_DeBERTa
0,4,3,3
1,4,4,3
2,4,4,4
3,3,4,3
4,3,3,3


In [None]:
emsemble_predictions = loaded_gb.predict(features_array)

In [None]:
def tidy_predictions(pred_in, col_name=''):
 
  #Clean the predictions
  pred_melt = pd.DataFrame(pred_in).T.melt()
  pred_melt = pred_melt.drop('variable', axis=1)
  pred_melt = pred_melt.rename({'value':'Prediction'+col_name}, axis=1)
  pred_melt['Prediction'+col_name] = pred_melt['Prediction'+col_name]#+1

  return pred_melt


tidy_predictions(emsemble_predictions, col_name='_Ensemble').to_csv(wd+'reordered_ensemble_predictions.csv')

# Ensembling Individual Sentences
------------------------


In [None]:
###  Set Seeds  ###
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)




def ensemble_model(sentence_in):
  predictions_collect = []

  print(sentence_in)
  #Load Electra - Model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained("google/electra-base-discriminator")
  model = AutoModelForSequenceClassification.from_pretrained(wd+"/OopsElectra_epoch_2_reordered_features_model_save")

  #Prepare the sentence
  input_ids_1 = torch.tensor(tokenizer.encode(sentence_in, truncation=True)).unsqueeze(0)  # Batch size 1

  #Print the predicted score
  prediction_1 = torch.argmax(model(input_ids_1).logits, dim=-1).numpy()
  predictions_collect.append(prediction_1)
  print(f'Electra: {prediction_1}')


  ## Ernie

  #Load Ernie - Model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained("nghuyong/ernie-2.0-en")
  model = AutoModelForSequenceClassification.from_pretrained(wd+'/BIG_ERNIE_epoch_2_reordered_features_model_save_ERNIE')

  #Prepare the sentence
  input_ids_1 = torch.tensor(tokenizer.encode(sentence_in, truncation=True,max_length=512, padding=True,)).unsqueeze(0)  # Batch size 1

  #Print the predicted score
  prediction_1 = torch.argmax(model(input_ids_1).logits, dim=-1).numpy()
  predictions_collect.append(prediction_1)
  print(f'Ernie: {prediction_1}')



  ## DeBERTa

  #Load DeBERTa - Model and tokenizer
  tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-small")
  model = AutoModelForSequenceClassification.from_pretrained(wd+'/DebertaV3Small_epoch_2_reordered_features_model_save')

  #Prepare the sentence
  input_ids_1 = torch.tensor(tokenizer.encode(sentence_in, padding=True)).unsqueeze(0)  # Batch size 1

  #Print the predicted score
  prediction_1 = torch.argmax(model(input_ids_1).logits, dim=-1).numpy()
  predictions_collect.append(prediction_1)
  print(f'DeBERTa: {prediction_1}')

  predictions_collect = pd.DataFrame(predictions_collect, index = [	"Prediction_Electra",	"Prediction_Ernie",	"Prediction_DeBERTa"]).T




  import joblib

  loaded_gb = joblib.load(wd+'ReorderedErnieElectraDeBertaGradBoost_ensemble.joblib')
  prediction_1 = loaded_gb.predict(predictions_collect)

  predictions_collect['Prediction_Ensemble'] = prediction_1
  print(f'Ensemble: {prediction_1}')
  return(predictions_collect)



In [None]:
sentence_1 = "I wrote my capstone in this hotel. The desk was a little small. "
sentence_2 = "I wrote my capstone in this hotel.  The desk was a little small, but the room was spotless. "
sentence_3 = "I wrote my capstone in this hotel.  The desk was a little small, but the room was spotless. Breakfast was absolutely terrible though! There was no coffee!!"

samples = []
for sent in [sentence_1, sentence_2, sentence_3]:
  samp = ensemble_model(sent)
  samp['Sentence'] = sent
  samples.append(samp)


I wrote my capstone in this hotel. The desk was a little small. 
Electra: [3]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [3]
Ensemble: [3]
I wrote my capstone in this hotel.  The desk was a little small, but the room was spotless. 
Electra: [4]
Ernie: [4]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [3]
Ensemble: [4]
I wrote my capstone in this hotel.  The desk was a little small, but the room was spotless. Breakfast was absolutely terrible though! There was no coffee!!
Electra: [3]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [3]


In [None]:
pd.concat(samples)

Unnamed: 0,Prediction_Electra,Prediction_Ernie,Prediction_DeBERTa,Prediction_Ensemble,Sentence
0,3,3,3,3,I wrote my capstone in this hotel. The desk wa...
0,4,4,3,4,I wrote my capstone in this hotel. The desk w...
0,3,3,2,3,I wrote my capstone in this hotel. The desk w...


In [None]:
sentence_1 = "The lobby in this hotel is dated."
sentence_2 = "The lobby in this hotel is newly renovated."
sentence_3 = "The lobby in this hotel is very clean."
sentence_4 = "The lobby in this hotel is newly painted."
sentence_5 = "The lobby in this hotel has free coffee."

samples2 = []
for sent in [sentence_1, sentence_2, sentence_3, sentence_4, sentence_5]:
  samp = ensemble_model(sent)
  samp['Sentence'] = sent
  samples2.append(samp)

pd.concat(samples2)

The lobby in this hotel is dated.
Electra: [2]
Ernie: [2]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [2]
The lobby in this hotel is newly renovated.
Electra: [3]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [3]
Ensemble: [3]
The lobby in this hotel is very clean.
Electra: [3]
Ernie: [4]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [3]
Ensemble: [3]
The lobby in this hotel is newly painted.
Electra: [2]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [3]
Ensemble: [3]
The lobby in this hotel has free coffee.
Electra: [3]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [3]
Ensemble: [3]


Unnamed: 0,Prediction_Electra,Prediction_Ernie,Prediction_DeBERTa,Prediction_Ensemble,Sentence
0,2,2,2,2,The lobby in this hotel is dated.
0,3,3,3,3,The lobby in this hotel is newly renovated.
0,3,4,3,3,The lobby in this hotel is very clean.
0,2,3,3,3,The lobby in this hotel is newly painted.
0,3,3,3,3,The lobby in this hotel has free coffee.


In [None]:
sentence_1 = "The bed was lumpy."
sentence_2 = "The bed was new."
sentence_3 = "The bed was lumpy, but the pillows were very soft."
sentence_4 = "The bed was lumpy, but the room has free coffee."

samples2 = []
for sent in [sentence_1, sentence_2, sentence_3, sentence_4]:
  samp = ensemble_model(sent)
  samp['Sentence'] = sent
  samples2.append(samp)

pd.concat(samples2)

The bed was lumpy.
Electra: [2]
Ernie: [2]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [2]
The bed was new.
Electra: [3]
Ernie: [4]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [3]
Ensemble: [3]
The bed was lumpy, but the pillows were very soft.
Electra: [2]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [2]
The bed was lumpy, but the room has free coffee.
Electra: [2]
Ernie: [2]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [2]


Unnamed: 0,Prediction_Electra,Prediction_Ernie,Prediction_DeBERTa,Prediction_Ensemble,Sentence
0,2,2,2,2,The bed was lumpy.
0,3,4,3,3,The bed was new.
0,2,3,2,2,"The bed was lumpy, but the pillows were very s..."
0,2,2,2,2,"The bed was lumpy, but the room has free coffee."


In [None]:
sentence_1 = "The Ben was lumpy."
sentence_2 = "The Ben was new."
sentence_3 = "The Ben was lumpy, but the Wailords were very soft."
sentence_4 = "The Ben was lumpy, but the room has free coffee."

samples2 = []
for sent in [sentence_1, sentence_2, sentence_3, sentence_4]:
  samp = ensemble_model(sent)
  samp['Sentence'] = sent
  samples2.append(samp)

pd.concat(samples2)

The Ben was lumpy.
Electra: [2]
Ernie: [2]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [2]
The Ben was new.
Electra: [4]
Ernie: [4]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [4]
Ensemble: [4]
The Ben was lumpy, but the Wailords were very soft.
Electra: [2]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [2]
The Ben was lumpy, but the room has free coffee.
Electra: [3]
Ernie: [3]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


DeBERTa: [2]
Ensemble: [3]


Unnamed: 0,Prediction_Electra,Prediction_Ernie,Prediction_DeBERTa,Prediction_Ensemble,Sentence
0,2,2,2,2,The Ben was lumpy.
0,4,4,4,4,The Ben was new.
0,2,3,2,2,"The Ben was lumpy, but the Wailords were very ..."
0,3,3,2,3,"The Ben was lumpy, but the room has free coffee."
