In [None]:
# Oriol
# Colab-only setup: mount Google Drive (forcing a remount), then copy the zipped
# dataset and the helper module from Drive into the current working directory,
# where later cells open 'spotify_songs.csv.zip' and import helper_functions.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
!cp "/content/drive/My Drive/Colab Notebooks/NLP/data/spotify_songs.csv.zip" .
!cp "/content/drive/My Drive/Colab Notebooks/NLP/helper_functions.py" .

In [None]:
# Install (not import) the transformers and simpletransformers packages.
# NOTE(review): no versions are pinned and transformers is force-upgraded, so each
# fresh runtime may get a different library version.
!pip install --upgrade transformers
!pip install simpletransformers

In [None]:
# Install the Weights & Biases client, used below to define and run the
# hyperparameter sweep (no version is pinned).
!pip install wandb

In [None]:
# Import Libraries
import logging
import numpy as np
import pandas as pd
import zipfile
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from simpletransformers.classification import ClassificationModel, ClassificationArgs, MultiLabelClassificationModel
import wandb # weights and biases

In [None]:

# Load the raw song table into a pandas DataFrame straight from the zip archive.
# The archive handle is deliberately not named `zip`: binding that name at module
# level would shadow the builtin zip() for every later cell in the kernel.
# Alternative archive location: 'data/spotify_songs.csv.zip'
with zipfile.ZipFile('spotify_songs.csv.zip') as archive:
    with archive.open('spotify_songs.csv') as csv_file:
        data = pd.read_csv(csv_file)

In [None]:
# Run the project's preprocess() helper on the raw table and preview the result.
# NOTE(review): preprocess() lives in helper_functions.py (not shown here); later
# cells rely on its output having 'label' and 'text' columns — confirm.
from helper_functions import preprocess
df = preprocess(data)
df.head()

In [None]:
# Seed for both splits, so the train/validation/test partition is identical after
# every kernel restart (without it each run shuffles differently and results are
# not comparable between runs).
SPLIT_SEED = 42

# Hold out 20% of the data as the test set, stratified on the label
train_df, test_df = train_test_split(
    df, test_size=0.2, stratify=df['label'], random_state=SPLIT_SEED
)

# Carve 20% of the remaining rows off as the validation set, again stratified
train_df, eval_df = train_test_split(
    train_df, test_size=0.2, stratify=train_df['label'], random_state=SPLIT_SEED
)

# Sanity check of the three partition sizes
print(train_df.shape)
print(eval_df.shape)
print(test_df.shape)

In [None]:
# Hyperparameters to optimize: the search space explored by the sweep
search_space = {
    "num_train_epochs": {"min": 1, "max": 15},
    "learning_rate": {"min": 0, "max": 4e-4},
    "train_batch_size": {"values": [8, 12, 16]},
}

# Sweep configuration: Bayesian search maximizing the metric named "f1",
# with Hyperband early termination
sweep_config = {
    "name": "optim param",
    "method": "bayes",
    "metric": {"name": "f1", "goal": "maximize"},
    "parameters": search_space,
    "early_terminate": {"type": "hyperband", "min_iter": 6},
}

# Register the sweep with W&B; the returned id is what wandb.agent is given later
sweep_id = wandb.sweep(sweep_config, project="RTE - Hyperparameter Optimization")

# INFO-level logging in general, but only warnings and above from "transformers"
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Model arguments shared by every training run in this notebook
model_args = ClassificationArgs()

# Reproducibility and bookkeeping
model_args.manual_seed = 42
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.use_multiprocessing = True

# Optimization defaults
model_args.learning_rate = 4e-4
model_args.num_train_epochs = 10

# Evaluation on the validation data while training
model_args.evaluate_during_training = True
model_args.evaluate_during_training_silent = False
model_args.evaluate_during_training_steps = 1000
model_args.eval_batch_size = 8

# Weights & Biases project that receives the runs
model_args.wandb_project = "RTE - Hyperparameter Optimization"

# TODO: model_args.use_early_stopping = True -> tune the early-stopping parameters first



# Model 
#model = ClassificationModel(model_type='bert', model_name='bert-base-cased', use_cuda=True, num_labels=6, args=model_args)
#model = MultiLabelClassificationModel('roberta', 'roberta-base', args={'reprocess_input_data': True})

In [None]:
# This is a test: a reduced sweep that only varies the number of epochs (1 to 2).
# It rebinds `sweep_config` and `sweep_id`, so every wandb.agent call executed
# after this cell runs this sweep rather than one registered earlier.
sweep_config = dict(
    name="optim param",
    method="bayes",
    metric=dict(name="f1", goal="maximize"),
    parameters=dict(
        num_train_epochs=dict(min=1, max=2),
    ),
    early_terminate=dict(type="hyperband", min_iter=6),
)

# Register the sweep with W&B and keep its id for the agent
sweep_id = wandb.sweep(sweep_config, project="RTE - Hyperparameter Optimization")

# INFO-level logging in general, but only warnings and above from "transformers"
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
def train_test():
    """Run one sweep trial: fine-tune bert-base-cased with a class-weighted loss.

    Meant to be handed to ``wandb.agent``; it takes no arguments and returns
    nothing. It reads the module-level ``train_df``, ``eval_df`` and
    ``model_args``, and passes ``wandb.config`` to the model as the sweep
    configuration for the current trial.
    """
    # Initialize a new wandb run
    wandb.init()

    # Class weights as inverse class frequency, n_samples / (n_classes * count), so
    # that under-represented labels count more in the loss. Weighting by the raw
    # class share (count / n_samples) does the opposite and favours majority classes.
    # groupby() sorts by label, so position i holds the weight of the i-th label value.
    # NOTE(review): assumes labels are the integers 0..5 and all occur in train_df — confirm.
    label_counts = train_df.groupby(['label']).count()['text']
    weights = (train_df.shape[0] / (len(label_counts) * label_counts)).tolist()

    # Create a TransformerModel
    model = ClassificationModel(
        model_type='bert',
        model_name='bert-base-cased',
        use_cuda=True,
        num_labels=6,
        args=model_args,
        sweep_config=wandb.config,
        weight=weights,
    )

    # Train the model. The extra metric is passed under the keyword `f1`, the same
    # name the sweep configuration uses for the metric it maximizes.
    model.train_model(
        train_df,
        eval_df=eval_df,
        f1=lambda truth, predictions: f1_score(truth, [round(p) for p in predictions],average='weighted')
    )

    # Close the wandb run (wandb.join() is the deprecated spelling of wandb.finish())
    wandb.finish()

In [None]:
import time

# Upper bound on the number of trials this agent launches. A Bayesian sweep has no
# natural end, so without `count` the agent keeps requesting runs and the elapsed
# time below is never printed. Adjust to the available compute budget.
MAX_SWEEP_RUNS = 10

# perf_counter() is monotonic, which makes it the right clock for measuring durations
start = time.perf_counter()

# Run the sweep with the class-weighted training function
wandb.agent(sweep_id, train_test, count=MAX_SWEEP_RUNS)

end = time.perf_counter()
# Elapsed wall-clock time of the sweep, in seconds
print(end - start)

In [None]:
# Model

def train():
    """Run one sweep trial: fine-tune bert-base-cased without class weights.

    Meant to be handed to ``wandb.agent``; it takes no arguments and returns
    nothing. It reads the module-level ``train_df``, ``eval_df`` and
    ``model_args``, and passes ``wandb.config`` to the model as the sweep
    configuration for the current trial.
    """
    # Initialize a new wandb run
    wandb.init()

    # Create a TransformerModel
    model = ClassificationModel(
        model_type='bert',
        model_name='bert-base-cased',
        use_cuda=True,
        num_labels=6,
        args=model_args,
        sweep_config=wandb.config,
    )

    # Train the model. The extra metric is passed under the keyword `f1`, the same
    # name the sweep configuration uses for the metric it maximizes.
    model.train_model(
        train_df,
        eval_df=eval_df,
        f1=lambda truth, predictions: f1_score(truth, [round(p) for p in predictions],average='weighted')
    )

    # Close the wandb run (wandb.join() is the deprecated spelling of wandb.finish())
    wandb.finish()

In [None]:
import time

# Upper bound on the number of trials this agent launches. A Bayesian sweep has no
# natural end, so without `count` the agent keeps requesting runs and the elapsed
# time below is never printed. Adjust to the available compute budget.
MAX_SWEEP_RUNS = 10

# perf_counter() is monotonic, which makes it the right clock for measuring durations
start = time.perf_counter()

# Run the sweep with the unweighted training function
wandb.agent(sweep_id, train, count=MAX_SWEEP_RUNS)

end = time.perf_counter()
# Elapsed wall-clock time of the sweep, in seconds
print(end - start)

In [None]:
# Train the model
#model.train_model(train_df)

In [None]:
# The sweep functions only ever bind `model` as a local variable, so nothing at
# module level can be evaluated yet: load a trained checkpoint first.
# NOTE(review): assumes a sweep has already run with evaluate_during_training
# enabled, so that a best-model checkpoint exists at model_args.best_model_dir.
# Presumably this is the checkpoint of the most recent run only, since
# overwrite_output_dir is True — confirm before reporting results.
model = ClassificationModel(
    model_type='bert',
    model_name=model_args.best_model_dir,
    use_cuda=True,
    num_labels=6,
    args=model_args,
)

# Evaluate on the validation split, which this notebook binds to `eval_df`
# (`validation_df` is not defined anywhere).
result, model_outputs, wrong_preds = model.eval_model(eval_df)

In [None]:
from sklearn.metrics import f1_score, accuracy_score

def f1_multiclass(labels, preds):
    """Return the micro-averaged F1 score of `preds` against `labels`.

    Args:
        labels: True class labels.
        preds: Predicted class labels, aligned with `labels`.

    Returns:
        The value computed by sklearn's ``f1_score`` with ``average='micro'``.
    """
    return f1_score(y_true=labels, y_pred=preds, average='micro')

# Evaluate on the validation split, additionally reporting micro-F1 and accuracy.
# The split is bound to `eval_df` in this notebook; `validation_df` is not defined.
# NOTE(review): requires `model` to be bound at module level when this cell runs.
result, model_outputs, wrong_predictions = model.eval_model(eval_df, f1=f1_multiclass, acc=accuracy_score)

In [None]:
# Show the metrics returned by the most recent eval_model() call
print(result)

In [None]:
# Smoke test: classify a single hand-written sentence.
# NOTE(review): requires `model` to be bound at module level when this cell runs.
predictions, raw_outputs = model.predict(['I love rock n roll'])

In [None]:
# Display the predictions for the example sentence
predictions