In [1]:
#Connect to wandb
#TO DO : how to save models on the weight and bias platform
import wandb
wandb.login()
wandb.init(project="svm", 
           entity="benchmark-nlp",
           name='emotion datasets svc') #CHANGE

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import os
os.chdir('../../..')

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [4]:
#Set constant values
SEED=42 
OPT_ITER=10  #Change??

## Load data

In [5]:
dl = DataLoader(['emotion'])
data = dl.load()


tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

#We are not interested in the test sets for hyperparameter optimization
train_eval_emotion, val_eval_emotion, _ = data_splitter(data['tweetEval'],
                                 tweet_preprocessor,  #Need to rerun this one
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_carer, val_carer, _ = data_splitter(data['CARER'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_silicone, val_silicone, _ = data_splitter(data['silicone'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)

87170 rows preprocessed in 39.37953495979309 seconds
7740 rows preprocessed in 2.6140074729919434 seconds
8069 rows preprocessed in 2.6870648860931396 seconds


In [9]:
%%time
#fasttext 
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

CPU times: total: 18 s
Wall time: 34.2 s




In [10]:
embedded_train_eval_emotion = fasttext.generate_sentence_embeddings(train_eval_emotion['text'])
embedded_val_eval_emotion = fasttext.generate_sentence_embeddings(val_eval_emotion['text'])
embedded_train_eval_emotion['label'] = train_eval_emotion['label'].to_list()
embedded_val_eval_emotion['label'] = val_eval_emotion['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3257/3257 [00:22<00:00, 142.50it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 374/374 [00:01<00:00, 226.23it/s]


In [11]:
embedded_train_carer = fasttext.generate_sentence_embeddings(train_carer['text'])
embedded_val_carer = fasttext.generate_sentence_embeddings(val_carer['text'])
embedded_train_carer['label'] = train_carer['label'].to_list()
embedded_val_carer['label'] = val_carer['label'].to_list()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 16000/16000 [00:45<00:00, 351.19it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:06<00:00, 323.26it/s]


In [10]:
embedded_train_silicone = fasttext.generate_sentence_embeddings(train_silicone['text'])
embedded_val_silicone = fasttext.generate_sentence_embeddings(val_silicone['text'])
embedded_train_silicone['label'] = train_silicone['label'].to_list()
embedded_val_silicone['label'] = val_silicone['label'].to_list()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 87170/87170 [01:51<00:00, 781.30it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 8069/8069 [00:13<00:00, 600.92it/s]


## Hyperopt

In [6]:
import yaml
#Load the template yaml sweep config file for logistic regression
#If the value range for an hyperparameter needs to be changed, better to do it in the .yaml file than in a notebook
with open("config/svm_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [9]:
#The config is displayed as a nested dictionary
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'C': {'min': 0, 'max': 10, 'distribution': 'uniform'},
  'kernel': {'values': ['linear', 'rbf']},
  'probability': {'value': True},
  'random_state': {'value': 42}}}

#### eval emotion

In [15]:
#Don't forget to name the sweep instance  
name = 'svm_tfidf_eval_emotion' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_eval_emotion, #Change here
          val=val_eval_emotion): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = SVC(C=config.C, 
                  kernel=config.kernel,
                  probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER) #Count : number of iterations
tracker.stop()

Create sweep with ID: 3ea77bj6
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/3ea77bj6


[34m[1mwandb[0m: Agent Starting Run: 448e4xcl with config:
[34m[1mwandb[0m: 	C: 9.423066515664598
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.66043
f1 macro,0.55692


[34m[1mwandb[0m: Agent Starting Run: 6b9hffns with config:
[34m[1mwandb[0m: 	C: 8.151009962757636
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.59358
f1 macro,0.50662


[34m[1mwandb[0m: Agent Starting Run: 3pfi23eb with config:
[34m[1mwandb[0m: 	C: 3.572417139778181
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.61497
f1 macro,0.527


[34m[1mwandb[0m: Agent Starting Run: 4m4icp3r with config:
[34m[1mwandb[0m: 	C: 9.255215577186055
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.59893
f1 macro,0.5109


[34m[1mwandb[0m: Agent Starting Run: n7qcl9t0 with config:
[34m[1mwandb[0m: 	C: 9.498832367873874
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.66043
f1 macro,0.55692


[34m[1mwandb[0m: Agent Starting Run: k12mxygk with config:
[34m[1mwandb[0m: 	C: 5.444442256852879
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.65508
f1 macro,0.55261


[34m[1mwandb[0m: Agent Starting Run: rzg1ef1u with config:
[34m[1mwandb[0m: 	C: 9.46883720433424
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.59626
f1 macro,0.50898


[34m[1mwandb[0m: Agent Starting Run: i1lv1ljc with config:
[34m[1mwandb[0m: 	C: 5.5710953641947185
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.60428
f1 macro,0.51506


[34m[1mwandb[0m: Agent Starting Run: zrhk3841 with config:
[34m[1mwandb[0m: 	C: 8.345753913270567
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.65775
f1 macro,0.55519


[34m[1mwandb[0m: Agent Starting Run: x6ajvmh4 with config:
[34m[1mwandb[0m: 	C: 9.4060073807554
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.66043
f1 macro,0.55692


0.0015878593236364306

In [16]:
#Don't forget to name the sweep instance   
name = 'svm_ft_eval_emotion' #change here
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_eval_emotion, #Change here
          val=embedded_val_eval_emotion): #change here
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        clf = SVC(C=config.C, 
                  kernel=config.kernel,
                  probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()

Create sweep with ID: r0w06auo
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/r0w06auo


[34m[1mwandb[0m: Agent Starting Run: 808ttoyo with config:
[34m[1mwandb[0m: 	C: 8.10364331881112
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.64973
f1 macro,0.57857


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7rbmro3f with config:
[34m[1mwandb[0m: 	C: 1.089636323090759
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.61765
f1 macro,0.47861


[34m[1mwandb[0m: Agent Starting Run: l6aykz8c with config:
[34m[1mwandb[0m: 	C: 6.049734560168851
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.62567
f1 macro,0.52761


[34m[1mwandb[0m: Agent Starting Run: gpaho7cy with config:
[34m[1mwandb[0m: 	C: 2.522989768757964
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.63904
f1 macro,0.5409


[34m[1mwandb[0m: Agent Starting Run: 7nf4s4ik with config:
[34m[1mwandb[0m: 	C: 7.706289152149179
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.62834
f1 macro,0.53133


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: hjjxqgci with config:
[34m[1mwandb[0m: 	C: 2.924654669690782
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.63102
f1 macro,0.5331


[34m[1mwandb[0m: Agent Starting Run: v10rcn3h with config:
[34m[1mwandb[0m: 	C: 2.253768592876686
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.63369
f1 macro,0.53618


[34m[1mwandb[0m: Agent Starting Run: 3y91xi6p with config:
[34m[1mwandb[0m: 	C: 5.882658926708409
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.65241
f1 macro,0.58687


[34m[1mwandb[0m: Agent Starting Run: kuhlys6r with config:
[34m[1mwandb[0m: 	C: 2.911120222171768
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.6631
f1 macro,0.58445


[34m[1mwandb[0m: Agent Starting Run: os701n1f with config:
[34m[1mwandb[0m: 	C: 2.242588536733482
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.63102
f1 macro,0.53363


0.0020062178776431004

#### CARER

In [17]:
#Don't forget to name the sweep instance  
name = 'svm_tfidf_carer' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_carer, #Change here
          val=val_carer): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = SVC(C=config.C, 
                  kernel=config.kernel,
                  probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
tracker.stop()

Create sweep with ID: lt2papke
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/lt2papke


[34m[1mwandb[0m: Agent Starting Run: zapnqcyf with config:
[34m[1mwandb[0m: 	C: 7.796410356663496
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8745
f1 macro,0.83782


[34m[1mwandb[0m: Agent Starting Run: ryip2ka0 with config:
[34m[1mwandb[0m: 	C: 1.8965406731910208
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.891
f1 macro,0.86241


[34m[1mwandb[0m: Agent Starting Run: q8srcf2j with config:
[34m[1mwandb[0m: 	C: 0.0799464097102387
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.6355
f1 macro,0.43442


[34m[1mwandb[0m: Agent Starting Run: dr4c81a3 with config:
[34m[1mwandb[0m: 	C: 2.4320421070600453
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.876
f1 macro,0.84154


[34m[1mwandb[0m: Agent Starting Run: kg58gvsc with config:
[34m[1mwandb[0m: 	C: 1.1673928664844258
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.893
f1 macro,0.86222


[34m[1mwandb[0m: Agent Starting Run: 28wmt8gv with config:
[34m[1mwandb[0m: 	C: 8.719510965998182
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8765
f1 macro,0.84719


[34m[1mwandb[0m: Agent Starting Run: igcw00q2 with config:
[34m[1mwandb[0m: 	C: 0.5188800778544767
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8865
f1 macro,0.85664


[34m[1mwandb[0m: Agent Starting Run: ezozfyvd with config:
[34m[1mwandb[0m: 	C: 3.069112423010596
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.876
f1 macro,0.84044


[34m[1mwandb[0m: Agent Starting Run: 4jxxzs3s with config:
[34m[1mwandb[0m: 	C: 9.039720058600263
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.876
f1 macro,0.84721


[34m[1mwandb[0m: Agent Starting Run: t2ywciwh with config:
[34m[1mwandb[0m: 	C: 7.1701053138700575
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8795
f1 macro,0.84981


0.008614541476856117

In [18]:
#Don't forget to name the sweep instance   
name = 'svm_ft_carer' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_carer, #CHANGE HERE
          val=embedded_val_carer): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = SVC(C=config.C, 
                  kernel=config.kernel,
                  probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()

Create sweep with ID: uguo86hy
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/uguo86hy


[34m[1mwandb[0m: Agent Starting Run: nys6p7au with config:
[34m[1mwandb[0m: 	C: 6.831610531306077
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.671
f1 macro,0.55779


[34m[1mwandb[0m: Agent Starting Run: hgdz65gc with config:
[34m[1mwandb[0m: 	C: 1.8923282168886535
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.6775
f1 macro,0.56607


[34m[1mwandb[0m: Agent Starting Run: u2wx7bpg with config:
[34m[1mwandb[0m: 	C: 5.471602742107594
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.6675
f1 macro,0.54946


[34m[1mwandb[0m: Agent Starting Run: nizmirsd with config:
[34m[1mwandb[0m: 	C: 4.797015721599608
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.69
f1 macro,0.60055


[34m[1mwandb[0m: Agent Starting Run: ls931xbo with config:
[34m[1mwandb[0m: 	C: 0.2107229806851585
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.554
f1 macro,0.27749


[34m[1mwandb[0m: Agent Starting Run: lb8wvfzc with config:
[34m[1mwandb[0m: 	C: 9.38561314675159
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.68
f1 macro,0.59826


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xkyv80ji with config:
[34m[1mwandb[0m: 	C: 7.52363227178957
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.682
f1 macro,0.59623


[34m[1mwandb[0m: Agent Starting Run: 6kwcrn1z with config:
[34m[1mwandb[0m: 	C: 7.089953639481972
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.6695
f1 macro,0.55555


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: gg3ip5il with config:
[34m[1mwandb[0m: 	C: 7.871551473979233
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.6715
f1 macro,0.56288


[34m[1mwandb[0m: Agent Starting Run: o5fcliw2 with config:
[34m[1mwandb[0m: 	C: 9.533113109707688
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.673
f1 macro,0.56968


0.015461782987135905

#### silicone

In [7]:
#Don't forget to name the sweep instance  
name = 'svm_tfidf_silicone' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_silicone, #Change here
          val=val_silicone): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = LinearSVC(C=config.C, 
                        loss='hinge',
#                   kernel=config.kernel,
#                   probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
#         pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#         else:
        aucpc = '-'
        auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
tracker.stop()

Create sweep with ID: 35hdqhr1
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/35hdqhr1


[34m[1mwandb[0m: Agent Starting Run: unrw1945 with config:
[34m[1mwandb[0m: 	C: 2.145705009196047
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.89193
f1 macro,0.26193


[34m[1mwandb[0m: Agent Starting Run: ypwjssoe with config:
[34m[1mwandb[0m: 	C: 1.4196474004701476
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.89354
f1 macro,0.25569


[34m[1mwandb[0m: Agent Starting Run: h09w9r8b with config:
[34m[1mwandb[0m: 	C: 0.6963354258778209
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.89292
f1 macro,0.23059


[34m[1mwandb[0m: Agent Starting Run: 7gvequma with config:
[34m[1mwandb[0m: 	C: 5.344457441820717
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88995
f1 macro,0.27572


[34m[1mwandb[0m: Agent Starting Run: s3df7hun with config:
[34m[1mwandb[0m: 	C: 0.3817044305985984
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88983
f1 macro,0.21595


[34m[1mwandb[0m: Agent Starting Run: cw80g452 with config:
[34m[1mwandb[0m: 	C: 7.325344026502989
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88784
f1 macro,0.27577


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 1ox07sef with config:
[34m[1mwandb[0m: 	C: 9.851779111184296
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8866
f1 macro,0.27535


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6s5t1kuj with config:
[34m[1mwandb[0m: 	C: 4.015338559063678
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.89057
f1 macro,0.27074


[34m[1mwandb[0m: Agent Starting Run: 7xjyez2h with config:
[34m[1mwandb[0m: 	C: 7.305191251744239
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88797
f1 macro,0.27584


[34m[1mwandb[0m: Agent Starting Run: ovv40dhv with config:
[34m[1mwandb[0m: 	C: 6.779042487218171
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88846
f1 macro,0.27648


0.001157240238882834

In [11]:
#Don't forget to name the sweep instance   
name = 'svm_ft_silicone' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_silicone, #CHANGE HERE
          val=embedded_val_silicone): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = LinearSVC(C=config.C, 
                  loss='hinge',
#                   kernel=config.kernel,
#                   probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
#         pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#             #Log predictions on WandB
#         else:
        aucpc = '-'
        auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
tracker.stop()

Create sweep with ID: jz1vlwnl
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/jz1vlwnl


[34m[1mwandb[0m: Agent Starting Run: ymeo00tf with config:
[34m[1mwandb[0m: 	C: 6.0466256908660725
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88251
f1 macro,0.15524


[34m[1mwandb[0m: Agent Starting Run: yvvmo4mf with config:
[34m[1mwandb[0m: 	C: 1.7868754545190857
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88264
f1 macro,0.15649


[34m[1mwandb[0m: Agent Starting Run: izuau0hr with config:
[34m[1mwandb[0m: 	C: 9.852511872787597
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88276
f1 macro,0.15683


[34m[1mwandb[0m: Agent Starting Run: byc1dplx with config:
[34m[1mwandb[0m: 	C: 3.998514004167699
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88251
f1 macro,0.15493


[34m[1mwandb[0m: Agent Starting Run: zqn6gacb with config:
[34m[1mwandb[0m: 	C: 8.53188644178246
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88239
f1 macro,0.15489


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ad86j4jx with config:
[34m[1mwandb[0m: 	C: 1.340780741247206
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88152
f1 macro,0.14954


[34m[1mwandb[0m: Agent Starting Run: 7xeqwhh6 with config:
[34m[1mwandb[0m: 	C: 8.31383230540582
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88165
f1 macro,0.15089


[34m[1mwandb[0m: Agent Starting Run: emlr5thv with config:
[34m[1mwandb[0m: 	C: 9.880787830669794
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88152
f1 macro,0.14954


[34m[1mwandb[0m: Agent Starting Run: 1t7qiuo3 with config:
[34m[1mwandb[0m: 	C: 7.247940091193216
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88251
f1 macro,0.15615


[34m[1mwandb[0m: Agent Starting Run: pbzpa5da with config:
[34m[1mwandb[0m: 	C: 2.1106469446999787
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88239
f1 macro,0.1552


0.0024874125108176118