In [5]:
#Connect to Weights & Biases (wandb): authenticate, then open the notebook-level run
#TODO: work out how to save models on the Weights & Biases platform
import wandb
wandb.login()
wandb.init(project="hyperopt", 
           entity="benchmark-nlp",
           name='emotion datasets') #CHANGE this run name as needed



VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

In [6]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing import Preprocessor
from util.datasplitter import data_splitter
from fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [7]:
#Constants shared by the cells below
SEED = 42       #seed handed to data_splitter when the splits are created
OPT_ITER = 10   #run count passed to each wandb.agent call (count=OPT_ITER)

## Load data

In [12]:
dl = DataLoader(['emotion'])
data = dl.load()

tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

#Split every emotion dataset the same way, in the order eval_emotion -> CARER -> silicone.
#The third element returned by data_splitter (the test split) is discarded because
#hyperparameter optimization only needs the train and validation splits.
#create_val_set=True is kept from the original calls (original note: no validation set is provided).
(
    (train_eval_emotion, val_eval_emotion, _),
    (train_carer, val_carer, _),
    (train_silicone, val_silicone, _),
) = [
    data_splitter(data[dataset_key], preprocessor, create_val_set=True, seed=SEED)
    for dataset_key in ('eval_emotion', 'CARER', 'silicone')
]

3257 rows preprocessed in 3.2080190181732178 seconds
1421 rows preprocessed in 0.7737503051757812 seconds
374 rows preprocessed in 0.22943925857543945 seconds
16000 rows preprocessed in 9.679208278656006 seconds
2000 rows preprocessed in 0.8314423561096191 seconds
2000 rows preprocessed in 0.9522378444671631 seconds
87170 rows preprocessed in 26.219103813171387 seconds
7740 rows preprocessed in 2.22611141204834 seconds
8069 rows preprocessed in 2.11386775970459 seconds


In [9]:
%%time
#Load the fastText binary model from fasttext/cc.en.300.bin
#(the cell is timed with %%time; the recorded run took about 49 s)
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')



Wall time: 49.4 s


In [10]:
#Embed the eval_emotion train and validation texts (train first), then attach the labels
embedded_train_eval_emotion, embedded_val_eval_emotion = [
    fasttext.generate_sentence_embeddings(split['text'])
    for split in (train_eval_emotion, val_eval_emotion)
]
embedded_train_eval_emotion['label'] = train_eval_emotion['label'].to_list()
embedded_val_eval_emotion['label'] = val_eval_emotion['label'].to_list()

starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3257/3257 [00:16<00:00, 194.17it/s]


starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 374/374 [00:01<00:00, 311.13it/s]


In [13]:
#Embed the CARER train and validation texts (train first), then attach the labels
embedded_train_carer, embedded_val_carer = [
    fasttext.generate_sentence_embeddings(split['text'])
    for split in (train_carer, val_carer)
]
embedded_train_carer['label'] = train_carer['label'].to_list()
embedded_val_carer['label'] = val_carer['label'].to_list()

starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 16000/16000 [00:53<00:00, 297.39it/s]


starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 2000/2000 [00:05<00:00, 342.39it/s]


In [14]:
#Embed the silicone train and validation texts (train first), then attach the labels
embedded_train_silicone, embedded_val_silicone = [
    fasttext.generate_sentence_embeddings(split['text'])
    for split in (train_silicone, val_silicone)
]
embedded_train_silicone['label'] = train_silicone['label'].to_list()
embedded_val_silicone['label'] = val_silicone['label'].to_list()

starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 87170/87170 [03:35<00:00, 405.28it/s]


starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 8069/8069 [00:18<00:00, 433.14it/s]


## Hyperopt

In [None]:
import yaml
#Load the template sweep configuration for logistic regression from config/lr_sweep.yaml.
#To change the value range of a hyperparameter, edit the .yaml file rather than this notebook.
with open("config/lr_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [None]:
#Display the loaded sweep configuration (a nested dictionary)
sweep_config

#### eval emotion

In [11]:
#Name the sweep instance; `name` is also read by train_tfidf (W&B run group)
#and by the EmissionsTracker (project_name) further down in this cell
name = 'lr_tfidf_eval_emotion' #change here
sweep_config['name'] =  name

#Register the sweep on W&B; the returned sweep_id is what wandb.agent is launched with
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_eval_emotion, #Change here
          val=val_eval_emotion): #change here
    '''
    Run one W&B sweep trial: TF-IDF vectorizer followed by logistic regression.

    The pipeline is fitted on ``train`` and evaluated on ``val``. Accuracy and
    macro F1 are always logged; ROC AUC and average precision are only computed
    when the training labels hold at most two distinct values, otherwise the
    placeholder '-' is logged for both.

    Parameters
    ----------
    config : dict or None
        Forwarded to ``wandb.init``. When the function is called by
        ``wandb.agent`` the hyperparameters (C, penalty, solver, random_state)
        are read back from ``wandb.config``.
    train, val : objects indexable by 'text' and 'label'
        Training and validation splits. The defaults are bound when this cell
        is executed, so re-run the cell after the splits change.

    Returns
    -------
    None. Metrics are sent to the active W&B run.
    '''
    # Initialize a new wandb run; the module-level `name` is read at call time as the run group
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', LogisticRegression(C = config.C,
                                       penalty = config.penalty,
                                       solver = config.solver,
                                       random_state=config.random_state)), #set the hyperparams here
        ])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])

        #Make predictions
        pred_val = pipe.predict(val['text'])
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Class probabilities are only needed for the binary ranking metrics,
            #so the extra pass over the validation set is skipped otherwise
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track the emissions of the whole sweep; CodeCarbon writes to output/emissions_hyperopt.csv
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of iterations
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so that
    #the measurement is not lost and the tracker is not left running
    sweep_emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
sweep_emissions



Create sweep with ID: biqapfvd
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/biqapfvd


[34m[1mwandb[0m: Agent Starting Run: i94rbph9 with config:
[34m[1mwandb[0m: 	C: 1.056866099612528
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66664
AUC-PC,0.70335
accuracy,0.62042
f1 macro,0.62042


[34m[1mwandb[0m: Agent Starting Run: bjhlisid with config:
[34m[1mwandb[0m: 	C: 9.55048241527096
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.6688
AUC-PC,0.69298
accuracy,0.61649
f1 macro,0.61527


[34m[1mwandb[0m: Agent Starting Run: n1njk7mr with config:
[34m[1mwandb[0m: 	C: 1.155638178048689
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66713
AUC-PC,0.7035
accuracy,0.62304
f1 macro,0.62303


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: dgqkept9 with config:
[34m[1mwandb[0m: 	C: 8.54461350434136
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66908
AUC-PC,0.69388
accuracy,0.61518
f1 macro,0.61402


[34m[1mwandb[0m: Agent Starting Run: fo9d14qm with config:
[34m[1mwandb[0m: 	C: 7.114458570178693
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66912
AUC-PC,0.69495
accuracy,0.6178
f1 macro,0.61664


[34m[1mwandb[0m: Agent Starting Run: 95bvmmqs with config:
[34m[1mwandb[0m: 	C: 1.6334231421943424
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66821
AUC-PC,0.70344
accuracy,0.61518
f1 macro,0.61497


[34m[1mwandb[0m: Agent Starting Run: pg4yizgp with config:
[34m[1mwandb[0m: 	C: 8.388096600186808
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66911
AUC-PC,0.69386
accuracy,0.61649
f1 macro,0.61538


[34m[1mwandb[0m: Agent Starting Run: z94999i6 with config:
[34m[1mwandb[0m: 	C: 6.751694093883
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66914
AUC-PC,0.69504
accuracy,0.61911
f1 macro,0.6179


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lhstew2i with config:
[34m[1mwandb[0m: 	C: 6.900613882199941
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66928
AUC-PC,0.695
accuracy,0.61911
f1 macro,0.6179


[34m[1mwandb[0m: Agent Starting Run: 4b3ziuy6 with config:
[34m[1mwandb[0m: 	C: 4.847031198421675
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66903
AUC-PC,0.69694
accuracy,0.61649
f1 macro,0.61549


[34m[1mwandb[0m: Agent Starting Run: 1idk7ym0 with config:
[34m[1mwandb[0m: 	C: 7.386080855938955
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66923
AUC-PC,0.6946
accuracy,0.61911
f1 macro,0.6179


[34m[1mwandb[0m: Agent Starting Run: 9w497av5 with config:
[34m[1mwandb[0m: 	C: 5.829564325693516
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66921
AUC-PC,0.69639
accuracy,0.6178
f1 macro,0.61664


[34m[1mwandb[0m: Agent Starting Run: b04hac4b with config:
[34m[1mwandb[0m: 	C: 7.177907446454338
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66914
AUC-PC,0.6947
accuracy,0.6178
f1 macro,0.61664


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g05zzjq3 with config:
[34m[1mwandb[0m: 	C: 5.387087709141829
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66925
AUC-PC,0.69686
accuracy,0.61911
f1 macro,0.61801


[34m[1mwandb[0m: Agent Starting Run: up8y632y with config:
[34m[1mwandb[0m: 	C: 9.908298732605308
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66874
AUC-PC,0.69272
accuracy,0.6178
f1 macro,0.61653


[34m[1mwandb[0m: Agent Starting Run: 1einez6o with config:
[34m[1mwandb[0m: 	C: 0.4878835041906193
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66228
AUC-PC,0.70319
accuracy,0.60209
f1 macro,0.60065


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: zmu01o4f with config:
[34m[1mwandb[0m: 	C: 3.2486046936855417
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66881
AUC-PC,0.69979
accuracy,0.61126
f1 macro,0.61053


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: iysjjokq with config:
[34m[1mwandb[0m: 	C: 9.487199190529989
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66879
AUC-PC,0.69303
accuracy,0.61649
f1 macro,0.61527


[34m[1mwandb[0m: Agent Starting Run: 7eucz7v4 with config:
[34m[1mwandb[0m: 	C: 5.927375807283645
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66919
AUC-PC,0.69644
accuracy,0.6178
f1 macro,0.61664


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 8tntierd with config:
[34m[1mwandb[0m: 	C: 6.294502558919288
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66933
AUC-PC,0.69599
accuracy,0.6178
f1 macro,0.61664


0.0024484422628172895

In [13]:
#Read back the CSV that the EmissionsTracker instances write to and preview the first rows
emissions = pd.read_csv('output/emissions_hyperopt.csv')
emissions.head()

Unnamed: 0,timestamp,project_name,run_id,duration,emissions,emissions_rate,cpu_power,gpu_power,ram_power,cpu_energy,...,python_version,cpu_count,cpu_model,gpu_count,gpu_model,longitude,latitude,ram_total_size,tracking_mode,on_cloud
0,2022-08-03T10:23:21,lr_tfidf_sem_eval_A,ec709f36-05a9-4214-b01f-8e2cd89bb2b9,362.060907,0.000907,0.002504,42.5,0.0,4.457148,0.004274,...,3.7.13,4,Intel(R) Core(TM) i5-7200U CPU @ 2.50GHz,1,1 x GeForce 940MX,4.3945,50.7151,11.885727,machine,N


#### eval emotion fasttext

In [38]:
#Name the sweep instance; `name` is also read by train_fasttext (W&B run group)
#and by the EmissionsTracker (project_name) further down in this cell
name = 'lr_ft_eval_emotion' #change here
sweep_config['name'] = name
#Register the sweep on W&B; the returned sweep_id is what wandb.agent is launched with
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_eval_emotion, #Change here
          val=embedded_val_eval_emotion): #change here
    '''
    Run one W&B sweep trial: logistic regression on precomputed sentence embeddings.

    Every column except 'label' is used as a feature, with missing values
    replaced by 0. Accuracy and macro F1 are always logged; ROC AUC and average
    precision are only computed when the training labels hold at most two
    distinct values, otherwise the placeholder '-' is logged for both.

    Parameters
    ----------
    config : dict or None
        Forwarded to ``wandb.init``. When the function is called by
        ``wandb.agent`` the hyperparameters (C, penalty, solver, random_state)
        are read back from ``wandb.config``.
    train, val : frames with feature columns plus a 'label' column
        Embedded training and validation splits. The defaults are bound when
        this cell is executed, so re-run the cell after the splits change.

    Returns
    -------
    None. Metrics are sent to the active W&B run.
    '''
    # Initialize a new wandb run; the module-level `name` is read at call time as the run group
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, this config is set by the Sweep Controller
        config = wandb.config
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once instead of recomputing it for every call
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Make predictions
        pred_val = pipe.predict(features_val)
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Class probabilities are only needed for the binary ranking metrics
            pred_prob_val = pipe.predict_proba(features_val)[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track the emissions of the whole sweep; CodeCarbon writes to output/emissions_hyperopt.csv
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of iterations
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so that
    #the measurement is not lost and the tracker is not left running
    sweep_emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
sweep_emissions



Create sweep with ID: py2bzp5f
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/py2bzp5f


[34m[1mwandb[0m: Agent Starting Run: w6uqzkpq with config:
[34m[1mwandb[0m: 	C: 0.001
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.61088
AUC-PC,0.63758
accuracy,0.45812
f1 macro,0.31418


[34m[1mwandb[0m: Agent Starting Run: aew3m0dz with config:
[34m[1mwandb[0m: 	C: 0.01
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.6144
AUC-PC,0.64263
accuracy,0.51963
f1 macro,0.46356


[34m[1mwandb[0m: Agent Starting Run: hk2ejizy with config:
[34m[1mwandb[0m: 	C: 0.1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.63565
AUC-PC,0.66675
accuracy,0.59686
f1 macro,0.59483


[34m[1mwandb[0m: Agent Starting Run: gp0glhsd with config:
[34m[1mwandb[0m: 	C: 1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.6738
AUC-PC,0.70349
accuracy,0.61649
f1 macro,0.61649


[34m[1mwandb[0m: Agent Starting Run: wn3qo5gp with config:
[34m[1mwandb[0m: 	C: 10
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.68126
AUC-PC,0.70546
accuracy,0.6322
f1 macro,0.63202


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


0.0006315412025605075

#### CARER

In [39]:
#Name the sweep instance; `name` is also read by train_tfidf (W&B run group)
#and by the EmissionsTracker (project_name) further down in this cell
name = 'lr_tfidf_carer' #change here
sweep_config['name'] =  name

#Register the sweep on W&B; the returned sweep_id is what wandb.agent is launched with
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_carer, #Change here
          val=val_carer): #change here
    '''
    Run one W&B sweep trial: TF-IDF vectorizer followed by logistic regression.

    The pipeline is fitted on ``train`` and evaluated on ``val``. Accuracy and
    macro F1 are always logged; ROC AUC and average precision are only computed
    when the training labels hold at most two distinct values, otherwise the
    placeholder '-' is logged for both.

    Parameters
    ----------
    config : dict or None
        Forwarded to ``wandb.init``. When the function is called by
        ``wandb.agent`` the hyperparameters (C, penalty, solver, random_state)
        are read back from ``wandb.config``.
    train, val : objects indexable by 'text' and 'label'
        Training and validation splits. The defaults are bound when this cell
        is executed, so re-run the cell after the splits change.

    Returns
    -------
    None. Metrics are sent to the active W&B run.
    '''
    # Initialize a new wandb run; the module-level `name` is read at call time as the run group
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', LogisticRegression(C = config.C,
                                       penalty = config.penalty,
                                       solver = config.solver,
                                       random_state=config.random_state)), #set the hyperparams here
        ])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])

        #Make predictions
        pred_val = pipe.predict(val['text'])
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Class probabilities are only needed for the binary ranking metrics,
            #so the extra pass over the validation set is skipped otherwise
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track the emissions of the whole sweep; CodeCarbon writes to output/emissions_hyperopt.csv
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of iterations
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so that
    #the measurement is not lost and the tracker is not left running
    sweep_emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
sweep_emissions



Create sweep with ID: 0gju34lm
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/0gju34lm


[34m[1mwandb[0m: Agent Starting Run: 5hx32lbi with config:
[34m[1mwandb[0m: 	C: 0.001
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.45812
f1 macro,0.15709


[34m[1mwandb[0m: Agent Starting Run: eip7uvqu with config:
[34m[1mwandb[0m: 	C: 0.01
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.45812
f1 macro,0.15709


[34m[1mwandb[0m: Agent Starting Run: 8qz6u55c with config:
[34m[1mwandb[0m: 	C: 0.1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.52618
f1 macro,0.24804


[34m[1mwandb[0m: Agent Starting Run: f8dmoo2e with config:
[34m[1mwandb[0m: 	C: 1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.56283
f1 macro,0.29922


[34m[1mwandb[0m: Agent Starting Run: llr72gcx with config:
[34m[1mwandb[0m: 	C: 10
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.56675
f1 macro,0.32862


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


0.0006036175387369601

In [40]:
#Name the sweep instance; `name` is also read by train_fasttext (W&B run group)
#and by the EmissionsTracker (project_name) further down in this cell
name = 'lr_ft_carer' #CHANGE HERE
sweep_config['name'] = name
#Register the sweep on W&B; the returned sweep_id is what wandb.agent is launched with
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_carer, #CHANGE HERE
          val=embedded_val_carer): #CHANGE HERE
    '''
    Run one W&B sweep trial: logistic regression on precomputed sentence embeddings.

    Every column except 'label' is used as a feature, with missing values
    replaced by 0. Accuracy and macro F1 are always logged; ROC AUC and average
    precision are only computed when the training labels hold at most two
    distinct values, otherwise the placeholder '-' is logged for both.

    Parameters
    ----------
    config : dict or None
        Forwarded to ``wandb.init``. When the function is called by
        ``wandb.agent`` the hyperparameters (C, penalty, solver, random_state)
        are read back from ``wandb.config``.
    train, val : frames with feature columns plus a 'label' column
        Embedded training and validation splits. The defaults are bound when
        this cell is executed, so re-run the cell after the splits change.

    Returns
    -------
    None. Metrics are sent to the active W&B run.
    '''
    # Initialize a new wandb run; the module-level `name` is read at call time as the run group
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once instead of recomputing it for every call
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Make predictions
        pred_val = pipe.predict(features_val)
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Class probabilities are only needed for the binary ranking metrics
            pred_prob_val = pipe.predict_proba(features_val)[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track the emissions of the whole sweep; CodeCarbon writes to output/emissions_hyperopt.csv
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of iterations
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so that
    #the measurement is not lost and the tracker is not left running
    sweep_emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
sweep_emissions



Create sweep with ID: ylevv7se
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/ylevv7se


[34m[1mwandb[0m: Agent Starting Run: cc21g8kn with config:
[34m[1mwandb[0m: 	C: 0.001
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.45812
f1 macro,0.15709


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: d4aaxi0o with config:
[34m[1mwandb[0m: 	C: 0.01
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.45812
f1 macro,0.1586


[34m[1mwandb[0m: Agent Starting Run: n4g7dw9a with config:
[34m[1mwandb[0m: 	C: 0.1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.53796
f1 macro,0.26719


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xb924z9x with config:
[34m[1mwandb[0m: 	C: 1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.56021
f1 macro,0.29694


[34m[1mwandb[0m: Agent Starting Run: ky4jo764 with config:
[34m[1mwandb[0m: 	C: 10
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.5733
f1 macro,0.34083


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


0.0006700701508274893

#### silicone

In [41]:
#Name the sweep instance; `name` is also read by train_tfidf (W&B run group)
#and by the EmissionsTracker (project_name) further down in this cell
name = 'lr_tfidf_silicone' #change here
sweep_config['name'] =  name

#Register the sweep on W&B; the returned sweep_id is what wandb.agent is launched with
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_silicone, #Change here
          val=val_silicone): #change here
    '''
    Run one W&B sweep trial: TF-IDF vectorizer followed by logistic regression.

    The pipeline is fitted on ``train`` and evaluated on ``val``. Accuracy and
    macro F1 are always logged; ROC AUC and average precision are only computed
    when the training labels hold at most two distinct values, otherwise the
    placeholder '-' is logged for both.

    Parameters
    ----------
    config : dict or None
        Forwarded to ``wandb.init``. When the function is called by
        ``wandb.agent`` the hyperparameters (C, penalty, solver, random_state)
        are read back from ``wandb.config``.
    train, val : objects indexable by 'text' and 'label'
        Training and validation splits. The defaults are bound when this cell
        is executed, so re-run the cell after the splits change.

    Returns
    -------
    None. Metrics are sent to the active W&B run.
    '''
    # Initialize a new wandb run; the module-level `name` is read at call time as the run group
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', LogisticRegression(C = config.C,
                                       penalty = config.penalty,
                                       solver = config.solver,
                                       random_state=config.random_state)), #set the hyperparams here
        ])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])

        #Make predictions
        pred_val = pipe.predict(val['text'])
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Class probabilities are only needed for the binary ranking metrics,
            #so the extra pass over the validation set is skipped otherwise
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track the emissions of the whole sweep; CodeCarbon writes to output/emissions_hyperopt.csv
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of iterations
    wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so that
    #the measurement is not lost and the tracker is not left running
    sweep_emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
sweep_emissions



Create sweep with ID: 0inre770
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/0inre770


[34m[1mwandb[0m: Agent Starting Run: bp67aa44 with config:
[34m[1mwandb[0m: 	C: 0.001
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60044
AUC-PC,0.38587
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: 9wlcp3mq with config:
[34m[1mwandb[0m: 	C: 0.01
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60054
AUC-PC,0.38625
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: v26ws112 with config:
[34m[1mwandb[0m: 	C: 0.1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60154
AUC-PC,0.3895
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: 8szitlpk with config:
[34m[1mwandb[0m: 	C: 1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60429
AUC-PC,0.39823
accuracy,0.71614
f1 macro,0.43166


[34m[1mwandb[0m: Agent Starting Run: wdchaclx with config:
[34m[1mwandb[0m: 	C: 10
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59739
AUC-PC,0.39006
accuracy,0.70173
f1 macro,0.52275


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


0.0006646967107482943

In [42]:
# Label this sweep so its runs can be told apart on WandB
name = 'lr_ft_silicone'  # CHANGE HERE
sweep_config['name'] = name

# Register the sweep on WandB and keep the returned ID
sweep_id = wandb.sweep(sweep=sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_silicone, #CHANGE HERE
          val=embedded_val_silicone): #CHANGE HERE
    '''
    Generic WandB function to conduct hyperparameter optimization on
    pre-computed embeddings.

    Fits a LogisticRegression on every column of `train` except 'label',
    scores it on `val` and logs the metrics to a new WandB run.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. The hyperparameters actually used
        (C, penalty, solver, random_state) are read back from wandb.config.
    train, val :
        Tables with a 'label' column; all remaining columns are used as
        features (presumably fastText embedding dimensions -- confirm against
        the embedding step). Missing values are replaced by 0. The defaults
        are bound when this function is defined, so re-run the definition
        after changing the datasets.

    Notes
    -----
    The run is grouped under the global `name`, read when the function runs.
    AUC and AUC-PC are only computed when `train` has at most two labels;
    otherwise the placeholder '-' is logged for both.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = LogisticRegression(C=config.C,
                                 penalty=config.penalty,
                                 solver=config.solver,
                                 random_state=config.random_state)
        pipe = Pipeline([('clf', clf)])

        # Build each feature matrix once instead of on every use
        features_train = train.fillna(0).drop(['label'], axis=1)
        features_val = val.fillna(0).drop(['label'], axis=1)
        pipe.fit(features_train, train['label'])

        # Label-based metrics on the validation set
        pred_val = pipe.predict(features_val)
        accuracy = accuracy_score(val['label'], pred_val)
        f1_macro = f1_score(val['label'], pred_val, average='macro')

        if train['label'].nunique() <= 2:
            # Probabilities are only needed for the ranking metrics, so they
            # are computed here rather than for every run.
            # Column 1 is the second class in the classifier's class order
            # (presumably the positive label -- confirm the label encoding).
            pred_prob_val = pipe.predict_proba(features_val)[:, 1]
            aucpc = average_precision_score(val['label'], pred_prob_val)
            auc = roc_auc_score(val['label'], pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'

        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro": f1_macro, "AUC-PC": aucpc, 'AUC': auc})


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    # Always stop the tracker, even if the sweep fails or is interrupted,
    # so it does not keep running in the kernel.
    emissions = tracker.stop()
# Bare last expression: keeps the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: 1vqwlakx
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/1vqwlakx


[34m[1mwandb[0m: Agent Starting Run: hmzagdyz with config:
[34m[1mwandb[0m: 	C: 0.001
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.56793
AUC-PC,0.35435
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: bag0xroo with config:
[34m[1mwandb[0m: 	C: 0.01
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=0.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5765
AUC-PC,0.36069
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: rf79gzb0 with config:
[34m[1mwandb[0m: 	C: 0.1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60359
AUC-PC,0.38088
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: rq4u5s5w with config:
[34m[1mwandb[0m: 	C: 1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.62112
AUC-PC,0.40106
accuracy,0.71326
f1 macro,0.42115


[34m[1mwandb[0m: Agent Starting Run: k64wiijd with config:
[34m[1mwandb[0m: 	C: 10
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.62389
AUC-PC,0.39848
accuracy,0.7147
f1 macro,0.47675


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


0.0007418438058920015