In [1]:
#Connect to wandb
#TODO: work out how to save models on the Weights & Biases platform
from collections.abc import MutableMapping

import wandb
# Authenticate against Weights & Biases, then open the notebook-level run
wandb.login()
run_settings = dict(
    project="hyperopt",
    entity="benchmark-nlp",
    name='polarity datasets rf',  # CHANGE: display name of this run on W&B
)
wandb.init(**run_settings)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import os 
# Move three directory levels up; the relative paths used later (config/, output/, fasttext/)
# are resolved against the resulting working directory (presumably the repository root).
# Caution: a relative chdir is not idempotent - re-running this cell climbs three more levels.
os.chdir(os.path.join(os.pardir, os.pardir, os.pardir))

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
# Silence every warning category for the rest of the session
warnings.filterwarnings(action="ignore")

In [4]:
# Notebook-wide constants
#   SEED     : random_state of the Yelp subsampling and seed given to data_splitter
#   OPT_ITER : number of runs each sweep agent executes (passed as count= to wandb.agent)
SEED, OPT_ITER = 42, 10

## Load data

In [5]:
# Load the 'polarity' dataset group; `data` is indexed by dataset name below
dl = DataLoader(['polarity'])
data = dl.load()

tweet_preprocessor = Preprocessor(is_tweet=True)  # not referenced by the cells of this notebook shown here
preprocessor = Preprocessor()

# Keep a 20% random sample of the Yelp train and test splits
for split_name in ('train', 'test'):
    data['yelp'][split_name] = (
        data['yelp'][split_name]
        .sample(frac=0.2, random_state=SEED)
        .reset_index(drop=True)
    )

# Hyperparameter optimization only needs train/validation data, so the third
# value returned by data_splitter (the test set) is discarded.
# create_val_set=True because no validation set is provided with the datasets.
split_options = dict(create_val_set=True, seed=SEED)
train_imdb, val_imdb, _ = data_splitter(data['imdb'], preprocessor, **split_options)
train_yelp, val_yelp, _ = data_splitter(data['yelp'], preprocessor, **split_options)
train_sst2, val_sst2, _ = data_splitter(data['sst2'], preprocessor, **split_options)

112000 rows preprocessed in 153.40699648857117 seconds
7600 rows preprocessed in 11.952985286712646 seconds


In [6]:
%%time
# Instantiate the FastText wrapper and load the binary model
# (path is relative to the current working directory)
FASTTEXT_MODEL_PATH = 'fasttext/cc.en.300.bin'
fasttext = FastTextEmbeddings()
fasttext.load_model(FASTTEXT_MODEL_PATH)

CPU times: total: 37.4 s
Wall time: 1min 5s




In [7]:
# IMDB: embed the training texts and attach their labels as a 'label' column
embedded_train_imdb = fasttext.generate_sentence_embeddings(train_imdb['text'])
embedded_train_imdb['label'] = train_imdb['label'].tolist()

# IMDB: same treatment for the validation split
embedded_val_imdb = fasttext.generate_sentence_embeddings(val_imdb['text'])
embedded_val_imdb['label'] = val_imdb['label'].tolist()

In [8]:
# Yelp: embed the training texts and attach their labels as a 'label' column
embedded_train_yelp = fasttext.generate_sentence_embeddings(train_yelp['text'])
embedded_train_yelp['label'] = train_yelp['label'].tolist()

# Yelp: same treatment for the validation split
embedded_val_yelp = fasttext.generate_sentence_embeddings(val_yelp['text'])
embedded_val_yelp['label'] = val_yelp['label'].tolist()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████| 448000/448000 [40:43<00:00, 183.35it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████| 112000/112000 [09:56<00:00, 187.82it/s]


In [7]:
# SST-2: embed the training texts and attach their labels as a 'label' column
embedded_train_sst2 = fasttext.generate_sentence_embeddings(train_sst2['text'])
embedded_train_sst2['label'] = train_sst2['label'].tolist()

# SST-2: same treatment for the validation split
embedded_val_sst2 = fasttext.generate_sentence_embeddings(val_sst2['text'])
embedded_val_sst2['label'] = val_sst2['label'].tolist()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 67349/67349 [01:34<00:00, 710.64it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 872/872 [00:01<00:00, 445.54it/s]


## Hyperopt

In [8]:
import yaml
#Load the template yaml sweep config file for the random forest sweeps
#If the value range for a hyperparameter needs to be changed, do it in the .yaml file rather than in this notebook
#The encoding is given explicitly so that parsing does not depend on the platform's default locale encoding
with open("config/rf_sweep.yaml", 'r', encoding="utf-8") as stream:
    sweep_config = yaml.safe_load(stream)

In [9]:
#The config is displayed as a nested dictionary. Check that it is the correct dictionary before launching a sweep
#NOTE(review): in the output recorded for this cell the sweep metric is 'loss' (goal: minimize), whereas the
#training functions of this notebook only log "accuracy", "f1 macro", "AUC-PC" and "AUC" - confirm which
#metric the sweeps are expected to report
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'n_estimators': {'min': 10,
   'max': 200,
   'distribution': 'int_uniform'},
  'max_features': {'values': ['log2', 'sqrt']},
  'random_state': {'value': 42}}}

#### imdb

In [12]:
# Sweep name; train_tfidf also reads it (global `name`) as the W&B run group - change it for every new sweep
name = 'rf_tfidf_imdb'
sweep_config.update(name=name)

# Register the sweep on W&B and keep the id required by the agent
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_imdb, #Change here
          val=val_imdb): #change here
    '''
    Run one W&B sweep trial of a TF-IDF + random forest pipeline.

    Meant to be handed to wandb.agent: a run is opened, the hyperparameters
    (n_estimators, max_features, random_state) are read from wandb.config,
    the pipeline is fitted on `train` and scored on `val`, and the metrics
    are logged to the run.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. When launched by wandb.agent the values are
        set by the sweep controller.
    train, val :
        Tables with a 'text' and a 'label' column (presumably pandas
        DataFrames; `train['label']` must provide .nunique()).

    Returns
    -------
    None. "accuracy" and "f1 macro" are always logged; "AUC-PC" and "AUC"
    are logged only when the training labels hold exactly two classes.
    '''
    # The run group is taken from the global `name` at call time
    with wandb.init(config=config,group=name):
        # If called by wandb.agent this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', RandomForestClassifier(n_estimators=config.n_estimators,
                                           max_features=config.max_features,
                                           random_state=config.random_state)),
        ])
        pipe.fit(train['text'],train['label'])

        #Score the validation set
        pred_val = pipe.predict(val['text'])
        metrics = {
            "accuracy": accuracy_score(val['label'],pred_val),
            "f1 macro": f1_score(val['label'],pred_val,average='macro'),
        }
        # The AUC metrics rely on the probability of the second class, which is
        # only defined for binary targets. For any other number of classes the
        # keys are left out rather than logged as a '-' string, so the metric
        # stays numeric on W&B, and predict_proba[:,1] is never evaluated on a
        # single-column probability matrix.
        if train['label'].nunique() == 2:
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            metrics["AUC-PC"] = average_precision_score(val['label'],pred_prob_val)
            metrics["AUC"] = roc_auc_score(val['label'],pred_prob_val)
        #Log metrics on WandB
        wandb.log(metrics)

#Track the emissions of the whole sweep
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of runs it executes
    wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so it never keeps running in the background
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions

Create sweep with ID: 63v1pz8z
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/63v1pz8z


[34m[1mwandb[0m: Agent Starting Run: mk7qk5n4 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 155
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.9223
AUC-PC,0.91708
accuracy,0.841
f1 macro,0.84093


[34m[1mwandb[0m: Agent Starting Run: 4rnqkw37 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 82
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.92091
AUC-PC,0.91109
accuracy,0.8418
f1 macro,0.84176


[34m[1mwandb[0m: Agent Starting Run: 1z2ky02t with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 181
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.92617
AUC-PC,0.92239
accuracy,0.846
f1 macro,0.84593


[34m[1mwandb[0m: Agent Starting Run: 6rtqldfz with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 66
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.91689
AUC-PC,0.90614
accuracy,0.8402
f1 macro,0.84017


[34m[1mwandb[0m: Agent Starting Run: vxwlzo2f with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 40
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.90274
AUC-PC,0.88858
accuracy,0.8238
f1 macro,0.82367


[34m[1mwandb[0m: Agent Starting Run: ddu5ufyo with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 38
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.8655
AUC-PC,0.84951
accuracy,0.7872
f1 macro,0.78709


[34m[1mwandb[0m: Agent Starting Run: u5f7o65t with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 16
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.79786
AUC-PC,0.77104
accuracy,0.7288
f1 macro,0.72787


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: klnr1vr8 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 134
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.91784
AUC-PC,0.91185
accuracy,0.8354
f1 macro,0.83537


[34m[1mwandb[0m: Agent Starting Run: y48wy5sn with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 47
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.87851
AUC-PC,0.86529
accuracy,0.7972
f1 macro,0.79716


[34m[1mwandb[0m: Agent Starting Run: cbscjgav with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 160
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.92357
AUC-PC,0.91861
accuracy,0.8438
f1 macro,0.84378


0.0016801285692087998

In [13]:
# Sweep name; train_fasttext also reads it (global `name`) as the W&B run group - change it for every new sweep
name = 'rf_ft_imdb'
sweep_config.update(name=name)
# Register the sweep on W&B and keep the id required by the agent
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_imdb, #Change here
          val=embedded_val_imdb): #change here
    '''
    Run one W&B sweep trial of a random forest on sentence embeddings.

    Meant to be handed to wandb.agent: a run is opened, the hyperparameters
    (n_estimators, max_features, random_state) are read from wandb.config,
    the classifier is fitted on `train` and scored on `val`, and the metrics
    are logged to the run.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. When launched by wandb.agent the values are
        set by the sweep controller.
    train, val :
        Tables whose 'label' column is the target and whose remaining columns
        are the features (presumably pandas DataFrames of embeddings).

    Returns
    -------
    None. "accuracy" and "f1 macro" are always logged; "AUC-PC" and "AUC"
    are logged only when the training labels hold exactly two classes.
    '''
    # The run group is taken from the global `name` at call time
    with wandb.init(config=config, group=name):
        # If called by wandb.agent this config is set by the Sweep Controller
        config = wandb.config
        clf = RandomForestClassifier(n_estimators=config.n_estimators,
                                max_features=config.max_features,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        # Build each feature matrix once: missing values set to 0, label column removed
        X_train = train.fillna(0).drop(['label'],axis=1)
        X_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(X_train,train['label'])

        #Score the validation set
        pred_val = pipe.predict(X_val)
        metrics = {
            "accuracy": accuracy_score(val['label'],pred_val),
            "f1 macro": f1_score(val['label'],pred_val,average='macro'),
        }
        # The AUC metrics rely on the probability of the second class, which is
        # only defined for binary targets. For any other number of classes the
        # keys are left out rather than logged as a '-' string, so the metric
        # stays numeric on W&B, and predict_proba[:,1] is never evaluated on a
        # single-column probability matrix.
        if train['label'].nunique() == 2:
            pred_prob_val = pipe.predict_proba(X_val)[:,1]
            metrics["AUC-PC"] = average_precision_score(val['label'],pred_prob_val)
            metrics["AUC"] = roc_auc_score(val['label'],pred_prob_val)
        #Log metrics on WandB
        wandb.log(metrics)


#Track the emissions of the whole sweep
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of runs it executes
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so it never keeps running in the background
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions

Create sweep with ID: v2xdezsx
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/v2xdezsx


[34m[1mwandb[0m: Agent Starting Run: 2a16jf5m with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 165
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86374
AUC-PC,0.84989
accuracy,0.7836
f1 macro,0.78348


[34m[1mwandb[0m: Agent Starting Run: xsrorikr with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 49
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.8513
AUC-PC,0.83443
accuracy,0.7736
f1 macro,0.77347


[34m[1mwandb[0m: Agent Starting Run: anuggtt7 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 125
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86197
AUC-PC,0.84799
accuracy,0.7838
f1 macro,0.78366


[34m[1mwandb[0m: Agent Starting Run: ejwoxcxm with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 119
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86179
AUC-PC,0.84747
accuracy,0.7842
f1 macro,0.78408


[34m[1mwandb[0m: Agent Starting Run: puonn3ft with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 101
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.85472
AUC-PC,0.83684
accuracy,0.773
f1 macro,0.77287


[34m[1mwandb[0m: Agent Starting Run: n01nlkea with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 170
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.85999
AUC-PC,0.84429
accuracy,0.7808
f1 macro,0.78073


[34m[1mwandb[0m: Agent Starting Run: u6ay4k49 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 196
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86444
AUC-PC,0.8508
accuracy,0.7836
f1 macro,0.78351


[34m[1mwandb[0m: Agent Starting Run: 3mltyxg9 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 103
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86058
AUC-PC,0.84556
accuracy,0.7846
f1 macro,0.78447


[34m[1mwandb[0m: Agent Starting Run: d22p2z6z with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 165
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86374
AUC-PC,0.84989
accuracy,0.7836
f1 macro,0.78348


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: xfyg7kxx with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 139
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.85866
AUC-PC,0.84285
accuracy,0.7754
f1 macro,0.77526


0.0015362497353820191

#### yelp

In [None]:
# Sweep name; train_tfidf also reads it (global `name`) as the W&B run group - change it for every new sweep
name = 'rf_tfidf_yelp'
sweep_config.update(name=name)

# Register the sweep on W&B and keep the id required by the agent
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_yelp, #Change here
          val=val_yelp): #change here
    '''
    Run one W&B sweep trial of a TF-IDF + random forest pipeline.

    Meant to be handed to wandb.agent: a run is opened, the hyperparameters
    (n_estimators, max_features, random_state) are read from wandb.config,
    the pipeline is fitted on `train` and scored on `val`, and the metrics
    are logged to the run.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. When launched by wandb.agent the values are
        set by the sweep controller.
    train, val :
        Tables with a 'text' and a 'label' column (presumably pandas
        DataFrames; `train['label']` must provide .nunique()).

    Returns
    -------
    None. "accuracy" and "f1 macro" are always logged; "AUC-PC" and "AUC"
    are logged only when the training labels hold exactly two classes.
    '''
    # The run group is taken from the global `name` at call time
    with wandb.init(config=config,group=name):
        # If called by wandb.agent this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', RandomForestClassifier(n_estimators=config.n_estimators,
                                           max_features=config.max_features,
                                           random_state=config.random_state)),
        ])
        pipe.fit(train['text'],train['label'])

        #Score the validation set
        pred_val = pipe.predict(val['text'])
        metrics = {
            "accuracy": accuracy_score(val['label'],pred_val),
            "f1 macro": f1_score(val['label'],pred_val,average='macro'),
        }
        # The AUC metrics rely on the probability of the second class, which is
        # only defined for binary targets. For any other number of classes the
        # keys are left out rather than logged as a '-' string, so the metric
        # stays numeric on W&B, and predict_proba[:,1] is never evaluated on a
        # single-column probability matrix.
        if train['label'].nunique() == 2:
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            metrics["AUC-PC"] = average_precision_score(val['label'],pred_prob_val)
            metrics["AUC"] = roc_auc_score(val['label'],pred_prob_val)
        #Log metrics on WandB
        wandb.log(metrics)

#NOTE(review): emissions tracking is switched off for this sweep, unlike the other sweeps of this notebook.
#To switch it back on, surround the agent call with the tracker as done in those cells:
# tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
#                            output_file='output/emissions_hyperopt.csv')
# tracker.start()
# tracker.stop()
#Launch the agent for OPT_ITER runs
wandb.agent(
    sweep_id,
    train_tfidf,
    count=OPT_ITER,
)



Create sweep with ID: 49o4urgd
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/49o4urgd


[34m[1mwandb[0m: Agent Starting Run: ee38rpi0 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 46
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.94526
AUC-PC,0.94114
accuracy,0.87116
f1 macro,0.87104


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: p5esb8p9 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 52
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.94708
AUC-PC,0.94351
accuracy,0.8754
f1 macro,0.87529


[34m[1mwandb[0m: Agent Starting Run: kvet0vpp with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 186
[34m[1mwandb[0m: 	random_state: 42


In [None]:
# Sweep name; train_fasttext also reads it (global `name`) as the W&B run group - change it for every new sweep
name = 'rf_ft_yelp'
sweep_config.update(name=name)
# Register the sweep on W&B and keep the id required by the agent
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_yelp, #CHANGE HERE
          val=embedded_val_yelp): #CHANGE HERE
    '''
    Run one W&B sweep trial of a random forest on sentence embeddings.

    Meant to be handed to wandb.agent: a run is opened, the hyperparameters
    (n_estimators, max_features, random_state) are read from wandb.config,
    the classifier is fitted on `train` and scored on `val`, and the metrics
    are logged to the run.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. When launched by wandb.agent the values are
        set by the sweep controller.
    train, val :
        Tables whose 'label' column is the target and whose remaining columns
        are the features (presumably pandas DataFrames of embeddings).

    Returns
    -------
    None. "accuracy" and "f1 macro" are always logged; "AUC-PC" and "AUC"
    are logged only when the training labels hold exactly two classes.
    '''
    # The run group is taken from the global `name` at call time
    with wandb.init(config=config, group=name):
        # If called by wandb.agent this config is set by the Sweep Controller
        config = wandb.config
        clf = RandomForestClassifier(n_estimators=config.n_estimators,
                                max_features=config.max_features,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        # Build each feature matrix once: missing values set to 0, label column removed
        X_train = train.fillna(0).drop(['label'],axis=1)
        X_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(X_train,train['label'])

        #Score the validation set
        pred_val = pipe.predict(X_val)
        metrics = {
            "accuracy": accuracy_score(val['label'],pred_val),
            "f1 macro": f1_score(val['label'],pred_val,average='macro'),
        }
        # The AUC metrics rely on the probability of the second class, which is
        # only defined for binary targets. For any other number of classes the
        # keys are left out rather than logged as a '-' string, so the metric
        # stays numeric on W&B, and predict_proba[:,1] is never evaluated on a
        # single-column probability matrix.
        if train['label'].nunique() == 2:
            pred_prob_val = pipe.predict_proba(X_val)[:,1]
            metrics["AUC-PC"] = average_precision_score(val['label'],pred_prob_val)
            metrics["AUC"] = roc_auc_score(val['label'],pred_prob_val)
        #Log metrics on WandB
        wandb.log(metrics)


#Track the emissions of the whole sweep
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of runs it executes
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so it never keeps running in the background
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions

#### sst2

In [10]:
# Sweep name; train_tfidf also reads it (global `name`) as the W&B run group - change it for every new sweep
name = 'rf_tfidf_sst2'
sweep_config.update(name=name)

# Register the sweep on W&B and keep the id required by the agent
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_sst2, #Change here
          val=val_sst2): #change here
    '''
    Run one W&B sweep trial of a TF-IDF + random forest pipeline.

    Meant to be handed to wandb.agent: a run is opened, the hyperparameters
    (n_estimators, max_features, random_state) are read from wandb.config,
    the pipeline is fitted on `train` and scored on `val`, and the metrics
    are logged to the run.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. When launched by wandb.agent the values are
        set by the sweep controller.
    train, val :
        Tables with a 'text' and a 'label' column (presumably pandas
        DataFrames; `train['label']` must provide .nunique()).

    Returns
    -------
    None. "accuracy" and "f1 macro" are always logged; "AUC-PC" and "AUC"
    are logged only when the training labels hold exactly two classes.
    '''
    # The run group is taken from the global `name` at call time
    with wandb.init(config=config,group=name):
        # If called by wandb.agent this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', RandomForestClassifier(n_estimators=config.n_estimators,
                                           max_features=config.max_features,
                                           random_state=config.random_state)),
        ])
        pipe.fit(train['text'],train['label'])

        #Score the validation set
        pred_val = pipe.predict(val['text'])
        metrics = {
            "accuracy": accuracy_score(val['label'],pred_val),
            "f1 macro": f1_score(val['label'],pred_val,average='macro'),
        }
        # The AUC metrics rely on the probability of the second class, which is
        # only defined for binary targets. For any other number of classes the
        # keys are left out rather than logged as a '-' string, so the metric
        # stays numeric on W&B, and predict_proba[:,1] is never evaluated on a
        # single-column probability matrix.
        if train['label'].nunique() == 2:
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            metrics["AUC-PC"] = average_precision_score(val['label'],pred_prob_val)
            metrics["AUC"] = roc_auc_score(val['label'],pred_prob_val)
        #Log metrics on WandB
        wandb.log(metrics)

#Track the emissions of the whole sweep
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of runs it executes
    wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so it never keeps running in the background
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions

Create sweep with ID: infpa5r9
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/infpa5r9


[34m[1mwandb[0m: Agent Starting Run: y81etzum with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 194
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.84485
AUC-PC,0.8469
accuracy,0.77523
f1 macro,0.77503


[34m[1mwandb[0m: Agent Starting Run: qd6yywum with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 24
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.84861
AUC-PC,0.84
accuracy,0.77752
f1 macro,0.77705


[34m[1mwandb[0m: Agent Starting Run: qx77sbyy with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 18
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.83062
AUC-PC,0.82911
accuracy,0.7695
f1 macro,0.76933


[34m[1mwandb[0m: Agent Starting Run: u0ydpmym with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 70
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.85166
AUC-PC,0.85411
accuracy,0.77867
f1 macro,0.77852


[34m[1mwandb[0m: Agent Starting Run: av9hedf7 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 113
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86867
AUC-PC,0.86326
accuracy,0.79014
f1 macro,0.78947


[34m[1mwandb[0m: Agent Starting Run: hyj1l51a with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 127
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86995
AUC-PC,0.86494
accuracy,0.78899
f1 macro,0.78824


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: h5njt0qe with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 118
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86956
AUC-PC,0.86518
accuracy,0.78899
f1 macro,0.78824


[34m[1mwandb[0m: Agent Starting Run: ok1dgmhq with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 69
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.86469
AUC-PC,0.85755
accuracy,0.78555
f1 macro,0.78493


[34m[1mwandb[0m: Agent Starting Run: 93n7l37l with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 76
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.85212
AUC-PC,0.85627
accuracy,0.78096
f1 macro,0.78086


[34m[1mwandb[0m: Agent Starting Run: qfjihw7e with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 14
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.83073
AUC-PC,0.81335
accuracy,0.76261
f1 macro,0.76252


0.0049305007679321475

In [11]:
# Sweep name; train_fasttext also reads it (global `name`) as the W&B run group - change it for every new sweep
name = 'rf_ft_sst2'
sweep_config.update(name=name)
# Register the sweep on W&B and keep the id required by the agent
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_sst2, #CHANGE HERE
          val=embedded_val_sst2): #CHANGE HERE
    '''
    Run one W&B sweep trial of a random forest on sentence embeddings.

    Meant to be handed to wandb.agent: a run is opened, the hyperparameters
    (n_estimators, max_features, random_state) are read from wandb.config,
    the classifier is fitted on `train` and scored on `val`, and the metrics
    are logged to the run.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. When launched by wandb.agent the values are
        set by the sweep controller.
    train, val :
        Tables whose 'label' column is the target and whose remaining columns
        are the features (presumably pandas DataFrames of embeddings).

    Returns
    -------
    None. "accuracy" and "f1 macro" are always logged; "AUC-PC" and "AUC"
    are logged only when the training labels hold exactly two classes.
    '''
    # The run group is taken from the global `name` at call time
    with wandb.init(config=config, group=name):
        # If called by wandb.agent this config is set by the Sweep Controller
        config = wandb.config
        clf = RandomForestClassifier(n_estimators=config.n_estimators,
                                max_features=config.max_features,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        # Build each feature matrix once: missing values set to 0, label column removed
        X_train = train.fillna(0).drop(['label'],axis=1)
        X_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(X_train,train['label'])

        #Score the validation set
        pred_val = pipe.predict(X_val)
        metrics = {
            "accuracy": accuracy_score(val['label'],pred_val),
            "f1 macro": f1_score(val['label'],pred_val,average='macro'),
        }
        # The AUC metrics rely on the probability of the second class, which is
        # only defined for binary targets. For any other number of classes the
        # keys are left out rather than logged as a '-' string, so the metric
        # stays numeric on W&B, and predict_proba[:,1] is never evaluated on a
        # single-column probability matrix.
        if train['label'].nunique() == 2:
            pred_prob_val = pipe.predict_proba(X_val)[:,1]
            metrics["AUC-PC"] = average_precision_score(val['label'],pred_prob_val)
            metrics["AUC"] = roc_auc_score(val['label'],pred_prob_val)
        #Log metrics on WandB
        wandb.log(metrics)


#Track the emissions of the whole sweep
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
tracker.start()
try:
    #Launch the agent; count is the number of runs it executes
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    #Stop the tracker even when the sweep fails or is interrupted, so it never keeps running in the background
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions

Create sweep with ID: 8obdrf43
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/8obdrf43


[34m[1mwandb[0m: Agent Starting Run: 248sqgth with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 172
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.84229
AUC-PC,0.86053
accuracy,0.74885
f1 macro,0.74633


[34m[1mwandb[0m: Agent Starting Run: od8b77am with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 130
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.84021
AUC-PC,0.85822
accuracy,0.74427
f1 macro,0.74145


[34m[1mwandb[0m: Agent Starting Run: lu8lhkcy with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 15
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.79061
AUC-PC,0.79273
accuracy,0.70069
f1 macro,0.69724


[34m[1mwandb[0m: Agent Starting Run: tfu23y68 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 180
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.843
AUC-PC,0.86081
accuracy,0.74656
f1 macro,0.74377


[34m[1mwandb[0m: Agent Starting Run: c7ciqknu with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 90
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.83943
AUC-PC,0.855
accuracy,0.74427
f1 macro,0.74157


[34m[1mwandb[0m: Agent Starting Run: hd9yhkad with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 149
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.84209
AUC-PC,0.8597
accuracy,0.74427
f1 macro,0.74157


[34m[1mwandb[0m: Agent Starting Run: 61hg8qrz with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 149
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.83636
AUC-PC,0.8466
accuracy,0.75229
f1 macro,0.7505


[34m[1mwandb[0m: Agent Starting Run: isvx57bo with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 135
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.84135
AUC-PC,0.85971
accuracy,0.75
f1 macro,0.74719


[34m[1mwandb[0m: Agent Starting Run: 1lylnkyh with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 88
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.83603
AUC-PC,0.84581
accuracy,0.75
f1 macro,0.74809


[34m[1mwandb[0m: Agent Starting Run: bavg5x0l with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 195
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.83957
AUC-PC,0.84999
accuracy,0.75344
f1 macro,0.75179


0.004083005284044849