In [1]:
#Connect to wandb
#TO DO : how to save models on the weight and bias platform
import wandb
wandb.login()
wandb.init(project="xgb", 
           entity="benchmark-nlp",
           name='topic datasets') #CHANGE

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import os
os.chdir('../../..')

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [4]:
#Set constant values
SEED=42 
OPT_ITER=20

## Load data

In [5]:
dl = DataLoader(['topic'])
data = dl.load()


tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

#We are not interested in the test sets for hyperparameter optimization
train_twentynews, val_twentynews, _ = data_splitter(data['twentynews'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_agnews, val_agnews, _ = data_splitter(data['agnews'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_yahoo, val_yahoo, _ = data_splitter(data['yahoo'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)

120000 rows preprocessed in 66.11223077774048 seconds
7600 rows preprocessed in 3.4647884368896484 seconds


In [6]:
%%time
#fasttext 
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

CPU times: total: 30.2 s
Wall time: 48.2 s






In [7]:
embedded_train_twentynews = fasttext.generate_sentence_embeddings(train_twentynews['text'])
embedded_val_twentynews = fasttext.generate_sentence_embeddings(val_twentynews['text'])
embedded_train_twentynews['label'] = train_twentynews['label'].to_list()
embedded_val_twentynews['label'] = val_twentynews['label'].to_list()

Starting to generate sentence embeddings


100%|██████████████████████████████████████████████████████████████████████████████| 9051/9051 [01:59<00:00, 75.62it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 2263/2263 [00:17<00:00, 127.40it/s]


In [7]:
embedded_train_agnews = fasttext.generate_sentence_embeddings(train_agnews['text'])
embedded_val_agnews = fasttext.generate_sentence_embeddings(val_agnews['text'])
embedded_train_agnews['label'] = train_agnews['label'].to_list()
embedded_val_agnews['label'] = val_agnews['label'].to_list()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 96000/96000 [04:35<00:00, 348.01it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 24000/24000 [01:14<00:00, 320.24it/s]


In [8]:
embedded_train_yahoo = fasttext.generate_sentence_embeddings(train_yahoo['text'])
embedded_val_yahoo = fasttext.generate_sentence_embeddings(val_yahoo['text'])
embedded_train_yahoo['label'] = train_yahoo['label'].to_list()
embedded_val_yahoo['label'] = val_yahoo['label'].to_list()

starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████| 1120000/1120000 [1:10:51<00:00, 263.45it/s]


starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████| 280000/280000 [19:35<00:00, 238.10it/s]


## Hyperopt

In [8]:
import yaml
#Load the template yaml sweep config file for logistic regression
#If the value range for an hyperparameter needs to be changed, better to do it in the .yaml file than in a notebook
with open("config/xgb_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [9]:
#The config is displayed as a nested dictionary
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'gamma': {'min': 0, 'max': 1, 'distribution': 'uniform'},
  'n_estimators': {'min': 10, 'max': 150, 'distribution': 'int_uniform'},
  'learning_rate': {'min': 0.001, 'max': 0.1, 'distribution': 'uniform'},
  'max_depth': {'min': 0, 'max': 10, 'distribution': 'int_uniform'},
  'random_state': {'value': 42}}}

#### twenty news   Tf-Idf

In [10]:
#Don't forget to name the sweep instance  
name = 'xgb_tfidf_twentynews' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_tfidf(config = None,
          train=train_twentynews, #Change here
          val=val_twentynews): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = XGBClassifier(gamma=config.gamma,
                                 n_estimators=config.n_estimators,
                                 learning_rate=config.learning_rate,
                                 max_depth=config.max_depth,
                                 random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER) #Count : number of iterations
tracker.stop()

Create sweep with ID: 4fbrlrzg
Sweep URL: https://wandb.ai/benchmark-nlp/xgb/sweeps/4fbrlrzg


[34m[1mwandb[0m: Agent Starting Run: k4dgcf3w with config:
[34m[1mwandb[0m: 	gamma: 0.03752195732255115
[34m[1mwandb[0m: 	learning_rate: 0.054755228213609314
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	n_estimators: 93
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.63544
f1 macro,0.63184


[34m[1mwandb[0m: Agent Starting Run: f9nacfrv with config:
[34m[1mwandb[0m: 	gamma: 0.1009587978946167
[34m[1mwandb[0m: 	learning_rate: 0.07181920317070158
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 77
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.62483
f1 macro,0.62238


[34m[1mwandb[0m: Agent Starting Run: clku5eas with config:
[34m[1mwandb[0m: 	gamma: 0.5873760886693551
[34m[1mwandb[0m: 	learning_rate: 0.07956611912253285
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 26
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.59832
f1 macro,0.60218


[34m[1mwandb[0m: Agent Starting Run: 7zdm7pav with config:
[34m[1mwandb[0m: 	gamma: 0.2582487795573053
[34m[1mwandb[0m: 	learning_rate: 0.05188828756738027
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 51
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.4989
f1 macro,0.52904


[34m[1mwandb[0m: Agent Starting Run: y87eah3v with config:
[34m[1mwandb[0m: 	gamma: 0.5786453612675135
[34m[1mwandb[0m: 	learning_rate: 0.021766232496961344
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 127
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.58772
f1 macro,0.59916


[34m[1mwandb[0m: Agent Starting Run: lbkq85jb with config:
[34m[1mwandb[0m: 	gamma: 0.6703376151741676
[34m[1mwandb[0m: 	learning_rate: 0.07299364468315286
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 102
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.62174
f1 macro,0.62234


[34m[1mwandb[0m: Agent Starting Run: 1eopezrl with config:
[34m[1mwandb[0m: 	gamma: 0.21815475203467105
[34m[1mwandb[0m: 	learning_rate: 0.0950134673992325
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 88
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.57623
f1 macro,0.59191


[34m[1mwandb[0m: Agent Starting Run: u5q0khyf with config:
[34m[1mwandb[0m: 	gamma: 0.20246721634835563
[34m[1mwandb[0m: 	learning_rate: 0.03807599707558131
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 135
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.62749
f1 macro,0.62456


[34m[1mwandb[0m: Agent Starting Run: 4r6qa1a6 with config:
[34m[1mwandb[0m: 	gamma: 0.12807670684833183
[34m[1mwandb[0m: 	learning_rate: 0.09295295120584476
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 26
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.58285
f1 macro,0.59437


[34m[1mwandb[0m: Agent Starting Run: m858vdz3 with config:
[34m[1mwandb[0m: 	gamma: 0.08767575158127094
[34m[1mwandb[0m: 	learning_rate: 0.07426731584035426
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 122
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.61821
f1 macro,0.62257


[34m[1mwandb[0m: Agent Starting Run: unu0tgst with config:
[34m[1mwandb[0m: 	gamma: 0.45260804983283354
[34m[1mwandb[0m: 	learning_rate: 0.04535154450584765
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 44
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.58727
f1 macro,0.59412


[34m[1mwandb[0m: Agent Starting Run: 4hzq3sgp with config:
[34m[1mwandb[0m: 	gamma: 0.3472962961363365
[34m[1mwandb[0m: 	learning_rate: 0.023220566700419772
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 138
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.61246
f1 macro,0.61306


[34m[1mwandb[0m: Agent Starting Run: l5x3tkgr with config:
[34m[1mwandb[0m: 	gamma: 0.21180847008166925
[34m[1mwandb[0m: 	learning_rate: 0.06356437381252693
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 68
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.62439
f1 macro,0.62202


[34m[1mwandb[0m: Agent Starting Run: qbblfqr2 with config:
[34m[1mwandb[0m: 	gamma: 0.00983438970230377
[34m[1mwandb[0m: 	learning_rate: 0.05632648645818482
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 36
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.58506
f1 macro,0.59456


[34m[1mwandb[0m: Agent Starting Run: 1d6eej65 with config:
[34m[1mwandb[0m: 	gamma: 0.65564405311308
[34m[1mwandb[0m: 	learning_rate: 0.08690721019397968
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 11
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.54441
f1 macro,0.5654


[34m[1mwandb[0m: Agent Starting Run: bq4rrf2x with config:
[34m[1mwandb[0m: 	gamma: 0.8390885013857353
[34m[1mwandb[0m: 	learning_rate: 0.0363261127606314
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 116
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.61423
f1 macro,0.61353


[34m[1mwandb[0m: Agent Starting Run: 6u0hbsbq with config:
[34m[1mwandb[0m: 	gamma: 0.4452685250165819
[34m[1mwandb[0m: 	learning_rate: 0.05332238516175772
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 59
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.55413
f1 macro,0.573


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: r5uv60a7 with config:
[34m[1mwandb[0m: 	gamma: 0.06291333891130735
[34m[1mwandb[0m: 	learning_rate: 0.05459297622219
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 113
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.63632
f1 macro,0.63234


[34m[1mwandb[0m: Agent Starting Run: cyk1ored with config:
[34m[1mwandb[0m: 	gamma: 0.3312356548350702
[34m[1mwandb[0m: 	learning_rate: 0.04700386955694736
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 122
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.04507
f1 macro,0.00431


[34m[1mwandb[0m: Agent Starting Run: n5nh11lp with config:
[34m[1mwandb[0m: 	gamma: 0.8104623257014634
[34m[1mwandb[0m: 	learning_rate: 0.009157205322550096
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 87
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.5232
f1 macro,0.55099


0.01084978951487711

In [11]:
#Don't forget to name the sweep instance   
name = 'xgb_ft_twentynews' #change here
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_fasttext(config = None,
          train=embedded_train_twentynews, #Change here
          val=embedded_val_twentynews): #change here
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        clf = XGBClassifier(gamma=config.gamma,
                                 n_estimators=config.n_estimators,
                                 learning_rate=config.learning_rate,
                                 max_depth=config.max_depth,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()



Create sweep with ID: dhu4wzbf
Sweep URL: https://wandb.ai/benchmark-nlp/xgb/sweeps/dhu4wzbf


[34m[1mwandb[0m: Agent Starting Run: 4348ggqo with config:
[34m[1mwandb[0m: 	gamma: 0.2656303509439727
[34m[1mwandb[0m: 	learning_rate: 0.0018291344176754349
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	n_estimators: 45
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43129
f1 macro,0.42427


[34m[1mwandb[0m: Agent Starting Run: helb9qs5 with config:
[34m[1mwandb[0m: 	gamma: 0.576933686154143
[34m[1mwandb[0m: 	learning_rate: 0.0265968217591904
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 14
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.48917
f1 macro,0.4783


[34m[1mwandb[0m: Agent Starting Run: 2mgwpbf9 with config:
[34m[1mwandb[0m: 	gamma: 0.7427754533250384
[34m[1mwandb[0m: 	learning_rate: 0.08163783859612563
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 71
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.04507
f1 macro,0.00431


[34m[1mwandb[0m: Agent Starting Run: glvh8gtj with config:
[34m[1mwandb[0m: 	gamma: 0.4293873525308529
[34m[1mwandb[0m: 	learning_rate: 0.048343362478680894
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 81
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.58683
f1 macro,0.57663


[34m[1mwandb[0m: Agent Starting Run: 4eu2fncz with config:
[34m[1mwandb[0m: 	gamma: 0.9185914410067226
[34m[1mwandb[0m: 	learning_rate: 0.024452388435396873
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 13
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.36854
f1 macro,0.34853


[34m[1mwandb[0m: Agent Starting Run: puchf6ga with config:
[34m[1mwandb[0m: 	gamma: 0.6239519308880789
[34m[1mwandb[0m: 	learning_rate: 0.015299350935067416
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 28
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.29165
f1 macro,0.2655


[34m[1mwandb[0m: Agent Starting Run: sm0q8662 with config:
[34m[1mwandb[0m: 	gamma: 0.6500074720772155
[34m[1mwandb[0m: 	learning_rate: 0.0364597846732026
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 98
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43394
f1 macro,0.4072


[34m[1mwandb[0m: Agent Starting Run: 43t6gcwq with config:
[34m[1mwandb[0m: 	gamma: 0.7021365585809042
[34m[1mwandb[0m: 	learning_rate: 0.007144112754452061
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 38
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.48343
f1 macro,0.47336


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: nj0w85p8 with config:
[34m[1mwandb[0m: 	gamma: 0.5336555363528805
[34m[1mwandb[0m: 	learning_rate: 0.08539858602459152
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 22
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.55281
f1 macro,0.54386


[34m[1mwandb[0m: Agent Starting Run: a4kkgu6e with config:
[34m[1mwandb[0m: 	gamma: 0.08724323313643845
[34m[1mwandb[0m: 	learning_rate: 0.021789325155024495
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 118
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.4167
f1 macro,0.38843


[34m[1mwandb[0m: Agent Starting Run: jnkzmnh0 with config:
[34m[1mwandb[0m: 	gamma: 0.22853110160310489
[34m[1mwandb[0m: 	learning_rate: 0.0239818926216122
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 60
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.53557
f1 macro,0.52258


[34m[1mwandb[0m: Agent Starting Run: dm7nu58s with config:
[34m[1mwandb[0m: 	gamma: 0.33276228589374035
[34m[1mwandb[0m: 	learning_rate: 0.012146040915416038
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	n_estimators: 60
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.52143
f1 macro,0.51139


[34m[1mwandb[0m: Agent Starting Run: 2v9mcymu with config:
[34m[1mwandb[0m: 	gamma: 0.19478286339283524
[34m[1mwandb[0m: 	learning_rate: 0.0984985136408804
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 28
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.56209
f1 macro,0.55237


[34m[1mwandb[0m: Agent Starting Run: fqqd3zem with config:
[34m[1mwandb[0m: 	gamma: 0.5579020913592622
[34m[1mwandb[0m: 	learning_rate: 0.008843209138817046
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 149
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.35616
f1 macro,0.32681


[34m[1mwandb[0m: Agent Starting Run: gw4krdvf with config:
[34m[1mwandb[0m: 	gamma: 0.6484552660742816
[34m[1mwandb[0m: 	learning_rate: 0.06910601533687517
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 22
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.04507
f1 macro,0.00431


[34m[1mwandb[0m: Agent Starting Run: b193e7re with config:
[34m[1mwandb[0m: 	gamma: 0.36570276899522114
[34m[1mwandb[0m: 	learning_rate: 0.06991222516895339
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 139
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.5148
f1 macro,0.49358


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7zy8nfk4 with config:
[34m[1mwandb[0m: 	gamma: 0.6993427019691648
[34m[1mwandb[0m: 	learning_rate: 0.061940018050278864
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 120
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.60981
f1 macro,0.59866


[34m[1mwandb[0m: Agent Starting Run: whd1xyio with config:
[34m[1mwandb[0m: 	gamma: 0.36635502836232814
[34m[1mwandb[0m: 	learning_rate: 0.03796675994726074
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 144
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.57446
f1 macro,0.55815


[34m[1mwandb[0m: Agent Starting Run: enkv12f6 with config:
[34m[1mwandb[0m: 	gamma: 0.4065655892429949
[34m[1mwandb[0m: 	learning_rate: 0.02038825587498175
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 25
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44764
f1 macro,0.4297


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 3xq75g7o with config:
[34m[1mwandb[0m: 	gamma: 0.2011395388105972
[34m[1mwandb[0m: 	learning_rate: 0.010623386023852993
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 56
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.49006
f1 macro,0.47531


0.016024379952925192

In [12]:
del train_twentynews
del val_twentynews
del embedded_train_twentynews
del embedded_val_twentynews

#### agnews tfidf

In [8]:
#Don't forget to name the sweep instance  
name = 'xgb_tfidf_agnews' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_tfidf(config = None,
          train=train_agnews, #Change here
          val=val_agnews): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = XGBClassifier(gamma=config.gamma,
                                 n_estimators=config.n_estimators,
                                 learning_rate=config.learning_rate,
                                 max_depth=config.max_depth,
                                 random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
tracker.stop()



Create sweep with ID: gkpw9mj1
Sweep URL: https://wandb.ai/benchmark-nlp/xgb/sweeps/gkpw9mj1


[34m[1mwandb[0m: Agent Starting Run: 8ynsxcw1 with config:
[34m[1mwandb[0m: 	gamma: 0.5023710420193381
[34m[1mwandb[0m: 	learning_rate: 0.01223540129501244
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 92
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.72671
f1 macro,0.73546


[34m[1mwandb[0m: Agent Starting Run: dfmuiyaq with config:
[34m[1mwandb[0m: 	gamma: 0.6309087925819006
[34m[1mwandb[0m: 	learning_rate: 0.09972042883246456
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 21
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24629
f1 macro,0.09881


[34m[1mwandb[0m: Agent Starting Run: 68pc3rpi with config:
[34m[1mwandb[0m: 	gamma: 0.9385380469042464
[34m[1mwandb[0m: 	learning_rate: 0.09807680973793714
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 111
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24629
f1 macro,0.09881


[34m[1mwandb[0m: Agent Starting Run: caf3pju6 with config:
[34m[1mwandb[0m: 	gamma: 0.1858443772857248
[34m[1mwandb[0m: 	learning_rate: 0.0980341427093368
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 119
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.728
f1 macro,0.73407


[34m[1mwandb[0m: Agent Starting Run: nywb28af with config:
[34m[1mwandb[0m: 	gamma: 0.11682629174261304
[34m[1mwandb[0m: 	learning_rate: 0.08735350894887137
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 138
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87729
f1 macro,0.87703


[34m[1mwandb[0m: Agent Starting Run: hbsqal1i with config:
[34m[1mwandb[0m: 	gamma: 0.34124112233222703
[34m[1mwandb[0m: 	learning_rate: 0.08268553900425706
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 134
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.85262
f1 macro,0.85257


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: cceigwto with config:
[34m[1mwandb[0m: 	gamma: 0.6626108437382487
[34m[1mwandb[0m: 	learning_rate: 0.03005748802031511
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 41
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.6975
f1 macro,0.70903


[34m[1mwandb[0m: Agent Starting Run: h9h2wwfe with config:
[34m[1mwandb[0m: 	gamma: 0.8337268379583772
[34m[1mwandb[0m: 	learning_rate: 0.0733388300635269
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 48
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.81767
f1 macro,0.81889


[34m[1mwandb[0m: Agent Starting Run: ni01lsv0 with config:
[34m[1mwandb[0m: 	gamma: 0.6198485784934435
[34m[1mwandb[0m: 	learning_rate: 0.032646000744880076
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 21
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.67167
f1 macro,0.6853


[34m[1mwandb[0m: Agent Starting Run: ghlbg3m3 with config:
[34m[1mwandb[0m: 	gamma: 0.64752727298498
[34m[1mwandb[0m: 	learning_rate: 0.07819778419830907
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 145
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.84542
f1 macro,0.84545


[34m[1mwandb[0m: Agent Starting Run: vbdgjphn with config:
[34m[1mwandb[0m: 	gamma: 0.8118149779367887
[34m[1mwandb[0m: 	learning_rate: 0.05644265581662261
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 146
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.86287
f1 macro,0.86276


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: by292zkm with config:
[34m[1mwandb[0m: 	gamma: 0.4961575437702028
[34m[1mwandb[0m: 	learning_rate: 0.059838636124657354
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 28
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.59488
f1 macro,0.60797


[34m[1mwandb[0m: Agent Starting Run: ckvj47z1 with config:
[34m[1mwandb[0m: 	gamma: 0.0942581793447448
[34m[1mwandb[0m: 	learning_rate: 0.05955929847967543
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 66
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.82308
f1 macro,0.82414


[34m[1mwandb[0m: Agent Starting Run: xzpb96mb with config:
[34m[1mwandb[0m: 	gamma: 0.9580315330558484
[34m[1mwandb[0m: 	learning_rate: 0.05334227545696052
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 114
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.68571
f1 macro,0.69562


[34m[1mwandb[0m: Agent Starting Run: g0dt5if2 with config:
[34m[1mwandb[0m: 	gamma: 0.2733806699045307
[34m[1mwandb[0m: 	learning_rate: 0.052587737488062586
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 118
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24629
f1 macro,0.09881


[34m[1mwandb[0m: Agent Starting Run: iu6rdv08 with config:
[34m[1mwandb[0m: 	gamma: 0.7070107968609943
[34m[1mwandb[0m: 	learning_rate: 0.05111085253168371
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 68
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.63075
f1 macro,0.64455


[34m[1mwandb[0m: Agent Starting Run: wt5zwryz with config:
[34m[1mwandb[0m: 	gamma: 0.3192685273212982
[34m[1mwandb[0m: 	learning_rate: 0.05731013061815765
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 87
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.81663
f1 macro,0.81781


[34m[1mwandb[0m: Agent Starting Run: s7kkzqua with config:
[34m[1mwandb[0m: 	gamma: 0.7422939699893782
[34m[1mwandb[0m: 	learning_rate: 0.0898243017009522
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 58
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.81204
f1 macro,0.81317


[34m[1mwandb[0m: Agent Starting Run: 41h6l5yj with config:
[34m[1mwandb[0m: 	gamma: 0.13302856345369685
[34m[1mwandb[0m: 	learning_rate: 0.05247245014944386
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 122
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.68817
f1 macro,0.69822


[34m[1mwandb[0m: Agent Starting Run: z1uc9akl with config:
[34m[1mwandb[0m: 	gamma: 0.791633212142687
[34m[1mwandb[0m: 	learning_rate: 0.026603168950532944
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 108
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24629
f1 macro,0.09881


0.009679389894775407

In [None]:
#Don't forget to name the sweep instance   
name = 'xgb_ft_agnews' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_fasttext(config = None,
          train=embedded_train_agnews, #CHANGE HERE
          val=embedded_val_agnews): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = XGBClassifier(gamma=config.gamma,
                                 n_estimators=config.n_estimators,
                                 learning_rate=config.learning_rate,
                                 max_depth=config.max_depth,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()  #13 iterations remaining



Create sweep with ID: rzu2pnun
Sweep URL: https://wandb.ai/benchmark-nlp/xgb/sweeps/rzu2pnun


[34m[1mwandb[0m: Agent Starting Run: xtruiii0 with config:
[34m[1mwandb[0m: 	gamma: 0.746700151413228
[34m[1mwandb[0m: 	learning_rate: 0.045952256010118264
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 77
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87242
f1 macro,0.87187


[34m[1mwandb[0m: Agent Starting Run: yn2ey9vy with config:
[34m[1mwandb[0m: 	gamma: 0.70142947397209
[34m[1mwandb[0m: 	learning_rate: 0.07035052028549085
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 30
[34m[1mwandb[0m: 	random_state: 42


In [None]:
del train_agnews
del val_agnews
del embedded_train_agnews
del embedded_val_agnews

#### yahoo

In [15]:
#Don't forget to name the sweep instance  
name = 'xgb_tfidf_yahoo' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_tfidf(config = None,
          train=train_yahoo, #Change here
          val=val_yahoo): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = XGBClassifier(gamma=config.gamma,
                                 n_estimators=config.n_estimators,
                                 learning_rate=config.learning_rate,
                                 max_depth=config.max_depth,
                                 random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
tracker.stop()



Create sweep with ID: x86sj11j
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/x86sj11j


[34m[1mwandb[0m: Agent Starting Run: af0cnjkn with config:
[34m[1mwandb[0m: 	C: 5.905437512178784
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44918
f1 macro,0.50039


[34m[1mwandb[0m: Agent Starting Run: h7uylwu8 with config:
[34m[1mwandb[0m: 	C: 7.619241650157256
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.4472
f1 macro,0.4983


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: atjbi3h5 with config:
[34m[1mwandb[0m: 	C: 3.601994999279711
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44867
f1 macro,0.50021


[34m[1mwandb[0m: Agent Starting Run: ods491nt with config:
[34m[1mwandb[0m: 	C: 9.729832956448467
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44368
f1 macro,0.49548


[34m[1mwandb[0m: Agent Starting Run: gvpns7wj with config:
[34m[1mwandb[0m: 	C: 3.0689457184945255
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44908
f1 macro,0.50044


[34m[1mwandb[0m: Agent Starting Run: muc8lvn9 with config:
[34m[1mwandb[0m: 	C: 3.753341077904725
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44362
f1 macro,0.49523


[34m[1mwandb[0m: Agent Starting Run: v3wi4780 with config:
[34m[1mwandb[0m: 	C: 6.451182502004491
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44826
f1 macro,0.49979


[34m[1mwandb[0m: Agent Starting Run: mpt14ayf with config:
[34m[1mwandb[0m: 	C: 6.274196182767826
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44861
f1 macro,0.50026


[34m[1mwandb[0m: Agent Starting Run: cwdgf5d1 with config:
[34m[1mwandb[0m: 	C: 7.717716747047102
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44783
f1 macro,0.49873


[34m[1mwandb[0m: Agent Starting Run: lqe8qb21 with config:
[34m[1mwandb[0m: 	C: 8.830693397109062
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44843
f1 macro,0.49955


0.01648682815770653

In [16]:
#Don't forget to name the sweep instance   
name = 'xgb_ft_yahoo' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_yahoo, #CHANGE HERE
          val=embedded_val_yahoo): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = XGBClassifier(gamma=config.gamma,
                                 n_estimators=config.n_estimators,
                                 learning_rate=config.learning_rate,
                                 max_depth=config.max_depth,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
tracker.stop()



Create sweep with ID: 8q83yjoe
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/8q83yjoe


[34m[1mwandb[0m: Agent Starting Run: lzriiexe with config:
[34m[1mwandb[0m: 	C: 3.426147217376755
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42979
f1 macro,0.47649


[34m[1mwandb[0m: Agent Starting Run: r2tymvzh with config:
[34m[1mwandb[0m: 	C: 5.357984774600081
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42971
f1 macro,0.47631


[34m[1mwandb[0m: Agent Starting Run: e5yawj06 with config:
[34m[1mwandb[0m: 	C: 0.8858751580342827
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43111
f1 macro,0.47768


[34m[1mwandb[0m: Agent Starting Run: ouaz4xtr with config:
[34m[1mwandb[0m: 	C: 9.689045815348598
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42945
f1 macro,0.47553


[34m[1mwandb[0m: Agent Starting Run: eu6pbdck with config:
[34m[1mwandb[0m: 	C: 1.857309303019292
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43075
f1 macro,0.47707


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2ed585x7 with config:
[34m[1mwandb[0m: 	C: 1.6616166050862513
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42847
f1 macro,0.46134


[34m[1mwandb[0m: Agent Starting Run: 9pp58a7v with config:
[34m[1mwandb[0m: 	C: 4.295195507836894
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42986
f1 macro,0.47629


[34m[1mwandb[0m: Agent Starting Run: stnw89x5 with config:
[34m[1mwandb[0m: 	C: 7.83531786647795
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.4298
f1 macro,0.47585


[34m[1mwandb[0m: Agent Starting Run: pjlw5f0k with config:
[34m[1mwandb[0m: 	C: 2.6710627849389743
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43143
f1 macro,0.47794


[34m[1mwandb[0m: Agent Starting Run: 7lwpcbgk with config:
[34m[1mwandb[0m: 	C: 8.537706451301336
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43017
f1 macro,0.47666


0.006280692263080593