In [1]:
#Connect to wandb
#TO DO : how to save models on the weight and bias platform
import wandb
wandb.login()
wandb.init(project="hyperopt", 
           entity="benchmark-nlp",
           name='polarity datasets') #CHANGE

[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing import Preprocessor
from util.datasplitter import data_splitter
from fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [3]:
#Set constant values
SEED=42 
OPT_ITER=10

## Load data

In [4]:
dl = DataLoader(['polarity'])
data = dl.load()


tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

#We are not interested in the test sets for hyperparameter optimization
train_imdb, val_imdb, _ = data_splitter(data['imdb'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_yelp, val_yelp, _ = data_splitter(data['yelp'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_sst2, val_sst2, _ = data_splitter(data['sst2'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)

25000 rows preprocessed in 71.85923075675964 seconds
25000 rows preprocessed in 57.39511275291443 seconds
560000 rows preprocessed in 834.1037421226501 seconds
38000 rows preprocessed in 52.17623996734619 seconds
67349 rows preprocessed in 14.392334938049316 seconds
1821 rows preprocessed in 0.47516846656799316 seconds
872 rows preprocessed in 0.22498226165771484 seconds


In [5]:
%%time
#fasttext 
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

Wall time: 1min 13s




In [6]:
embedded_train_imdb = fasttext.generate_sentence_embeddings(train_imdb['text'])
embedded_val_imdb = fasttext.generate_sentence_embeddings(val_imdb['text'])
embedded_train_imdb['label'] = train_imdb['label'].to_list()
embedded_val_imdb['label'] = val_imdb['label'].to_list()

starting to generate sentence embeddings


100%|████████████████████████████████████████████████████████████████████████████| 20000/20000 [04:01<00:00, 82.98it/s]


starting to generate sentence embeddings


100%|██████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:53<00:00, 93.23it/s]


In [7]:
embedded_train_yelp = fasttext.generate_sentence_embeddings(train_yelp['text'])
embedded_val_yelp = fasttext.generate_sentence_embeddings(val_yelp['text'])
embedded_train_yelp['label'] = train_yelp['label'].to_list()
embedded_val_yelp['label'] = val_yelp['label'].to_list()

starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████| 448000/448000 [46:49<00:00, 159.49it/s]


starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████| 112000/112000 [12:55<00:00, 144.47it/s]


In [8]:
embedded_train_sst2 = fasttext.generate_sentence_embeddings(train_sst2['text'])
embedded_val_sst2= fasttext.generate_sentence_embeddings(val_sst2['text'])
embedded_train_sst2['label'] = train_sst2['label'].to_list()
embedded_val_sst2['label'] = val_sst2['label'].to_list()

starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 67349/67349 [01:40<00:00, 668.88it/s]


starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 872/872 [00:01<00:00, 504.63it/s]


## Hyperopt

In [9]:
import yaml
#Load the template yaml sweep config file for logistic regression
#If the value range for an hyperparameter needs to be changed, better to do it in the .yaml file than in a notebook
with open("config/lr_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [10]:
#The config is displayed as a nested dictionary
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'C': {'min': 0, 'max': 10, 'distribution': 'uniform'},
  'penalty': {'value': 'l2'},
  'solver': {'value': 'lbfgs'},
  'random_state': {'value': 42}}}

#### imdb

In [11]:
#Don't forget to name the sweep instance  
name = 'lr_tfidf_imdb' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_imdb, #Change here
          val=val_imdb): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER) #Count : number of iterations
tracker.stop()



Create sweep with ID: twuga8ed
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/twuga8ed


[34m[1mwandb[0m: Agent Starting Run: 99rtcyt6 with config:
[34m[1mwandb[0m: 	C: 3.0563526549926334
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95966
AUC-PC,0.95975
accuracy,0.891
f1 macro,0.89099


[34m[1mwandb[0m: Agent Starting Run: kseq1ask with config:
[34m[1mwandb[0m: 	C: 3.7600026336234498
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95981
AUC-PC,0.95975
accuracy,0.8914
f1 macro,0.89139


[34m[1mwandb[0m: Agent Starting Run: zr27inmf with config:
[34m[1mwandb[0m: 	C: 6.53109462863204
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95974
AUC-PC,0.95926
accuracy,0.8912
f1 macro,0.8912


[34m[1mwandb[0m: Agent Starting Run: fvgxbxit with config:
[34m[1mwandb[0m: 	C: 4.952042584185397
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95988
AUC-PC,0.95961
accuracy,0.8916
f1 macro,0.89159


[34m[1mwandb[0m: Agent Starting Run: m1mtkipm with config:
[34m[1mwandb[0m: 	C: 2.1431786147489817
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95906
AUC-PC,0.9594
accuracy,0.8916
f1 macro,0.89159


[34m[1mwandb[0m: Agent Starting Run: 3xh4qf84 with config:
[34m[1mwandb[0m: 	C: 8.990002936461254
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95945
AUC-PC,0.95872
accuracy,0.8926
f1 macro,0.8926


[34m[1mwandb[0m: Agent Starting Run: p036rdpt with config:
[34m[1mwandb[0m: 	C: 7.548385776306038
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95964
AUC-PC,0.95904
accuracy,0.8908
f1 macro,0.8908


[34m[1mwandb[0m: Agent Starting Run: q9ylit20 with config:
[34m[1mwandb[0m: 	C: 3.460312118191098
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95978
AUC-PC,0.95977
accuracy,0.8908
f1 macro,0.89079


[34m[1mwandb[0m: Agent Starting Run: nku0k9nk with config:
[34m[1mwandb[0m: 	C: 1.0629885419775398
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.9567
AUC-PC,0.95752
accuracy,0.8862
f1 macro,0.88619


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ktw2xyz7 with config:
[34m[1mwandb[0m: 	C: 9.844350119400763
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95931
AUC-PC,0.95851
accuracy,0.8916
f1 macro,0.8916


0.001247983165839763

In [13]:
#Don't forget to name the sweep instance   
name = 'lr_ft_imdb' #change here
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_imdb, #Change here
          val=embedded_val_imdb): #change here
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()



Create sweep with ID: 1twhx5cx
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/1twhx5cx


[34m[1mwandb[0m: Agent Starting Run: r4i5tkoa with config:
[34m[1mwandb[0m: 	C: 6.233617206428239
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.91035
AUC-PC,0.90979
accuracy,0.8326
f1 macro,0.8326


[34m[1mwandb[0m: Agent Starting Run: 30o0m3f8 with config:
[34m[1mwandb[0m: 	C: 8.743592209804959
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.91189
AUC-PC,0.91102
accuracy,0.8358
f1 macro,0.8358


[34m[1mwandb[0m: Agent Starting Run: r8skvt96 with config:
[34m[1mwandb[0m: 	C: 5.820003403154262
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.90998
AUC-PC,0.90951
accuracy,0.832
f1 macro,0.832


[34m[1mwandb[0m: Agent Starting Run: b1ejmuvn with config:
[34m[1mwandb[0m: 	C: 5.416166272137475
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.90954
AUC-PC,0.90915
accuracy,0.8304
f1 macro,0.8304


[34m[1mwandb[0m: Agent Starting Run: 2avs3stx with config:
[34m[1mwandb[0m: 	C: 7.846318892221208
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.91135
AUC-PC,0.91052
accuracy,0.833
f1 macro,0.833


[34m[1mwandb[0m: Agent Starting Run: 2xtbp1z9 with config:
[34m[1mwandb[0m: 	C: 1.3390218000241103
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.89759
AUC-PC,0.89803
accuracy,0.8172
f1 macro,0.81718


[34m[1mwandb[0m: Agent Starting Run: n8zt0647 with config:
[34m[1mwandb[0m: 	C: 2.2782119530527742
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.9031
AUC-PC,0.90336
accuracy,0.825
f1 macro,0.825


[34m[1mwandb[0m: Agent Starting Run: pw07eggl with config:
[34m[1mwandb[0m: 	C: 0.4601572117929309
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.88185
AUC-PC,0.88159
accuracy,0.797
f1 macro,0.79696


[34m[1mwandb[0m: Agent Starting Run: 3k65lktq with config:
[34m[1mwandb[0m: 	C: 9.75186009968818
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.91217
AUC-PC,0.91119
accuracy,0.8358
f1 macro,0.8358


[34m[1mwandb[0m: Agent Starting Run: 32j6mk5i with config:
[34m[1mwandb[0m: 	C: 6.366567402065536
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.91043
AUC-PC,0.9099
accuracy,0.8332
f1 macro,0.8332


0.001255376893697898

#### YELP

In [None]:
#Don't forget to name the sweep instance  
name = 'lr_tfidf_yelp' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_yelp, #Change here
          val=val_yelp): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
tracker.stop()



Create sweep with ID: b8cuwrax
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/b8cuwrax


[34m[1mwandb[0m: Agent Starting Run: s3fk6udg with config:
[34m[1mwandb[0m: 	C: 2.320538563715875
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.97958
AUC-PC,0.9796
accuracy,0.92815
f1 macro,0.92815


[34m[1mwandb[0m: Agent Starting Run: 1i2lntex with config:
[34m[1mwandb[0m: 	C: 0.1069823567321082
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.97342
AUC-PC,0.97342
accuracy,0.91529
f1 macro,0.91529


[34m[1mwandb[0m: Agent Starting Run: kn6ptspa with config:
[34m[1mwandb[0m: 	C: 4.760894361370025
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


In [None]:
#Don't forget to name the sweep instance   
name = 'lr_ft_yelp' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_yelp, #CHANGE HERE
          val=embedded_val_yelp): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()

#### sst2

In [None]:
#Don't forget to name the sweep instance  
name = 'lr_tfidf_sst2' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_sst2, #Change here
          val=val_sst2): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
tracker.stop()

In [None]:
#Don't forget to name the sweep instance   
name = 'lr_ft_sst2' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_sst2, #CHANGE HERE
          val=embedded_val_sst2): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = LogisticRegression(C = config.C,
                                 penalty = config.penalty,
                                 solver = config.solver,
                                 random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
            #Log predictions on WandB
        else:
            aucpc = '-'
            auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
tracker.stop()