In [None]:
#Connect to wandb
#TO DO : how to save models on the weight and bias platform
import wandb
wandb.login()
wandb.init(project="svm", 
           entity="benchmark-nlp",
           name='polarity datasets svm') #CHANGE

In [2]:
import os 
os.chdir('../../..')

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [4]:
#Set constant values
SEED=42 
OPT_ITER=10

## Load data

In [5]:
dl = DataLoader(['polarity'])
data = dl.load()


tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

#We are not interested in the test sets for hyperparameter optimization
train_imdb, val_imdb, _ = data_splitter(data['imdb'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_yelp, val_yelp, _ = data_splitter(data['yelp'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)
train_sst2, val_sst2, _ = data_splitter(data['sst2'],
                                 preprocessor, 
                                 create_val_set=True,   #No validation set is provided
                                 seed=SEED)

560000 rows preprocessed in 697.8831522464752 seconds
38000 rows preprocessed in 52.93141007423401 seconds


In [6]:
%%time
#fasttext 
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

CPU times: total: 23.4 s




Wall time: 48 s


In [8]:
embedded_train_imdb = fasttext.generate_sentence_embeddings(train_imdb['text'])
embedded_val_imdb = fasttext.generate_sentence_embeddings(val_imdb['text'])
embedded_train_imdb['label'] = train_imdb['label'].to_list()
embedded_val_imdb['label'] = val_imdb['label'].to_list()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 20000/20000 [03:19<00:00, 100.34it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:45<00:00, 110.96it/s]


In [11]:
embedded_train_yelp = fasttext.generate_sentence_embeddings(train_yelp['text'])
embedded_val_yelp = fasttext.generate_sentence_embeddings(val_yelp['text'])
embedded_train_yelp['label'] = train_yelp['label'].to_list()
embedded_val_yelp['label'] = val_yelp['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████| 448000/448000 [39:12<00:00, 190.42it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████| 112000/112000 [10:00<00:00, 186.62it/s]


In [15]:
embedded_train_sst2 = fasttext.generate_sentence_embeddings(train_sst2['text'])
embedded_val_sst2= fasttext.generate_sentence_embeddings(val_sst2['text'])
embedded_train_sst2['label'] = train_sst2['label'].to_list()
embedded_val_sst2['label'] = val_sst2['label'].to_list()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 67349/67349 [01:26<00:00, 781.13it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 872/872 [00:01<00:00, 564.46it/s]


## Hyperopt

In [7]:
import yaml
#Load the template yaml sweep config file for logistic regression
#If the value range for an hyperparameter needs to be changed, better to do it in the .yaml file than in a notebook
with open("config/svm_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [8]:
#The config is displayed as a nested dictionary
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'C': {'min': 0, 'max': 10, 'distribution': 'uniform'},
  'kernel': {'values': ['linear', 'rbf']},
  'probability': {'value': True},
  'random_state': {'value': 42}}}

#### imdb

In [None]:
#Don't forget to name the sweep instance  
name = 'svm_tfidf_imdb' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_imdb, #Change here
          val=val_imdb): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = LinearSVC(C=config.C, 
#                   kernel=config.kernel,
#                   probability=config.probability,
                        loss='hinge',
                  random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
#         pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#         else:
        aucpc = '-'
        auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER) #Count : number of iterations
tracker.stop()

In [13]:
#Don't forget to name the sweep instance   
name = 'svm_ft_imdb' #change here
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_imdb, #Change here
          val=embedded_val_imdb): #change here
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        clf = LinearSVC(C=config.C, 
#                   kernel=config.kernel,
#                   probability=config.probability,
                  loss='hinge',
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
#         pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#             #Log predictions on WandB
#         else:
        aucpc = '-'
        auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()

Create sweep with ID: ih3ctye8
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/ih3ctye8


[34m[1mwandb[0m: Agent Starting Run: iicsoxsf with config:
[34m[1mwandb[0m: 	C: 0.06805647002905202
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.7546
f1 macro,0.75431


[34m[1mwandb[0m: Agent Starting Run: rhkdv2ib with config:
[34m[1mwandb[0m: 	C: 4.766800355622763
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8354
f1 macro,0.83539


[34m[1mwandb[0m: Agent Starting Run: 6c88y8yk with config:
[34m[1mwandb[0m: 	C: 9.327928819289154
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.839
f1 macro,0.839


[34m[1mwandb[0m: Agent Starting Run: s9zpee1r with config:
[34m[1mwandb[0m: 	C: 1.7826875563704103
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8278
f1 macro,0.82779


[34m[1mwandb[0m: Agent Starting Run: qj9r4bz4 with config:
[34m[1mwandb[0m: 	C: 2.512330150786343
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.83
f1 macro,0.82999


[34m[1mwandb[0m: Agent Starting Run: zltgipmy with config:
[34m[1mwandb[0m: 	C: 0.21376078643191576
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.7946
f1 macro,0.79451


[34m[1mwandb[0m: Agent Starting Run: tbt6r0na with config:
[34m[1mwandb[0m: 	C: 5.363659978640019
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.836
f1 macro,0.836


[34m[1mwandb[0m: Agent Starting Run: rlcbs96i with config:
[34m[1mwandb[0m: 	C: 6.035134317795739
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8372
f1 macro,0.8372


[34m[1mwandb[0m: Agent Starting Run: l0yanfgd with config:
[34m[1mwandb[0m: 	C: 0.3877561965117293
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.8088
f1 macro,0.80872


[34m[1mwandb[0m: Agent Starting Run: 7el3f5rf with config:
[34m[1mwandb[0m: 	C: 6.791603728023141
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.838
f1 macro,0.83799


0.0011444768805907285

In [14]:
del embedded_train_imdb
del embedded_val_imdb
del train_imdb
del val_imdb

#### YELP

In [10]:
#Don't forget to name the sweep instance  
name = 'svm_tfidf_yelp' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_yelp, #Change here
          val=val_yelp): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = LinearSVC(C=config.C, 
                        loss='hinge',
#                   probability=config.probability,
#                   kernel=config.kernel,
                  random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
#         pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#         else:
        aucpc = '-'
        auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
tracker.stop()

Create sweep with ID: 3wijlw0j
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/3wijlw0j


[34m[1mwandb[0m: Agent Starting Run: ybyhhxpc with config:
[34m[1mwandb[0m: 	C: 8.214923016494106
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92116
f1 macro,0.92116


[34m[1mwandb[0m: Agent Starting Run: c42jf1c8 with config:
[34m[1mwandb[0m: 	C: 5.2409333542447225
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92347
f1 macro,0.92347


[34m[1mwandb[0m: Agent Starting Run: ajgcl569 with config:
[34m[1mwandb[0m: 	C: 4.27646674585453
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92429
f1 macro,0.92429


[34m[1mwandb[0m: Agent Starting Run: 7b6z1e30 with config:
[34m[1mwandb[0m: 	C: 7.685534629564791
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92156
f1 macro,0.92156


[34m[1mwandb[0m: Agent Starting Run: e1w8ntpi with config:
[34m[1mwandb[0m: 	C: 7.059580221824521
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92204
f1 macro,0.92204


[34m[1mwandb[0m: Agent Starting Run: mr9e46lz with config:
[34m[1mwandb[0m: 	C: 7.912992585557484
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92142
f1 macro,0.92142


[34m[1mwandb[0m: Agent Starting Run: qi7othm5 with config:
[34m[1mwandb[0m: 	C: 7.41724271129764
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92175
f1 macro,0.92175


[34m[1mwandb[0m: Agent Starting Run: d1ei6lh6 with config:
[34m[1mwandb[0m: 	C: 4.739147912776934
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92395
f1 macro,0.92395


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: joh0aw55 with config:
[34m[1mwandb[0m: 	C: 9.800031259713627
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.91992
f1 macro,0.91992


[34m[1mwandb[0m: Agent Starting Run: 7ewxg6mm with config:
[34m[1mwandb[0m: 	C: 7.056958741611182
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92204
f1 macro,0.92204


0.0027452851972863835

In [12]:
#Don't forget to name the sweep instance   
name = 'svm_ft_yelp' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_yelp, #CHANGE HERE
          val=embedded_val_yelp): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = LinearSVC(C=config.C, 
                        loss='hinge',
#                   probability=config.probability,
#                   kernel=config.kernel,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
#         pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#             #Log predictions on WandB
#         else:
        aucpc = '-'
        auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
tracker.stop()

Create sweep with ID: rpxwhh4b
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/rpxwhh4b


[34m[1mwandb[0m: Agent Starting Run: 4p1861bg with config:
[34m[1mwandb[0m: 	C: 3.8226611915041673
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87684
f1 macro,0.87684


[34m[1mwandb[0m: Agent Starting Run: qyphc65q with config:
[34m[1mwandb[0m: 	C: 4.669401765486275
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87692
f1 macro,0.87692


[34m[1mwandb[0m: Agent Starting Run: 0183p2bg with config:
[34m[1mwandb[0m: 	C: 9.00014949014859
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87705
f1 macro,0.87705


[34m[1mwandb[0m: Agent Starting Run: adm2u6ox with config:
[34m[1mwandb[0m: 	C: 7.294702767298227
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87694
f1 macro,0.87693


[34m[1mwandb[0m: Agent Starting Run: f3y30hbz with config:
[34m[1mwandb[0m: 	C: 2.6896224983569006
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87671
f1 macro,0.87671


[34m[1mwandb[0m: Agent Starting Run: mxkimlk9 with config:
[34m[1mwandb[0m: 	C: 1.9221465312579789
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87669
f1 macro,0.87668


[34m[1mwandb[0m: Agent Starting Run: vigp7m4i with config:
[34m[1mwandb[0m: 	C: 4.558988446810811
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87695
f1 macro,0.87694


[34m[1mwandb[0m: Agent Starting Run: z9kgz0yj with config:
[34m[1mwandb[0m: 	C: 7.4658825770815245
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87686
f1 macro,0.87685


[34m[1mwandb[0m: Agent Starting Run: sarh719w with config:
[34m[1mwandb[0m: 	C: 2.157978013787031
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87673
f1 macro,0.87673


[34m[1mwandb[0m: Agent Starting Run: dw7wityd with config:
[34m[1mwandb[0m: 	C: 0.8238544545134152
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87602
f1 macro,0.87601


0.0014143217028017537

#### sst2

In [18]:
#Don't forget to name the sweep instance  
name = 'svm_tfidf_sst2' #change here
sweep_config['name'] =  name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_sst2, #Change here
          val=val_sst2): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        clf = LinearSVC(C=config.C, 
                  loss='hinge',
#                   probability=config.probability,
#                   kernel=config.kernel,
                  random_state=config.random_state) #set the hyperparams here
        
        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val['text'])
#         pred_prob_val = pipe.predict_proba(val['text'])[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#         else:
        aucpc = '-'
        auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
tracker.stop()

Create sweep with ID: tja4hvtv
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/tja4hvtv


[34m[1mwandb[0m: Agent Starting Run: 3vn5t145 with config:
[34m[1mwandb[0m: 	C: 8.102623470106243
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79358
f1 macro,0.79278


[34m[1mwandb[0m: Agent Starting Run: lcbcicsm with config:
[34m[1mwandb[0m: 	C: 8.422386147734573
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79243
f1 macro,0.7916


[34m[1mwandb[0m: Agent Starting Run: ffi8fms2 with config:
[34m[1mwandb[0m: 	C: 8.062281833296325
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79587
f1 macro,0.79514


[34m[1mwandb[0m: Agent Starting Run: gx1pdgx7 with config:
[34m[1mwandb[0m: 	C: 0.44909603511660734
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.80505
f1 macro,0.8044


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: lt1xxoqi with config:
[34m[1mwandb[0m: 	C: 7.563696744052169
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79358
f1 macro,0.79278


[34m[1mwandb[0m: Agent Starting Run: f5uo2adg with config:
[34m[1mwandb[0m: 	C: 4.95200993122633
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79472
f1 macro,0.7939


[34m[1mwandb[0m: Agent Starting Run: getgh9gg with config:
[34m[1mwandb[0m: 	C: 3.759576975342259
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79817
f1 macro,0.79733


[34m[1mwandb[0m: Agent Starting Run: pt2lpm2s with config:
[34m[1mwandb[0m: 	C: 0.24387447026214204
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.80619
f1 macro,0.80553


[34m[1mwandb[0m: Agent Starting Run: 2l5m6zqq with config:
[34m[1mwandb[0m: 	C: 5.417180794271327
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79358
f1 macro,0.7929


[34m[1mwandb[0m: Agent Starting Run: rdv0n497 with config:
[34m[1mwandb[0m: 	C: 6.270578909694154
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79472
f1 macro,0.79402


0.00140942581682884

In [20]:
#Don't forget to name the sweep instance   
name = 'svm_ft_sst2' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_sst2, #CHANGE HERE
          val=embedded_val_sst2): #CHANGE HERE
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = LinearSVC(C=config.C, 
                        loss='hinge',
#                   probability=config.probability,
#                   kernel=config.kernel,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])
        
        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
#         pred_prob_val = pipe.predict_proba(val.fillna(0).drop(['label'],axis=1))[:,1]
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
#         if train['label'].nunique() <=2:
#             aucpc =  average_precision_score(val['label'],pred_prob_val)
#             auc = roc_auc_score(val['label'],pred_prob_val)
#             #Log predictions on WandB
#         else:
        aucpc = '-'
        auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
tracker.stop()

Create sweep with ID: whxy89nz
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/whxy89nz


[34m[1mwandb[0m: Agent Starting Run: 0gghjo00 with config:
[34m[1mwandb[0m: 	C: 0.8448952034565771
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76147
f1 macro,0.75923


[34m[1mwandb[0m: Agent Starting Run: h5ioh7qf with config:
[34m[1mwandb[0m: 	C: 4.259058833137273
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76032
f1 macro,0.75813


[34m[1mwandb[0m: Agent Starting Run: vtwngjsu with config:
[34m[1mwandb[0m: 	C: 9.909070249374206
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76147
f1 macro,0.75923


[34m[1mwandb[0m: Agent Starting Run: jrg2ckz0 with config:
[34m[1mwandb[0m: 	C: 6.457888825664023
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76032
f1 macro,0.75813


[34m[1mwandb[0m: Agent Starting Run: hf5jb5ox with config:
[34m[1mwandb[0m: 	C: 1.112049090135547
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76032
f1 macro,0.75802


[34m[1mwandb[0m: Agent Starting Run: w64xjczg with config:
[34m[1mwandb[0m: 	C: 6.517452103301635
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76147
f1 macro,0.75923


[34m[1mwandb[0m: Agent Starting Run: ewvg3ckj with config:
[34m[1mwandb[0m: 	C: 2.15248154178116
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.75917
f1 macro,0.75703


[34m[1mwandb[0m: Agent Starting Run: g54s5o8r with config:
[34m[1mwandb[0m: 	C: 6.936178851925251
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76032
f1 macro,0.75813


[34m[1mwandb[0m: Agent Starting Run: e7t8dvhl with config:
[34m[1mwandb[0m: 	C: 2.009117739727352
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.75917
f1 macro,0.75703


[34m[1mwandb[0m: Agent Starting Run: t0rme9ry with config:
[34m[1mwandb[0m: 	C: 1.9976250866800116
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.75917
f1 macro,0.75703


0.0010957358702634108