In [None]:
#Connect to Weights & Biases (wandb)
#TODO: work out how to save models on the Weights & Biases platform
import wandb

wandb.login()
wandb.init(
    project="svm",
    entity="benchmark-nlp",
    name='topic datasets svm',  #CHANGE
)

In [2]:
#Move the working directory three levels up so that the relative paths used further down
#("config/svm_sweep.yaml", 'fasttext/cc.en.300.bin', 'output/emissions_hyperopt.csv') resolve.
#NOTE(review): presumably this lands on the repository root - confirm.
#Not idempotent: re-running this cell moves up another three levels.
import os 
os.chdir('../../..')

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
#Silence all warnings for the rest of the session (no category or module filter is given)
warnings.filterwarnings("ignore")

In [4]:
#Set constant values
#SEED is passed as `seed` to data_splitter when the train/validation splits are created
SEED=42 
#OPT_ITER is passed as `count` to every wandb.agent call (number of runs per sweep)
OPT_ITER=10

## Load data

In [5]:
#Load the 'topic' datasets
dl = DataLoader(['topic'])
data = dl.load()

tweet_preprocessor = Preprocessor(is_tweet=True)  #not used by the splits below
preprocessor = Preprocessor()


def _train_val_split(dataset_key):
    '''
    Split data[dataset_key] with the default preprocessor and return (train, validation).
    The third element returned by data_splitter (the test set) is dropped:
    we are not interested in the test sets for hyperparameter optimization.
    '''
    train_part, val_part, _test_part = data_splitter(data[dataset_key],
                                                     preprocessor,
                                                     create_val_set=True,   #No validation set is provided
                                                     seed=SEED)
    return train_part, val_part


train_twentynews, val_twentynews = _train_val_split('twentynews')
train_agnews, val_agnews = _train_val_split('agnews')
train_yahoo, val_yahoo = _train_val_split('yahoo')

1400000 rows preprocessed in 1108.4388880729675 seconds
60000 rows preprocessed in 37.829808473587036 seconds


In [None]:
%%time
#Create the fastText embedding helper and load the model file.
#The `fasttext` object is used by the following cells to generate sentence embeddings.
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

In [8]:
#twentynews: embed each split, then attach its labels to the embedded frame.
#Labels are attached as a plain list, i.e. by position rather than by index.
embedded_train_twentynews = fasttext.generate_sentence_embeddings(train_twentynews['text'])
embedded_train_twentynews['label'] = train_twentynews['label'].to_list()

embedded_val_twentynews = fasttext.generate_sentence_embeddings(val_twentynews['text'])
embedded_val_twentynews['label'] = val_twentynews['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 9051/9051 [01:25<00:00, 105.50it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 2263/2263 [00:20<00:00, 111.76it/s]


In [9]:
#agnews: embed each split, then attach its labels to the embedded frame.
#Labels are attached as a plain list, i.e. by position rather than by index.
embedded_train_agnews = fasttext.generate_sentence_embeddings(train_agnews['text'])
embedded_train_agnews['label'] = train_agnews['label'].to_list()

embedded_val_agnews = fasttext.generate_sentence_embeddings(val_agnews['text'])
embedded_val_agnews['label'] = val_agnews['label'].to_list()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 96000/96000 [05:22<00:00, 297.56it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 24000/24000 [01:19<00:00, 302.99it/s]


In [None]:
#yahoo: embed each split, then attach its labels to the embedded frame.
#Labels are attached as a plain list, i.e. by position rather than by index.
embedded_train_yahoo = fasttext.generate_sentence_embeddings(train_yahoo['text'])
embedded_train_yahoo['label'] = train_yahoo['label'].to_list()

embedded_val_yahoo = fasttext.generate_sentence_embeddings(val_yahoo['text'])
embedded_val_yahoo['label'] = val_yahoo['label'].to_list()

## Hyperopt

In [6]:
import yaml

#Load the template sweep configuration for the SVM from its YAML file.
#To change the value range of a hyperparameter, edit the .yaml file rather than this notebook.
with open("config/svm_sweep.yaml") as config_file:
    sweep_config = yaml.safe_load(config_file)

In [7]:
#Display the loaded sweep configuration.
#NOTE(review): the metric shown below is 'loss' (goal: minimize), but the train functions in this
#notebook log "accuracy", "f1 macro", "AUC-PC" and "AUC" - confirm the metric name is intended.
#NOTE(review): the range shown for C starts at 0 - confirm that is acceptable for the SVM estimators.
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'C': {'min': 0, 'max': 10, 'distribution': 'uniform'},
  'kernel': {'values': ['linear', 'rbf']},
  'probability': {'value': True},
  'random_state': {'value': 42}}}

#### twentynews: TF-IDF and fastText

In [13]:
#Don't forget to name the sweep instance  
#`name` is stored in the sweep config, read at call time by train_tfidf (wandb run group)
#and used as the EmissionsTracker project name further down in this cell
name = 'svm_tfidf_twentynews' #change here
sweep_config['name'] =  name

#Generate a sweep_id
#The sweep is created with project="svm"; the loaded config's own 'project' entry is 'hyperopt'
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_twentynews, #Change here
          val=val_twentynews): #change here
    '''
    Run one sweep trial: TF-IDF features + SVC, scored on the validation split.

    Meant to be handed to wandb.agent, which calls it once per trial.

    config     : forwarded to wandb.init; the hyperparameters actually used
                 (C, kernel, random_state) are read back from wandb.config
    train, val : objects indexable by 'text' and 'label'
                 (the defaults are bound once, when this def is executed)

    Logs "accuracy", "f1 macro", "AUC-PC" and "AUC" to the run; the two AUC
    entries are always the placeholder '-' because no probabilities are computed
    (the sampled `probability` setting is not passed to the classifier).
    '''
    # The run group is the module-level `name`, looked up when the trial runs
    with wandb.init(config=config,group=name):
        # When called by wandb.agent, the Sweep Controller fills wandb.config
        hyperparams = wandb.config

        model = Pipeline(steps=[
            ('vectorizer', TfidfVectorizer()),
            ('clf', SVC(C=hyperparams.C,
                        kernel=hyperparams.kernel,
                        random_state=hyperparams.random_state)),
        ])
        model.fit(train['text'],train['label'])

        # Score the fitted pipeline on the validation split
        val_predictions = model.predict(val['text'])
        metrics = {
            "accuracy": accuracy_score(val['label'],val_predictions),
            "f1 macro": f1_score(val['label'],val_predictions,average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        }
        # Log metrics on WandB
        wandb.log(metrics)

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep raises or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER) #Count : number of iterations
finally:
    emissions = tracker.stop()
#Keep the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: 11n0o4a9
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/11n0o4a9


[34m[1mwandb[0m: Agent Starting Run: vnu3eed7 with config:
[34m[1mwandb[0m: 	C: 9.055592611126652
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.71763
f1 macro,0.7277


[34m[1mwandb[0m: Agent Starting Run: qn05gumg with config:
[34m[1mwandb[0m: 	C: 9.283720563215184
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.71807
f1 macro,0.72834


[34m[1mwandb[0m: Agent Starting Run: k93patdh with config:
[34m[1mwandb[0m: 	C: 6.245126141564396
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.75166
f1 macro,0.74472


[34m[1mwandb[0m: Agent Starting Run: o3b2g2wy with config:
[34m[1mwandb[0m: 	C: 6.199005127560023
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.72912
f1 macro,0.73421


[34m[1mwandb[0m: Agent Starting Run: 1ugquz5z with config:
[34m[1mwandb[0m: 	C: 0.27265680828791106
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.68979
f1 macro,0.68151


[34m[1mwandb[0m: Agent Starting Run: n8kdegnw with config:
[34m[1mwandb[0m: 	C: 4.532952749403343
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.73221
f1 macro,0.73324


[34m[1mwandb[0m: Agent Starting Run: 6ftvw801 with config:
[34m[1mwandb[0m: 	C: 0.7010018797898376
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.70791
f1 macro,0.69511


[34m[1mwandb[0m: Agent Starting Run: bo7vypc3 with config:
[34m[1mwandb[0m: 	C: 0.7121880537137959
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.74282
f1 macro,0.73604


[34m[1mwandb[0m: Agent Starting Run: lkkvohg8 with config:
[34m[1mwandb[0m: 	C: 7.589747097587118
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.72249
f1 macro,0.73061


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ahtbycf6 with config:
[34m[1mwandb[0m: 	C: 0.6755032005930905
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.74459
f1 macro,0.73787


0.0024518219856565584

In [14]:
#Don't forget to name the sweep instance   
#`name` is stored in the sweep config, read at call time by train_fasttext (wandb run group)
#and used as the EmissionsTracker project name further down in this cell
name = 'svm_ft_twentynews' #change here
sweep_config['name'] = name
#Generate a sweep_id
#The sweep is created with project="svm"; the loaded config's own 'project' entry is 'hyperopt'
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_twentynews, #Change here
          val=embedded_val_twentynews): #change here
    '''
    Run one sweep trial: SVC on precomputed sentence embeddings, scored on the validation split.

    Meant to be handed to wandb.agent, which calls it once per trial.

    config     : forwarded to wandb.init; the hyperparameters actually used
                 (C, kernel, random_state) are read back from wandb.config
    train, val : frames with a 'label' column; every other column is used as a
                 feature after missing values are replaced by 0
                 (the defaults are bound once, when this def is executed)

    Logs "accuracy", "f1 macro", "AUC-PC" and "AUC" to the run. The AUC metrics
    are only computed when the training labels have at most two distinct values;
    otherwise the placeholder '-' is logged.
    '''
    # The run group is the module-level `name`, looked up when the trial runs
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        clf = SVC(C=config.C, 
                  kernel=config.kernel,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        # Build each feature matrix once and reuse it
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])
        
        #Make predictions
        pred_val = pipe.predict(features_val)
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            # The classifier is fitted without probability estimates, so the ranking
            # metrics use the decision-function scores instead. The previous code read
            # an undefined `pred_prob_val` here and raised NameError.
            # NOTE(review): average_precision_score treats label 1 as the positive
            # class by default - confirm the binary labels are encoded as 0/1.
            score_val = pipe.decision_function(features_val)
            aucpc =  average_precision_score(val['label'],score_val)
            auc = roc_auc_score(val['label'],score_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep raises or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Keep the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: in0jrs99
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/in0jrs99


[34m[1mwandb[0m: Agent Starting Run: nnvk2rt7 with config:
[34m[1mwandb[0m: 	C: 7.536436517030978
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.66019
f1 macro,0.64632


[34m[1mwandb[0m: Agent Starting Run: e99d3hjf with config:
[34m[1mwandb[0m: 	C: 1.179566674641617
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.65621
f1 macro,0.64151


[34m[1mwandb[0m: Agent Starting Run: m2r6sktt with config:
[34m[1mwandb[0m: 	C: 9.707407661193816
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.66195
f1 macro,0.64981


[34m[1mwandb[0m: Agent Starting Run: y6hrlwir with config:
[34m[1mwandb[0m: 	C: 8.899383569308794
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.66019
f1 macro,0.64769


[34m[1mwandb[0m: Agent Starting Run: job4ji67 with config:
[34m[1mwandb[0m: 	C: 8.008876053805544
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.65709
f1 macro,0.64395


[34m[1mwandb[0m: Agent Starting Run: evqvxquv with config:
[34m[1mwandb[0m: 	C: 8.969942901690697
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.66107
f1 macro,0.64856


[34m[1mwandb[0m: Agent Starting Run: jdx7ernj with config:
[34m[1mwandb[0m: 	C: 8.158565935008804
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.67167
f1 macro,0.66221


[34m[1mwandb[0m: Agent Starting Run: bp3r2ve3 with config:
[34m[1mwandb[0m: 	C: 8.919947879388024
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.67167
f1 macro,0.66388


[34m[1mwandb[0m: Agent Starting Run: 9xz86b69 with config:
[34m[1mwandb[0m: 	C: 4.681814701268769
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.67565
f1 macro,0.66563


[34m[1mwandb[0m: Agent Starting Run: y0pgnaoy with config:
[34m[1mwandb[0m: 	C: 7.125864823569228
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.65886
f1 macro,0.64377


0.001348667556281457

#### agnews: TF-IDF and fastText

In [8]:
#Don't forget to name the sweep instance  
#`name` is stored in the sweep config, read at call time by train_tfidf (wandb run group)
#and used as the EmissionsTracker project name further down in this cell
name = 'linearsvc_tfidf_agnews' #change here
sweep_config['name'] =  name

#Generate a sweep_id
#The sweep is created with project="svm"; the loaded config's own 'project' entry is 'hyperopt'
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_agnews, #Change here
          val=val_agnews): #change here
    '''
    Run one sweep trial: TF-IDF features + LinearSVC (hinge loss), scored on the validation split.

    Meant to be handed to wandb.agent, which calls it once per trial.
    This definition replaces any earlier train_tfidf defined in the notebook.

    config     : forwarded to wandb.init; only C and random_state are read back
                 from wandb.config (the sampled `kernel` and `probability`
                 values are not passed to the classifier)
    train, val : objects indexable by 'text' and 'label'
                 (the defaults are bound once, when this def is executed)

    Logs "accuracy", "f1 macro", "AUC-PC" and "AUC" to the run; the two AUC
    entries are always the placeholder '-'.
    '''
    # The run group is the module-level `name`, looked up when the trial runs
    with wandb.init(config=config,group=name):
        # When called by wandb.agent, the Sweep Controller fills wandb.config
        hyperparams = wandb.config

        model = Pipeline(steps=[
            ('vectorizer', TfidfVectorizer()),
            ('clf', LinearSVC(C=hyperparams.C,
                              loss='hinge',
                              random_state=hyperparams.random_state)),
        ])
        model.fit(train['text'],train['label'])

        # Score the fitted pipeline on the validation split
        val_predictions = model.predict(val['text'])
        metrics = {
            "accuracy": accuracy_score(val['label'],val_predictions),
            "f1 macro": f1_score(val['label'],val_predictions,average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        }
        # Log metrics on WandB
        wandb.log(metrics)

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep raises or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Keep the value returned by tracker.stop() as the cell output
emissions

Create sweep with ID: muh2dn51
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/muh2dn51


[34m[1mwandb[0m: Agent Starting Run: 8c16uixc with config:
[34m[1mwandb[0m: 	C: 9.174144641053768
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.907
f1 macro,0.90668


[34m[1mwandb[0m: Agent Starting Run: wsxukona with config:
[34m[1mwandb[0m: 	C: 1.8434737660354104
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92087
f1 macro,0.92051


[34m[1mwandb[0m: Agent Starting Run: 2vkbjazq with config:
[34m[1mwandb[0m: 	C: 9.531015735711618
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.90646
f1 macro,0.90614


[34m[1mwandb[0m: Agent Starting Run: nr5ga3fe with config:
[34m[1mwandb[0m: 	C: 3.5227411772036685
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.91821
f1 macro,0.91784


[34m[1mwandb[0m: Agent Starting Run: dfheibdp with config:
[34m[1mwandb[0m: 	C: 2.033006678025834
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92054
f1 macro,0.92017


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ap1oiy5m with config:
[34m[1mwandb[0m: 	C: 4.117894541439087
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.91675
f1 macro,0.9164


[34m[1mwandb[0m: Agent Starting Run: tm2z6w4n with config:
[34m[1mwandb[0m: 	C: 2.3101459829156457
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92012
f1 macro,0.91976


[34m[1mwandb[0m: Agent Starting Run: jjtvi2p0 with config:
[34m[1mwandb[0m: 	C: 7.462628103621725
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.91012
f1 macro,0.90979


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6dsndypq with config:
[34m[1mwandb[0m: 	C: 2.182682146328209
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.92046
f1 macro,0.92009


[34m[1mwandb[0m: Agent Starting Run: o0fjj14c with config:
[34m[1mwandb[0m: 	C: 3.273306999215996
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.91904
f1 macro,0.91867


0.0015805391262487434

In [10]:
#Don't forget to name the sweep instance   
#`name` is stored in the sweep config, read at call time by train_fasttext (wandb run group)
#and used as the EmissionsTracker project name further down in this cell
name = 'svm_ft_agnews' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
#The sweep is created with project="svm"; the loaded config's own 'project' entry is 'hyperopt'
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_agnews, #CHANGE HERE
          val=embedded_val_agnews): #CHANGE HERE
    '''
    Run one sweep trial: LinearSVC (hinge loss) on precomputed sentence embeddings,
    scored on the validation split.

    Meant to be handed to wandb.agent, which calls it once per trial.
    This definition replaces any earlier train_fasttext defined in the notebook.

    config     : forwarded to wandb.init; only C and random_state are read back
                 from wandb.config (the sampled `kernel` and `probability`
                 values are not passed to the classifier)
    train, val : frames with a 'label' column; every other column is used as a
                 feature after missing values are replaced by 0
                 (the defaults are bound once, when this def is executed)

    Logs "accuracy", "f1 macro", "AUC-PC" and "AUC" to the run; the two AUC
    entries are always the placeholder '-'.
    '''
    # The run group is the module-level `name`, looked up when the trial runs
    with wandb.init(config=config, group=name):
        hyperparams = wandb.config

        model = Pipeline(steps=[
            ('clf', LinearSVC(C=hyperparams.C,
                              loss='hinge',
                              random_state=hyperparams.random_state)),
        ])
        train_features = train.fillna(0).drop(['label'],axis=1)
        model.fit(train_features,train['label'])

        # Score the fitted pipeline on the validation split
        val_features = val.fillna(0).drop(['label'],axis=1)
        val_predictions = model.predict(val_features)
        metrics = {
            "accuracy": accuracy_score(val['label'],val_predictions),
            "f1 macro": f1_score(val['label'],val_predictions,average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        }
        # Log metrics on WandB
        wandb.log(metrics)


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep raises or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Keep the value returned by tracker.stop() as the cell output
emissions

Create sweep with ID: k3rhn1td
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/k3rhn1td


[34m[1mwandb[0m: Agent Starting Run: 1gyeqld5 with config:
[34m[1mwandb[0m: 	C: 0.06512241722822276
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.87858
f1 macro,0.87776


[34m[1mwandb[0m: Agent Starting Run: 3ef3ofcn with config:
[34m[1mwandb[0m: 	C: 3.726405068697055
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88879
f1 macro,0.88827


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 176aawd9 with config:
[34m[1mwandb[0m: 	C: 4.09229939573056
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88875
f1 macro,0.88823


[34m[1mwandb[0m: Agent Starting Run: yq0ix0t3 with config:
[34m[1mwandb[0m: 	C: 7.890501729074013
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88925
f1 macro,0.88874


[34m[1mwandb[0m: Agent Starting Run: dvioq8g6 with config:
[34m[1mwandb[0m: 	C: 8.412341572683104
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88925
f1 macro,0.88874


[34m[1mwandb[0m: Agent Starting Run: d294dzth with config:
[34m[1mwandb[0m: 	C: 6.513358601847983
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88896
f1 macro,0.88844


[34m[1mwandb[0m: Agent Starting Run: cbt0n8j5 with config:
[34m[1mwandb[0m: 	C: 5.617290499122973
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88883
f1 macro,0.88832


[34m[1mwandb[0m: Agent Starting Run: ba2twbvp with config:
[34m[1mwandb[0m: 	C: 7.0133204748826135
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88904
f1 macro,0.88854


[34m[1mwandb[0m: Agent Starting Run: 3907pm8d with config:
[34m[1mwandb[0m: 	C: 1.1547981661943352
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88796
f1 macro,0.8874


[34m[1mwandb[0m: Agent Starting Run: cgdvppv3 with config:
[34m[1mwandb[0m: 	C: 9.337129148384438
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.88938
f1 macro,0.88887


0.0013698724748339576

In [12]:
#Remove the agnews embedding frames from the notebook namespace.
#NOTE: the agnews train_fasttext definition above still holds both frames as default
#arguments, so they stay in memory until that function is redefined.
del embedded_train_agnews
del embedded_val_agnews

#### yahoo

In [8]:
#Don't forget to name the sweep instance  
#`name` is stored in the sweep config, read at call time by train_tfidf (wandb run group)
#and used as the EmissionsTracker project name further down in this cell
name = 'svm_tfidf_yahoo' #change here
sweep_config['name'] =  name

#Generate a sweep_id
#The sweep is created with project="svm"; the loaded config's own 'project' entry is 'hyperopt'
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_yahoo, #Change here
          val=val_yahoo): #change here
    '''
    Run one sweep trial: TF-IDF features + LinearSVC (hinge loss), scored on the validation split.

    Meant to be handed to wandb.agent, which calls it once per trial.
    This definition replaces any earlier train_tfidf defined in the notebook.

    config     : forwarded to wandb.init; only C and random_state are read back
                 from wandb.config (the sampled `kernel` and `probability`
                 values are not passed to the classifier)
    train, val : objects indexable by 'text' and 'label'
                 (the defaults are bound once, when this def is executed)

    Logs "accuracy", "f1 macro", "AUC-PC" and "AUC" to the run; the two AUC
    entries are always the placeholder '-'.
    '''
    # The run group is the module-level `name`, looked up when the trial runs
    with wandb.init(config=config,group=name):
        # When called by wandb.agent, the Sweep Controller fills wandb.config
        hyperparams = wandb.config

        model = Pipeline(steps=[
            ('vectorizer', TfidfVectorizer()),
            ('clf', LinearSVC(C=hyperparams.C,
                              loss='hinge',
                              random_state=hyperparams.random_state)),
        ])
        model.fit(train['text'],train['label'])

        # Score the fitted pipeline on the validation split
        val_predictions = model.predict(val['text'])
        metrics = {
            "accuracy": accuracy_score(val['label'],val_predictions),
            "f1 macro": f1_score(val['label'],val_predictions,average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        }
        # Log metrics on WandB
        wandb.log(metrics)

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep raises or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Keep the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: ti974o5i
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/ti974o5i


[34m[1mwandb[0m: Agent Starting Run: w93y1ag1 with config:
[34m[1mwandb[0m: 	C: 4.359134783303472
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.4361
f1 macro,0.48845


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: jdih6vw2 with config:
[34m[1mwandb[0m: 	C: 0.13586381752014876
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43867
f1 macro,0.49028


[34m[1mwandb[0m: Agent Starting Run: ewhudecm with config:
[34m[1mwandb[0m: 	C: 5.297760643641726
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43698
f1 macro,0.47969


[34m[1mwandb[0m: Agent Starting Run: viu75oyg with config:
[34m[1mwandb[0m: 	C: 6.41372472617941
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43283
f1 macro,0.48478


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: bbcck4im with config:
[34m[1mwandb[0m: 	C: 2.5489672014568834
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43956
f1 macro,0.4923


[34m[1mwandb[0m: Agent Starting Run: 6bmmxkqs with config:
[34m[1mwandb[0m: 	C: 5.147242516431976
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.45005
f1 macro,0.50154


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: bu4vito2 with config:
[34m[1mwandb[0m: 	C: 5.002218631193828
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44699
f1 macro,0.48993


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: k4he04d6 with config:
[34m[1mwandb[0m: 	C: 1.3352250653171371
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.44202
f1 macro,0.49494


[34m[1mwandb[0m: Agent Starting Run: p9yuggkj with config:
[34m[1mwandb[0m: 	C: 6.661346461500591
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43252
f1 macro,0.4844


[34m[1mwandb[0m: Agent Starting Run: 0ublolaq with config:
[34m[1mwandb[0m: 	C: 6.420682518349638
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43284
f1 macro,0.48479


0.023820712064613857

In [16]:
#Don't forget to name the sweep instance   
#`name` is stored in the sweep config, read at call time by train_fasttext (wandb run group)
#and used as the EmissionsTracker project name further down in this cell
#NOTE(review): the output saved under this cell lists `penalty` / `solver` parameters and a
#sweep URL under benchmark-nlp/hyperopt, which does not match this code (project="svm",
#LinearSVC) - the output looks stale; re-run the cell to refresh it.
name = 'svm_ft_yahoo' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_yahoo, #CHANGE HERE
          val=embedded_val_yahoo): #CHANGE HERE
    '''
    Run one sweep trial: LinearSVC (hinge loss) on precomputed sentence embeddings,
    scored on the validation split.

    Meant to be handed to wandb.agent, which calls it once per trial.
    This definition replaces any earlier train_fasttext defined in the notebook.

    config     : forwarded to wandb.init; only C and random_state are read back
                 from wandb.config (the sampled `kernel` and `probability`
                 values are not passed to the classifier)
    train, val : frames with a 'label' column; every other column is used as a
                 feature after missing values are replaced by 0
                 (the defaults are bound once, when this def is executed)

    Logs "accuracy", "f1 macro", "AUC-PC" and "AUC" to the run; the two AUC
    entries are always the placeholder '-'.
    '''
    # The run group is the module-level `name`, looked up when the trial runs
    with wandb.init(config=config, group=name):
        hyperparams = wandb.config

        model = Pipeline(steps=[
            ('clf', LinearSVC(C=hyperparams.C,
                              loss='hinge',
                              random_state=hyperparams.random_state)),
        ])
        train_features = train.fillna(0).drop(['label'],axis=1)
        model.fit(train_features,train['label'])

        # Score the fitted pipeline on the validation split
        val_features = val.fillna(0).drop(['label'],axis=1)
        val_predictions = model.predict(val_features)
        metrics = {
            "accuracy": accuracy_score(val['label'],val_predictions),
            "f1 macro": f1_score(val['label'],val_predictions,average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        }
        # Log metrics on WandB
        wandb.log(metrics)


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep raises or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Keep the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: 8q83yjoe
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/8q83yjoe


[34m[1mwandb[0m: Agent Starting Run: lzriiexe with config:
[34m[1mwandb[0m: 	C: 3.426147217376755
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42979
f1 macro,0.47649


[34m[1mwandb[0m: Agent Starting Run: r2tymvzh with config:
[34m[1mwandb[0m: 	C: 5.357984774600081
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42971
f1 macro,0.47631


[34m[1mwandb[0m: Agent Starting Run: e5yawj06 with config:
[34m[1mwandb[0m: 	C: 0.8858751580342827
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43111
f1 macro,0.47768


[34m[1mwandb[0m: Agent Starting Run: ouaz4xtr with config:
[34m[1mwandb[0m: 	C: 9.689045815348598
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42945
f1 macro,0.47553


[34m[1mwandb[0m: Agent Starting Run: eu6pbdck with config:
[34m[1mwandb[0m: 	C: 1.857309303019292
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43075
f1 macro,0.47707


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 2ed585x7 with config:
[34m[1mwandb[0m: 	C: 1.6616166050862513
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42847
f1 macro,0.46134


[34m[1mwandb[0m: Agent Starting Run: 9pp58a7v with config:
[34m[1mwandb[0m: 	C: 4.295195507836894
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.42986
f1 macro,0.47629


[34m[1mwandb[0m: Agent Starting Run: stnw89x5 with config:
[34m[1mwandb[0m: 	C: 7.83531786647795
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.4298
f1 macro,0.47585


[34m[1mwandb[0m: Agent Starting Run: pjlw5f0k with config:
[34m[1mwandb[0m: 	C: 2.6710627849389743
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43143
f1 macro,0.47794


[34m[1mwandb[0m: Agent Starting Run: 7lwpcbgk with config:
[34m[1mwandb[0m: 	C: 8.537706451301336
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.43017
f1 macro,0.47666


0.006280692263080593