In [1]:
#Connect to wandb: authenticate, then open the run that this notebook is attached to
import wandb

wandb.login()
wandb.init(
    project="svm",
    entity="benchmark-nlp",
    name='sarcasm datasets svm',  #CHANGE: name given to this notebook's run
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import os

#Move three directories up so that the relative imports and paths used by the
#following cells (util/, preprocessing/, config/, output/, fasttext/) resolve.
#The target is resolved to an absolute path only once per kernel: re-running
#this cell then returns to the same directory instead of climbing three more levels.
if 'PROJECT_ROOT' not in globals():
    PROJECT_ROOT = os.path.abspath('../../..')
os.chdir(PROJECT_ROOT)

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [4]:
#Notebook-wide constants
SEED = 42        #seed handed to data_splitter
OPT_ITER = 10    #number of runs launched by each sweep agent (its `count` argument)

## Load data

In [5]:
#Load the sarcasm datasets; `data` is indexed by dataset name below
dl = DataLoader(['sarcasm'])
data = dl.load()

#One preprocessor configured with is_tweet=True and one with the default settings
#(the default one is used for the 'sarc' dataset in the next cell)
tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

#Options shared by both splits
split_options = dict(create_val_set=True,   #No validation set is provided
                     seed=SEED)

#We are not interested in the test sets for hyperparameter optimization,
#so the third value returned by data_splitter is discarded
train_semeval_A, val_semeval_A, _ = data_splitter(data['SemEval_A'], tweet_preprocessor, **split_options)
train_isarcasm, val_isarcasm, _ = data_splitter(data['iSarcasm'], tweet_preprocessor, **split_options)

3817 rows preprocessed in 5.285405397415161 seconds
784 rows preprocessed in 0.2520742416381836 seconds
3468 rows preprocessed in 1.4594483375549316 seconds
1400 rows preprocessed in 0.48386168479919434 seconds


In [6]:
#Split the 'sarc' dataset with the default preprocessor; as above, the third
#returned value (the test set) is not needed for hyperparameter optimization
train_sarc, val_sarc, _ = data_splitter(
    data['sarc'],
    preprocessor,
    create_val_set=True,   #No validation set is provided
    seed=SEED,
)

In [7]:
%%time
#fasttext: load the binary model file used below to generate sentence embeddings.
#The %%time cell magic reports how long the load takes.
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

CPU times: total: 25.5 s
Wall time: 50.4 s




In [8]:
#Sentence embeddings for SemEval_A; the labels are attached as a 'label' column,
#passed as a plain list (to_list()) rather than as a Series

#Training split
embedded_train_semeval_A = fasttext.generate_sentence_embeddings(train_semeval_A['text'])
embedded_train_semeval_A['label'] = train_semeval_A['label'].to_list()

#Validation split
embedded_val_semeval_A = fasttext.generate_sentence_embeddings(val_semeval_A['text'])
embedded_val_semeval_A['label'] = val_semeval_A['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3053/3053 [00:13<00:00, 232.26it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 764/764 [00:02<00:00, 305.68it/s]


In [9]:
#Sentence embeddings for iSarcasm; the labels are attached as a 'label' column,
#passed as a plain list (to_list()) rather than as a Series

#Training split
embedded_train_isarcasm = fasttext.generate_sentence_embeddings(train_isarcasm['text'])
embedded_train_isarcasm['label'] = train_isarcasm['label'].to_list()

#Validation split
embedded_val_isarcasm = fasttext.generate_sentence_embeddings(val_isarcasm['text'])
embedded_val_isarcasm['label'] = val_isarcasm['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 2774/2774 [00:10<00:00, 254.74it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 694/694 [00:02<00:00, 278.73it/s]


In [None]:
#Sentence embeddings for sarc; the labels are attached as a 'label' column,
#passed as a plain list (to_list()) rather than as a Series

#Training split
embedded_train_sarc = fasttext.generate_sentence_embeddings(train_sarc['text'])
embedded_train_sarc['label'] = train_sarc['label'].to_list()

#Validation split
embedded_val_sarc = fasttext.generate_sentence_embeddings(val_sarc['text'])
embedded_val_sarc['label'] = val_sarc['label'].to_list()

## Hyperopt

In [10]:
#Load the template yaml sweep config file for the SVM
#If the value range for an hyperparameter needs to be changed, better to do it in the .yaml file than in a notebook
#NOTE(review): the config displayed in the next cell names `loss` (minimize) as the sweep metric,
#but the training functions in this notebook only log accuracy, f1 macro, AUC-PC and AUC - confirm
#which metric the sweep is meant to rank runs by
with open("config/svm_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [11]:
#Display the loaded sweep config (a nested dictionary) as the cell output
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'C': {'min': 0, 'max': 10, 'distribution': 'uniform'},
  'kernel': {'values': ['linear', 'rbf']},
  'probability': {'value': True},
  'random_state': {'value': 42}}}

#### SemEval_A - TF-IDF

In [13]:
#Don't forget to name the sweep instance
name = 'svm_tfidf_sem_eval_A' #change here
sweep_config['name'] = name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_semeval_A, #Change here
          val=val_semeval_A): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer.

    Runs one sweep trial: fits a TfidfVectorizer + SVC pipeline on `train` and
    logs the validation metrics of that pipeline to WandB.

    Parameters
    ----------
    config : forwarded to wandb.init; the hyperparameters (C, kernel,
        probability, random_state) are then read back from wandb.config.
    train, val : tables with a 'text' and a 'label' column. The defaults are
        bound when this cell is executed, so re-run the cell after changing them.

    Logged keys: "accuracy", "f1 macro", "AUC-PC", "AUC". The two AUC entries
    are the placeholder '-' when `train` holds more than two distinct labels.
    '''
    # Initialize a new wandb run, grouped under the sweep name set above
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        #Create the pipeline with the hyperparameters of this trial
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', SVC(C=config.C,
                        kernel=config.kernel,
                        probability=config.probability,
                        random_state=config.random_state)),
        ])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])

        #Make predictions: hard labels, and the second column of predict_proba
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]

        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #The AUC metrics are only computed when there are at most two labels
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep fails or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER) #Count : number of iterations
finally:
    emissions = tracker.stop()
#Show the value returned by the tracker as the cell output
emissions



Create sweep with ID: 7kcedxx0
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/7kcedxx0


[34m[1mwandb[0m: Agent Starting Run: g9w7bd4q with config:
[34m[1mwandb[0m: 	C: 7.282915133554417
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.67176
AUC-PC,0.70036
accuracy,0.61257
f1 macro,0.6118


[34m[1mwandb[0m: Agent Starting Run: lsfhygak with config:
[34m[1mwandb[0m: 	C: 8.88045703400401
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.67176
AUC-PC,0.70036
accuracy,0.61257
f1 macro,0.6118


[34m[1mwandb[0m: Agent Starting Run: rz19pvvs with config:
[34m[1mwandb[0m: 	C: 9.011481644548777
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.67176
AUC-PC,0.70036
accuracy,0.61257
f1 macro,0.6118


[34m[1mwandb[0m: Agent Starting Run: fuacq2xe with config:
[34m[1mwandb[0m: 	C: 8.538026223601968
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.67176
AUC-PC,0.70036
accuracy,0.61257
f1 macro,0.6118


[34m[1mwandb[0m: Agent Starting Run: oceuibkp with config:
[34m[1mwandb[0m: 	C: 2.1847199715785317
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.67117
AUC-PC,0.70061
accuracy,0.61126
f1 macro,0.61053


[34m[1mwandb[0m: Agent Starting Run: kf330fy6 with config:
[34m[1mwandb[0m: 	C: 1.6160155357409556
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.66357
AUC-PC,0.68406
accuracy,0.62304
f1 macro,0.62167


[34m[1mwandb[0m: Agent Starting Run: ym5kmj8b with config:
[34m[1mwandb[0m: 	C: 4.488779686242777
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.65684
AUC-PC,0.67575
accuracy,0.62435
f1 macro,0.62224


[34m[1mwandb[0m: Agent Starting Run: 0lrzz6nm with config:
[34m[1mwandb[0m: 	C: 2.1909662827183585
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.67116
AUC-PC,0.70061
accuracy,0.61126
f1 macro,0.61053


[34m[1mwandb[0m: Agent Starting Run: s5nptf52 with config:
[34m[1mwandb[0m: 	C: 9.440950383520676
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.67176
AUC-PC,0.70036
accuracy,0.61257
f1 macro,0.6118


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: wwd585sp with config:
[34m[1mwandb[0m: 	C: 7.783468564491618
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.6531
AUC-PC,0.66967
accuracy,0.60995
f1 macro,0.60799


0.00286378771483967

#### SemEval_A - fastText

In [None]:
#Don't forget to name the sweep instance
name = 'svm_fasttext_sem_eval_A' #change here
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_semeval_A, #Change here
          val=embedded_val_semeval_A): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization on fasttext embeddings.

    Runs one sweep trial: fits an SVC on the embedding columns of `train` and
    logs the validation metrics to WandB.

    Parameters
    ----------
    config : forwarded to wandb.init; the hyperparameters (C, kernel,
        probability, random_state) are then read back from wandb.config.
    train, val : embedding tables with a 'label' column; every other column is
        used as a feature and missing values are replaced by 0. The defaults
        are bound when this cell is executed, so re-run the cell after changing them.

    Logged keys: "accuracy", "f1 macro", "AUC-PC", "AUC". The two AUC entries
    are the placeholder '-' when `train` holds more than two distinct labels.
    '''
    # Initialize a new wandb run, grouped under the sweep name set above
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        clf = SVC(C=config.C,
                  kernel=config.kernel,
                  probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once (NaN -> 0, label column removed)
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Make predictions: hard labels, and the second column of predict_proba
        pred_val = pipe.predict(features_val)
        pred_prob_val = pipe.predict_proba(features_val)[:,1]

        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #The AUC metrics are only computed when there are at most two labels
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep fails or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Show the value returned by the tracker as the cell output
emissions

#### iSarcasm


In [15]:
#Don't forget to name the sweep instance
name = 'svm_tfidf_isarcasm' #change here
sweep_config['name'] = name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_isarcasm, #Change here
          val=val_isarcasm): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer.

    Runs one sweep trial: fits a TfidfVectorizer + SVC pipeline on `train` and
    logs the validation metrics of that pipeline to WandB.

    Parameters
    ----------
    config : forwarded to wandb.init; the hyperparameters (C, kernel,
        probability, random_state) are then read back from wandb.config.
    train, val : tables with a 'text' and a 'label' column. The defaults are
        bound when this cell is executed, so re-run the cell after changing them.

    Logged keys: "accuracy", "f1 macro", "AUC-PC", "AUC". The two AUC entries
    are the placeholder '-' when `train` holds more than two distinct labels.
    '''
    # Initialize a new wandb run, grouped under the sweep name set above
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        #Create the pipeline with the hyperparameters of this trial
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', SVC(C=config.C,
                        kernel=config.kernel,
                        probability=config.probability,
                        random_state=config.random_state)),
        ])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])

        #Make predictions: hard labels, and the second column of predict_proba
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]

        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #The AUC metrics are only computed when there are at most two labels
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep fails or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Show the value returned by the tracker as the cell output
emissions

Create sweep with ID: xidyi5yn
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/xidyi5yn


[34m[1mwandb[0m: Agent Starting Run: oq0rx75c with config:
[34m[1mwandb[0m: 	C: 4.4452198314450895
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58344
AUC-PC,0.3627
accuracy,0.66138
f1 macro,0.53954


[34m[1mwandb[0m: Agent Starting Run: ree1kc97 with config:
[34m[1mwandb[0m: 	C: 8.095630511750656
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60808
AUC-PC,0.39529
accuracy,0.7147
f1 macro,0.47675


[34m[1mwandb[0m: Agent Starting Run: 0d73br81 with config:
[34m[1mwandb[0m: 	C: 4.076627957970672
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60749
AUC-PC,0.39553
accuracy,0.71326
f1 macro,0.47597


[34m[1mwandb[0m: Agent Starting Run: myi3g60z with config:
[34m[1mwandb[0m: 	C: 8.711809388656558
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57594
AUC-PC,0.35253
accuracy,0.64841
f1 macro,0.53032


[34m[1mwandb[0m: Agent Starting Run: o5tbm6y3 with config:
[34m[1mwandb[0m: 	C: 0.6197135509783236
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.61064
AUC-PC,0.40321
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: rgx89l06 with config:
[34m[1mwandb[0m: 	C: 2.010502736669105
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60674
AUC-PC,0.39543
accuracy,0.7147
f1 macro,0.47298


[34m[1mwandb[0m: Agent Starting Run: adeac8iq with config:
[34m[1mwandb[0m: 	C: 4.618579602689987
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60772
AUC-PC,0.39495
accuracy,0.7147
f1 macro,0.47675


[34m[1mwandb[0m: Agent Starting Run: pouvn8nx with config:
[34m[1mwandb[0m: 	C: 9.222895787876944
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60808
AUC-PC,0.39529
accuracy,0.7147
f1 macro,0.47675


[34m[1mwandb[0m: Agent Starting Run: dj6vvoe0 with config:
[34m[1mwandb[0m: 	C: 2.8700333116949226
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58721
AUC-PC,0.37344
accuracy,0.67579
f1 macro,0.54607


[34m[1mwandb[0m: Agent Starting Run: 7uzoh51e with config:
[34m[1mwandb[0m: 	C: 3.983777311377361
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58455
AUC-PC,0.36583
accuracy,0.66571
f1 macro,0.54264


0.0011472796167397003

In [12]:
#Don't forget to name the sweep instance
name = 'svm_ft_isarcasm' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_isarcasm, #CHANGE HERE
          val=embedded_val_isarcasm): #CHANGE HERE
    '''
    Generic WandB function to conduct hyperparameter optimization on fasttext embeddings.

    Runs one sweep trial: fits an SVC on the embedding columns of `train` and
    logs the validation metrics to WandB.

    Parameters
    ----------
    config : forwarded to wandb.init; the hyperparameters (C, kernel,
        probability, random_state) are then read back from wandb.config.
    train, val : embedding tables with a 'label' column; every other column is
        used as a feature and missing values are replaced by 0. The defaults
        are bound when this cell is executed, so re-run the cell after changing them.

    Logged keys: "accuracy", "f1 macro", "AUC-PC", "AUC". The two AUC entries
    are the placeholder '-' when `train` holds more than two distinct labels.
    '''
    # Initialize a new wandb run, grouped under the sweep name set above
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        clf = SVC(C=config.C,
                  kernel=config.kernel,
                  probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once (NaN -> 0, label column removed)
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Make predictions: hard labels, and the second column of predict_proba
        pred_val = pipe.predict(features_val)
        pred_prob_val = pipe.predict_proba(features_val)[:,1]

        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #The AUC metrics are only computed when there are at most two labels
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep fails or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Show the value returned by the tracker as the cell output
emissions

Create sweep with ID: k8wbpdlk
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/k8wbpdlk


[34m[1mwandb[0m: Agent Starting Run: gg5wibc3 with config:
[34m[1mwandb[0m: 	C: 7.851708468651158
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60986
AUC-PC,0.38184
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: ixfba9h3 with config:
[34m[1mwandb[0m: 	C: 9.080114999442609
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5819
AUC-PC,0.38312
accuracy,0.70461
f1 macro,0.52467


[34m[1mwandb[0m: Agent Starting Run: 32oh5z17 with config:
[34m[1mwandb[0m: 	C: 7.47663433634291
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58022
AUC-PC,0.38509
accuracy,0.70749
f1 macro,0.52105


[34m[1mwandb[0m: Agent Starting Run: uutb5mzo with config:
[34m[1mwandb[0m: 	C: 6.414152440093673
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.61003
AUC-PC,0.38276
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: wgmi2z11 with config:
[34m[1mwandb[0m: 	C: 6.276928260747047
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.61029
AUC-PC,0.38315
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: btqowkbz with config:
[34m[1mwandb[0m: 	C: 0.7534611721924556
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57482
AUC-PC,0.36621
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: 509znxic with config:
[34m[1mwandb[0m: 	C: 2.1460736463617125
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57543
AUC-PC,0.3683
accuracy,0.71326
f1 macro,0.43507


[34m[1mwandb[0m: Agent Starting Run: uczei5ny with config:
[34m[1mwandb[0m: 	C: 2.456731548339426
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57545
AUC-PC,0.36825
accuracy,0.71614
f1 macro,0.44516


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: td2uf7qw with config:
[34m[1mwandb[0m: 	C: 3.8777306577128856
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.60964
AUC-PC,0.38298
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: f3j4hdeg with config:
[34m[1mwandb[0m: 	C: 1.654056475280956
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.6091
AUC-PC,0.38349
accuracy,0.71326
f1 macro,0.41632


0.0018644592303644647

#### sarc

In [None]:
#Don't forget to name the sweep instance
name = 'svm_tfidf_sarc' #change here
sweep_config['name'] = name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_sarc, #Change here
          val=val_sarc): #change here
    '''
    Generic WandB function to conduct hyperparameter optimization with tf-idf vectorizer.

    Runs one sweep trial: fits a TfidfVectorizer + SVC pipeline on `train` and
    logs the validation metrics of that pipeline to WandB.

    Parameters
    ----------
    config : forwarded to wandb.init; the hyperparameters (C, kernel,
        probability, random_state) are then read back from wandb.config.
    train, val : tables with a 'text' and a 'label' column. The defaults are
        bound when this cell is executed, so re-run the cell after changing them.

    Logged keys: "accuracy", "f1 macro", "AUC-PC", "AUC". The two AUC entries
    are the placeholder '-' when `train` holds more than two distinct labels.
    '''
    # Initialize a new wandb run, grouped under the sweep name set above
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config

        #Create the pipeline with the hyperparameters of this trial
        pipe = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', SVC(C=config.C,
                        kernel=config.kernel,
                        probability=config.probability,
                        random_state=config.random_state)),
        ])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])

        #Make predictions: hard labels, and the second column of predict_proba
        pred_val = pipe.predict(val['text'])
        pred_prob_val = pipe.predict_proba(val['text'])[:,1]

        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #The AUC metrics are only computed when there are at most two labels
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep fails or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Show the value returned by the tracker as the cell output
emissions

In [46]:
#Don't forget to name the sweep instance
name = 'svm_ft_sarc' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_sarc, #CHANGE HERE
          val=embedded_val_sarc): #CHANGE HERE
    '''
    Generic WandB function to conduct hyperparameter optimization on fasttext embeddings.

    Runs one sweep trial: fits an SVC on the embedding columns of `train` and
    logs the validation metrics to WandB.

    Parameters
    ----------
    config : forwarded to wandb.init; the hyperparameters (C, kernel,
        probability, random_state) are then read back from wandb.config.
    train, val : embedding tables with a 'label' column; every other column is
        used as a feature and missing values are replaced by 0. The defaults
        are bound when this cell is executed, so re-run the cell after changing them.

    Logged keys: "accuracy", "f1 macro", "AUC-PC", "AUC". The two AUC entries
    are the placeholder '-' when `train` holds more than two distinct labels.
    '''
    # Initialize a new wandb run, grouped under the sweep name set above
    with wandb.init(config=config, group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        clf = SVC(C=config.C,
                  kernel=config.kernel,
                  probability=config.probability,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once (NaN -> 0, label column removed)
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Make predictions: hard labels, and the second column of predict_proba
        pred_val = pipe.predict(features_val)
        pred_prob_val = pipe.predict_proba(features_val)[:,1]

        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #The AUC metrics are only computed when there are at most two labels
        if train['label'].nunique() <=2:
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent; the tracker is stopped even if the sweep fails or is interrupted
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Show the value returned by the tracker as the cell output
emissions



Create sweep with ID: zb95v8ac
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/zb95v8ac


[34m[1mwandb[0m: Agent Starting Run: ed4d1fq3 with config:
[34m[1mwandb[0m: 	C: 0.001
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.6363
AUC-PC,0.61755
accuracy,0.60246
f1 macro,0.60236


[34m[1mwandb[0m: Agent Starting Run: s2k5am82 with config:
[34m[1mwandb[0m: 	C: 0.01
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.64789
AUC-PC,0.63131
accuracy,0.61092
f1 macro,0.61085


[34m[1mwandb[0m: Agent Starting Run: 96wok6si with config:
[34m[1mwandb[0m: 	C: 0.1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.65033
AUC-PC,0.63416
accuracy,0.61295
f1 macro,0.61291


[34m[1mwandb[0m: Agent Starting Run: vzn31mkt with config:
[34m[1mwandb[0m: 	C: 1
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.65044
AUC-PC,0.63428
accuracy,0.61323
f1 macro,0.61319


[34m[1mwandb[0m: Agent Starting Run: g4gy4r13 with config:
[34m[1mwandb[0m: 	C: 10
[34m[1mwandb[0m: 	penalty: l2
[34m[1mwandb[0m: 	random_state: 42
[34m[1mwandb[0m: 	solver: lbfgs


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.65046
AUC-PC,0.63427
accuracy,0.6133
f1 macro,0.61326


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Sweep Agent: Exiting.


0.0008125484472224113