In [1]:
#Connect to Weights & Biases (wandb): authenticate, then open a run in the "svm" project
#TODO: work out how to save models on the Weights & Biases platform
import wandb
wandb.login()
wandb.init(project="svm", 
           entity="benchmark-nlp",
           name='fake news datasets svm') #CHANGE: run name to adapt per notebook

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import os
#Move back to the root directory of the project (three levels above the notebook).
#The directory the kernel started in is remembered the first time this cell runs,
#so re-running the cell always lands in the same place instead of climbing three
#more levels on every execution.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..', '..', '..'))

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [4]:
#Set constant values
#SEED is passed as `seed` to data_splitter; OPT_ITER is the `count` given to each wandb.agent call
SEED=42
OPT_ITER=10

## Load data

In [5]:
# Load the fake-news collection through the project DataLoader.
dl = DataLoader(['fake_news'])
data = dl.load()

# Two preprocessors: one configured with is_tweet=True, one with the defaults.
# NOTE(review): tweet_preprocessor is not used by any cell visible in this notebook - confirm it is still needed.
tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

# Hyperparameter optimization does not use the test sets, so the third value
# returned by data_splitter is discarded.
train_gossipcop, val_gossipcop, _ = data_splitter(
    data['gossipcop'],
    preprocessor,
    create_val_set=True,  # No validation set is provided
    seed=SEED,
)

100%|███████████████████████████████████████████████████████████████████████████| 13267/13267 [01:20<00:00, 165.40it/s]
100%|████████████████████████████████████████████████████████████████████████████| 5323/5323 [00:04<00:00, 1096.75it/s]


In [6]:
# CoAID: explicit test/validation fractions are passed to data_splitter;
# the third returned value (test set) is discarded, as for the other datasets.
train_coaid, val_coaid, _ = data_splitter(
    data['CoAID'],
    preprocessor,
    create_val_set=True,  # No validation set is provided
    test_split=0.25,
    val_split=0.2,
    seed=SEED,
)

5457 rows preprocessed in 6.509600639343262 seconds


In [6]:
# LIAR: the third returned value (test set) is discarded.
# NOTE(review): this cell's output reports three preprocessed sets (10269 / 1283 / 1284 rows),
# which suggests LIAR may already ship with its own validation split - confirm whether
# create_val_set=True is really wanted here.
train_liar, val_liar, _ = data_splitter(
    data['liar'],
    preprocessor,
    create_val_set=True,
    seed=SEED,
)

10269 rows preprocessed in 5.885899305343628 seconds
1283 rows preprocessed in 0.9678049087524414 seconds
1284 rows preprocessed in 0.7875833511352539 seconds


In [7]:
%%time
#Instantiate the project's FastTextEmbeddings wrapper and load the fastText model file.
#The path is relative to the current working directory (changed by the os.chdir cell above).
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

CPU times: total: 18.9 s
Wall time: 42.2 s




In [9]:
# Sentence embeddings for the GossipCop splits. The labels are attached as a
# 'label' column so that features and target travel together in one frame.
embedded_train_gossipcop = (
    fasttext.generate_sentence_embeddings(train_gossipcop['text'])
    .assign(label=train_gossipcop['label'].to_list())
)
embedded_val_gossipcop = (
    fasttext.generate_sentence_embeddings(val_gossipcop['text'])
    .assign(label=val_gossipcop['label'].to_list())
)

Starting to generate sentence embeddings


100%|████████████████████████████████████████████████████████████████████████████| 11897/11897 [05:12<00:00, 38.01it/s]


Starting to generate sentence embeddings


100%|██████████████████████████████████████████████████████████████████████████████| 2975/2975 [01:07<00:00, 43.97it/s]


In [8]:
# Sentence embeddings for the CoAID splits. The labels are attached as a
# 'label' column so that features and target travel together in one frame.
embedded_train_coaid = (
    fasttext.generate_sentence_embeddings(train_coaid['text'])
    .assign(label=train_coaid['label'].to_list())
)
embedded_val_coaid = (
    fasttext.generate_sentence_embeddings(val_coaid['text'])
    .assign(label=val_coaid['label'].to_list())
)

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3273/3273 [00:17<00:00, 185.00it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 819/819 [00:04<00:00, 178.04it/s]


In [10]:
# Sentence embeddings for the LIAR splits. The labels are attached as a
# 'label' column so that features and target travel together in one frame.
embedded_train_liar = (
    fasttext.generate_sentence_embeddings(train_liar['text'])
    .assign(label=train_liar['label'].to_list())
)
embedded_val_liar = (
    fasttext.generate_sentence_embeddings(val_liar['text'])
    .assign(label=val_liar['label'].to_list())
)

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 10269/10269 [00:19<00:00, 536.31it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 1284/1284 [00:02<00:00, 595.21it/s]


## Hyperopt

In [9]:
import yaml
#Load the template YAML sweep configuration for the SVM sweeps
#If the value range of a hyperparameter needs to change, edit the .yaml file rather than this notebook
with open("config/svm_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [10]:
#The config is displayed as a nested dictionary
#NOTE(review): the displayed config optimizes a metric named 'loss', but the training functions in
#this notebook only log "accuracy", "f1 macro", "AUC-PC" and "AUC" - confirm the metric name in the YAML.
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'C': {'min': 0, 'max': 10, 'distribution': 'uniform'},
  'kernel': {'values': ['linear', 'rbf']},
  'probability': {'value': True},
  'random_state': {'value': 42}}}

#### gossipcop

In [17]:
#Name the sweep instance: `name` is reused below as the W&B run group and the codecarbon project name
name = 'svm_tfidf_gossipcop' #change here
sweep_config['name'] =  name

#Register the sweep in the "svm" project and keep its id for wandb.agent
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config=None,
                train=train_gossipcop,  #Change here
                val=val_gossipcop):  #change here
    '''
    Run one W&B sweep trial: fit a TF-IDF + SVC pipeline on `train`, score it on `val`.

    config: forwarded to wandb.init; the hyperparameters (C, kernel, random_state)
            are then read back from wandb.config, which the sweep controller fills
            in when the function is launched through wandb.agent.
    train, val: indexed by 'text' (pipeline input) and 'label' (target).

    Logs accuracy and macro F1. "AUC-PC" and "AUC" are logged as the placeholder '-'
    because no probability estimates are computed; the sweep's `probability` value
    is not forwarded to SVC.
    '''
    with wandb.init(config=config, group=name):
        params = wandb.config
        model = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', SVC(C=params.C,
                        kernel=params.kernel,
                        random_state=params.random_state)),
        ])
        model.fit(train['text'], train['label'])

        #Score the validation split
        y_true = val['label']
        y_pred = model.predict(val['text'])

        #Log metrics on WandB
        wandb.log({
            "accuracy": accuracy_score(y_true, y_pred),
            "f1 macro": f1_score(y_true, y_pred, average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        })

#Track emissions of the whole sweep with codecarbon
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent for OPT_ITER trials.
#try/finally guarantees the tracker is stopped even if the sweep raises or is interrupted.
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Display the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: mskvuhxo
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/mskvuhxo


[34m[1mwandb[0m: Agent Starting Run: jy564qit with config:
[34m[1mwandb[0m: 	C: 1.5448114798644297
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.82252
f1 macro,0.75604


[34m[1mwandb[0m: Agent Starting Run: j6zrn35e with config:
[34m[1mwandb[0m: 	C: 0.1057792824586412
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.75966
f1 macro,0.57798


[34m[1mwandb[0m: Agent Starting Run: vyqnis58 with config:
[34m[1mwandb[0m: 	C: 8.384661965828267
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79496
f1 macro,0.73301


[34m[1mwandb[0m: Agent Starting Run: ico828oz with config:
[34m[1mwandb[0m: 	C: 3.3558821804571126
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.80706
f1 macro,0.74507


[34m[1mwandb[0m: Agent Starting Run: 6o6n4rmr with config:
[34m[1mwandb[0m: 	C: 5.213582743208183
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.82319
f1 macro,0.75212


[34m[1mwandb[0m: Agent Starting Run: 2jpzb706 with config:
[34m[1mwandb[0m: 	C: 6.521356232841739
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.82353
f1 macro,0.75247


[34m[1mwandb[0m: Agent Starting Run: s70s7w74 with config:
[34m[1mwandb[0m: 	C: 2.948542285062985
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.81008
f1 macro,0.74785


[34m[1mwandb[0m: Agent Starting Run: te3q8pe3 with config:
[34m[1mwandb[0m: 	C: 2.9442077430071567
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.82319
f1 macro,0.75212


[34m[1mwandb[0m: Agent Starting Run: mdza4j13 with config:
[34m[1mwandb[0m: 	C: 1.783174372431039
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.81916
f1 macro,0.75355


[34m[1mwandb[0m: Agent Starting Run: weihfsag with config:
[34m[1mwandb[0m: 	C: 3.0250199756773757
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.82319
f1 macro,0.75212


0.008253153762995248

In [20]:
#Name the sweep instance: `name` is reused below as the W&B run group and the codecarbon project name
name = 'svm_ft_gossipcop' #CHANGE HERE
sweep_config['name'] = name
#Register the sweep in the "svm" project and keep its id for wandb.agent
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config=None,
                   train=embedded_train_gossipcop,  #CHANGE HERE
                   val=embedded_val_gossipcop):  #CHANGE HERE
    '''
    Run one W&B sweep trial: fit an SVC on fastText sentence embeddings, score it on `val`.

    config: forwarded to wandb.init; the hyperparameters (C, kernel, random_state)
            are then read back from wandb.config.
    train, val: frames whose 'label' column is the target; every other column is
                used as a feature, with missing values replaced by 0.

    Logs accuracy and macro F1. "AUC-PC" and "AUC" are logged as the placeholder '-'
    because no probability estimates are computed; the sweep's `probability` value
    is not forwarded to SVC.
    '''
    with wandb.init(config=config, group=name):
        params = wandb.config
        model = Pipeline([
            ('clf', SVC(C=params.C,
                        kernel=params.kernel,
                        random_state=params.random_state)),
        ])

        #Separate features from the target column
        features_train = train.fillna(0).drop(columns=['label'])
        features_val = val.fillna(0).drop(columns=['label'])
        model.fit(features_train, train['label'])

        #Score the validation split
        y_true = val['label']
        y_pred = model.predict(features_val)
        wandb.log({
            "accuracy": accuracy_score(y_true, y_pred),
            "f1 macro": f1_score(y_true, y_pred, average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        })


#Track emissions of the whole sweep with codecarbon
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent for OPT_ITER trials.
#try/finally guarantees the tracker is stopped even if the sweep raises or is interrupted.
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Display the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: 498qfn87
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/498qfn87


[34m[1mwandb[0m: Agent Starting Run: 0dvs3zum with config:
[34m[1mwandb[0m: 	C: 3.915236013738065
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79059
f1 macro,0.67744


[34m[1mwandb[0m: Agent Starting Run: bd57u7td with config:
[34m[1mwandb[0m: 	C: 4.897724640497741
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79193
f1 macro,0.68375


[34m[1mwandb[0m: Agent Starting Run: ho5fnk00 with config:
[34m[1mwandb[0m: 	C: 6.882852355324076
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.80235
f1 macro,0.7229


[34m[1mwandb[0m: Agent Starting Run: u6qo14on with config:
[34m[1mwandb[0m: 	C: 2.296622619929817
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.78353
f1 macro,0.65918


[34m[1mwandb[0m: Agent Starting Run: ahhtsdrc with config:
[34m[1mwandb[0m: 	C: 6.014932754508839
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.80134
f1 macro,0.72107


[34m[1mwandb[0m: Agent Starting Run: ets496qo with config:
[34m[1mwandb[0m: 	C: 6.171348596007412
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79193
f1 macro,0.68673


[34m[1mwandb[0m: Agent Starting Run: c757tybp with config:
[34m[1mwandb[0m: 	C: 3.9865508504743055
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79025
f1 macro,0.67751


[34m[1mwandb[0m: Agent Starting Run: 9akcakx1 with config:
[34m[1mwandb[0m: 	C: 7.903708836719603
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.79328
f1 macro,0.69305


[34m[1mwandb[0m: Agent Starting Run: a16xbq0o with config:
[34m[1mwandb[0m: 	C: 1.1031642164605016
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.76269
f1 macro,0.60009


[34m[1mwandb[0m: Agent Starting Run: cquqo1ev with config:
[34m[1mwandb[0m: 	C: 8.340172667482582
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.80336
f1 macro,0.72418


0.0013233434469840097

#### CoAID




In [13]:
#Name the sweep instance: `name` is reused below as the W&B run group and the codecarbon project name
name = 'svm_tfidf_coaid' #change here
sweep_config['name'] =  name

#Register the sweep in the "svm" project and keep its id for wandb.agent
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config = None,
          train=train_coaid, #Change here
          val=val_coaid): #change here
    '''
    Run one W&B sweep trial: fit a TF-IDF + SVC pipeline on `train`, score it on `val`.

    config: forwarded to wandb.init; the hyperparameters (C, kernel, random_state)
            are then read back from wandb.config, which the sweep controller fills
            in when the function is launched through wandb.agent.
    train, val: indexed by 'text' (pipeline input) and 'label' (target).

    Logs accuracy and macro F1. "AUC-PC" and "AUC" are logged as the placeholder '-'
    because no probability estimates are computed.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, as below,
        # this config will be set by Sweep Controller
        config = wandb.config
        vec = TfidfVectorizer()
        #The sweep's `probability` value is deliberately NOT forwarded to SVC:
        #only predict() is called below, so enabling probability estimates would
        #just add training cost (and inflate the tracked emissions) without
        #changing any logged metric. This matches the gossipcop and liar cells.
        clf = SVC(C=config.C,
                  kernel=config.kernel,
                  random_state=config.random_state) #set the hyperparams here

        #Create the pipeline
        pipe = Pipeline([('vectorizer',vec),('clf',clf)])
        #Fit the pipeline
        pipe.fit(train['text'],train['label'])

        #Make predictions
        pred_val = pipe.predict(val['text'])
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #Placeholders: AUC metrics need scores/probabilities, which are not computed here
        aucpc = '-'
        auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions of the whole sweep with codecarbon
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent for OPT_ITER trials.
#try/finally guarantees the tracker is stopped even if the sweep raises or is interrupted.
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Display the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: 7864icfp
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/7864icfp


[34m[1mwandb[0m: Agent Starting Run: xklx30qw with config:
[34m[1mwandb[0m: 	C: 1.2549147050909304
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93529
f1 macro,0.87107


[34m[1mwandb[0m: Agent Starting Run: u31a1sas with config:
[34m[1mwandb[0m: 	C: 9.473743509307642
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93895
f1 macro,0.88041


[34m[1mwandb[0m: Agent Starting Run: n2dticbu with config:
[34m[1mwandb[0m: 	C: 4.192639323517008
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94505
f1 macro,0.89932


[34m[1mwandb[0m: Agent Starting Run: iy0d2hkf with config:
[34m[1mwandb[0m: 	C: 8.062956945816037
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94505
f1 macro,0.89932


[34m[1mwandb[0m: Agent Starting Run: 5c6i2i0d with config:
[34m[1mwandb[0m: 	C: 0.5638690400639157
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.9011
f1 macro,0.77032


[34m[1mwandb[0m: Agent Starting Run: 34fi4mnd with config:
[34m[1mwandb[0m: 	C: 7.053644884205315
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93895
f1 macro,0.88041


[34m[1mwandb[0m: Agent Starting Run: 0v41w68i with config:
[34m[1mwandb[0m: 	C: 5.364537655186078
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94505
f1 macro,0.89932


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: r5jyww34 with config:
[34m[1mwandb[0m: 	C: 6.782581079050192
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94505
f1 macro,0.89932


[34m[1mwandb[0m: Agent Starting Run: 65s079mp with config:
[34m[1mwandb[0m: 	C: 9.582743813985978
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93895
f1 macro,0.88041


[34m[1mwandb[0m: Agent Starting Run: w8l8q8r9 with config:
[34m[1mwandb[0m: 	C: 9.271378712573156
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94505
f1 macro,0.89932


0.0018849464036439843

In [14]:
#Name the sweep instance: `name` is reused below as the W&B run group and the codecarbon project name
name = 'svm_ft_coaid' #CHANGE HERE
sweep_config['name'] = name
#Register the sweep in the "svm" project and keep its id for wandb.agent
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config = None,
          train=embedded_train_coaid, #CHANGE HERE
          val=embedded_val_coaid): #CHANGE HERE
    '''
    Run one W&B sweep trial: fit an SVC on fastText sentence embeddings, score it on `val`.

    config: forwarded to wandb.init; the hyperparameters (C, kernel, random_state)
            are then read back from wandb.config.
    train, val: frames whose 'label' column is the target; every other column is
                used as a feature, with missing values replaced by 0.

    Logs accuracy and macro F1. "AUC-PC" and "AUC" are logged as the placeholder '-'
    because no probability estimates are computed.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        #The sweep's `probability` value is deliberately NOT forwarded to SVC:
        #only predict() is called below, so enabling probability estimates would
        #just add training cost (and inflate the tracked emissions) without
        #changing any logged metric. This matches the gossipcop and liar cells.
        clf = SVC(C=config.C,
                  kernel=config.kernel,
                  random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf',clf)])
        pipe.fit(train.fillna(0).drop(['label'],axis=1),train['label'])

        #Make predictions
        pred_val = pipe.predict(val.fillna(0).drop(['label'],axis=1))
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        #Placeholders: AUC metrics need scores/probabilities, which are not computed here
        aucpc = '-'
        auc = '-'
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions of the whole sweep with codecarbon
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent for OPT_ITER trials.
#try/finally guarantees the tracker is stopped even if the sweep raises or is interrupted.
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Display the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: exsblwih
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/exsblwih


[34m[1mwandb[0m: Agent Starting Run: zymhge0i with config:
[34m[1mwandb[0m: 	C: 5.398691171394323
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94017
f1 macro,0.88547


[34m[1mwandb[0m: Agent Starting Run: hb4g82p0 with config:
[34m[1mwandb[0m: 	C: 3.4145513785191883
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93895
f1 macro,0.8857


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 5th1a53w with config:
[34m[1mwandb[0m: 	C: 1.731427786281593
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93529
f1 macro,0.87847


[34m[1mwandb[0m: Agent Starting Run: lbxx69zn with config:
[34m[1mwandb[0m: 	C: 4.87278354691027
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94139
f1 macro,0.89096


[34m[1mwandb[0m: Agent Starting Run: 345dthhf with config:
[34m[1mwandb[0m: 	C: 4.0042051255092135
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93895
f1 macro,0.88199


[34m[1mwandb[0m: Agent Starting Run: wkrusj8n with config:
[34m[1mwandb[0m: 	C: 0.7162005724148557
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93773
f1 macro,0.8845


[34m[1mwandb[0m: Agent Starting Run: qwdxkrsz with config:
[34m[1mwandb[0m: 	C: 3.905523113184115
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93773
f1 macro,0.87923


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 6msevvvq with config:
[34m[1mwandb[0m: 	C: 4.6179974209141115
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93895
f1 macro,0.8857


[34m[1mwandb[0m: Agent Starting Run: ihdlfqrj with config:
[34m[1mwandb[0m: 	C: 5.182105637836203
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.94017
f1 macro,0.88547


[34m[1mwandb[0m: Agent Starting Run: 0dnw7l4s with config:
[34m[1mwandb[0m: 	C: 2.8635892835654575
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.93651
f1 macro,0.87646


0.0017498850723012796

#### liar

In [21]:
#Name the sweep instance: `name` is reused below as the W&B run group and the codecarbon project name
name = 'svm_tfidf_liar' #change here
sweep_config['name'] =  name

#Register the sweep in the "svm" project and keep its id for wandb.agent
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_tfidf(config=None,
                train=train_liar,  #Change here
                val=val_liar):  #change here
    '''
    Run one W&B sweep trial: fit a TF-IDF + SVC pipeline on `train`, score it on `val`.

    config: forwarded to wandb.init; the hyperparameters (C, kernel, random_state)
            are then read back from wandb.config.
    train, val: indexed by 'text' (pipeline input) and 'label' (target).

    Logs accuracy and macro F1. "AUC-PC" and "AUC" are logged as the placeholder '-'
    because no probability estimates are computed; the sweep's `probability` value
    is not forwarded to SVC.
    '''
    with wandb.init(config=config, group=name):
        params = wandb.config
        model = Pipeline([
            ('vectorizer', TfidfVectorizer()),
            ('clf', SVC(C=params.C,
                        kernel=params.kernel,
                        random_state=params.random_state)),
        ])
        model.fit(train['text'], train['label'])

        #Score the validation split
        y_true = val['label']
        y_pred = model.predict(val['text'])

        #Log metrics on WandB
        wandb.log({
            "accuracy": accuracy_score(y_true, y_pred),
            "f1 macro": f1_score(y_true, y_pred, average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        })

#Track emissions of the whole sweep with codecarbon
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent for OPT_ITER trials.
#try/finally guarantees the tracker is stopped even if the sweep raises or is interrupted.
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Display the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: cd9p0s93
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/cd9p0s93


[34m[1mwandb[0m: Agent Starting Run: i33isj0x with config:
[34m[1mwandb[0m: 	C: 2.3737503803591298
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.25
f1 macro,0.23714


[34m[1mwandb[0m: Agent Starting Run: gi3c2uvi with config:
[34m[1mwandb[0m: 	C: 2.955685570511525
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24299
f1 macro,0.23861


[34m[1mwandb[0m: Agent Starting Run: lfmzbtns with config:
[34m[1mwandb[0m: 	C: 3.5116923974374115
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.2391
f1 macro,0.23262


[34m[1mwandb[0m: Agent Starting Run: mpurpy9r with config:
[34m[1mwandb[0m: 	C: 5.086307850684535
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24766
f1 macro,0.23487


[34m[1mwandb[0m: Agent Starting Run: zhr1a4a7 with config:
[34m[1mwandb[0m: 	C: 6.72150001957101
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24844
f1 macro,0.23543


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: tqkgjn86 with config:
[34m[1mwandb[0m: 	C: 1.1908438927869758
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24065
f1 macro,0.23349


[34m[1mwandb[0m: Agent Starting Run: q2cz8ovn with config:
[34m[1mwandb[0m: 	C: 3.4054770167783897
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24533
f1 macro,0.23311


[34m[1mwandb[0m: Agent Starting Run: glekbynm with config:
[34m[1mwandb[0m: 	C: 0.15681678856989456
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.2095
f1 macro,0.10391


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rocnhtoh with config:
[34m[1mwandb[0m: 	C: 2.51907822832776
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.23832
f1 macro,0.23438


[34m[1mwandb[0m: Agent Starting Run: hiy8fp7t with config:
[34m[1mwandb[0m: 	C: 0.7100441070863273
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24299
f1 macro,0.18013


0.001354064435838337

In [22]:
#Name the sweep instance: `name` is reused below as the W&B run group and the codecarbon project name
name = 'svm_ft_liar' #CHANGE HERE
sweep_config['name'] = name
#Register the sweep in the "svm" project and keep its id for wandb.agent
sweep_id = wandb.sweep(sweep_config, project="svm")

def train_fasttext(config=None,
                   train=embedded_train_liar,  #CHANGE HERE
                   val=embedded_val_liar):  #CHANGE HERE
    '''
    Run one W&B sweep trial: fit an SVC on fastText sentence embeddings, score it on `val`.

    config: forwarded to wandb.init; the hyperparameters (C, kernel, random_state)
            are then read back from wandb.config.
    train, val: frames whose 'label' column is the target; every other column is
                used as a feature, with missing values replaced by 0.

    Logs accuracy and macro F1. "AUC-PC" and "AUC" are logged as the placeholder '-'
    because no probability estimates are computed; the sweep's `probability` value
    is not forwarded to SVC.
    '''
    with wandb.init(config=config, group=name):
        params = wandb.config
        model = Pipeline([
            ('clf', SVC(C=params.C,
                        kernel=params.kernel,
                        random_state=params.random_state)),
        ])

        #Separate features from the target column
        features_train = train.fillna(0).drop(columns=['label'])
        features_val = val.fillna(0).drop(columns=['label'])
        model.fit(features_train, train['label'])

        #Score the validation split
        y_true = val['label']
        y_pred = model.predict(features_val)
        wandb.log({
            "accuracy": accuracy_score(y_true, y_pred),
            "f1 macro": f1_score(y_true, y_pred, average='macro'),
            "AUC-PC": '-',
            'AUC': '-',
        })


#Track emissions of the whole sweep with codecarbon
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent for OPT_ITER trials.
#try/finally guarantees the tracker is stopped even if the sweep raises or is interrupted.
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    emissions = tracker.stop()
#Display the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: lpvuntns
Sweep URL: https://wandb.ai/benchmark-nlp/svm/sweeps/lpvuntns


[34m[1mwandb[0m: Agent Starting Run: mooc1uup with config:
[34m[1mwandb[0m: 	C: 6.670558976421033
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.26402
f1 macro,0.25711


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: bi9e471l with config:
[34m[1mwandb[0m: 	C: 0.10324197914680087
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.20016
f1 macro,0.06827


[34m[1mwandb[0m: Agent Starting Run: cf5oxgmb with config:
[34m[1mwandb[0m: 	C: 5.947854826625992
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.26324
f1 macro,0.25855


[34m[1mwandb[0m: Agent Starting Run: f8vegms0 with config:
[34m[1mwandb[0m: 	C: 7.10556065469918
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22664
f1 macro,0.1844


[34m[1mwandb[0m: Agent Starting Run: lo0vmihr with config:
[34m[1mwandb[0m: 	C: 2.317122395898142
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.25545
f1 macro,0.24348


[34m[1mwandb[0m: Agent Starting Run: vsygzi52 with config:
[34m[1mwandb[0m: 	C: 9.637015269137894
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.27259
f1 macro,0.26637


[34m[1mwandb[0m: Agent Starting Run: 6pcduln6 with config:
[34m[1mwandb[0m: 	C: 6.604459807307093
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22586
f1 macro,0.18315


[34m[1mwandb[0m: Agent Starting Run: wmi3a72h with config:
[34m[1mwandb[0m: 	C: 6.074103618514879
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.26713
f1 macro,0.26166


[34m[1mwandb[0m: Agent Starting Run: qcfzq1pd with config:
[34m[1mwandb[0m: 	C: 1.3181643038829982
[34m[1mwandb[0m: 	kernel: rbf
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24065
f1 macro,0.212


[34m[1mwandb[0m: Agent Starting Run: ec7kzsyq with config:
[34m[1mwandb[0m: 	C: 4.409713036244081
[34m[1mwandb[0m: 	kernel: linear
[34m[1mwandb[0m: 	probability: True
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22352
f1 macro,0.17715


0.001796613704196457