In [1]:
# Authenticate with Weights & Biases and open the notebook-level run.
# TODO: work out how to save trained models on the Weights & Biases platform.
import wandb

wandb.login()
wandb.init(
    entity="benchmark-nlp",
    project="hyperopt",
    name='fake news datasets rf',  # CHANGE: name given to this run
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import os
#Move back to the root directory of the project.
#NOTE: the path is relative to the current working directory, so re-running
#this cell moves three more levels up instead of staying at the project root.
os.chdir('../../..')

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [4]:
# Notebook-wide constants
SEED = 42       # seed handed to data_splitter
OPT_ITER = 10   # number of runs each wandb.agent call executes (its `count`)

## Load data

In [5]:
# Load the 'fake_news' dataset collection; individual datasets are looked up by key in `data`.
dl = DataLoader(['fake_news'])
data = dl.load()

# Two preprocessors: one configured for tweets, one with default settings.
# NOTE(review): tweet_preprocessor is not used by any cell visible here - confirm it is still needed.
tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

# Hyperparameter optimization only uses the train and validation splits,
# so the third (test) element returned by data_splitter is discarded.
train_gossipcop, val_gossipcop, _ = data_splitter(
    data['gossipcop'],
    preprocessor,
    seed=SEED,
    create_val_set=True,   # No validation set is provided
)


100%|████████████████████████████████████████████████████████████████████████████| 13267/13267 [02:15<00:00, 97.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5323/5323 [00:08<00:00, 593.35it/s]


18590 rows preprocessed in 268.0978093147278 seconds


In [6]:
# Split CoAID into train/validation sets; the test split is discarded because
# only train and validation data are used for hyperparameter optimization.
train_coaid, val_coaid, _ = data_splitter(
    data['CoAID'],
    preprocessor,
    seed=SEED,
    test_split=0.25,
    val_split=0.2,
    create_val_set=True,   # No validation set is provided
)
# NOTE(review): the LIAR split below is commented out, yet later cells read
# train_liar / val_liar. Re-enable it (after confirming its arguments) before
# running the notebook on a fresh kernel.
# train_liar, val_liar, _ = data_splitter(data['liar'],
#                                  preprocessor, 
#                                  create_val_set=True,   #No validation set is provided
#                                  seed=SEED)

5457 rows preprocessed in 12.108634233474731 seconds


In [7]:
%%time
#Load the fastText model file (cc.en.300.bin) that later cells use to generate sentence embeddings
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')

CPU times: total: 35.2 s
Wall time: 1min 24s




In [9]:
# Sentence embeddings for the GossipCop training split, with the labels attached as a column
embedded_train_gossipcop = fasttext.generate_sentence_embeddings(train_gossipcop['text'])
embedded_train_gossipcop['label'] = train_gossipcop['label'].to_list()

# Same treatment for the validation split
embedded_val_gossipcop = fasttext.generate_sentence_embeddings(val_gossipcop['text'])
embedded_val_gossipcop['label'] = val_gossipcop['label'].to_list()

Starting to generate sentence embeddings


100%|████████████████████████████████████████████████████████████████████████████| 11897/11897 [05:54<00:00, 33.60it/s]


Starting to generate sentence embeddings


100%|██████████████████████████████████████████████████████████████████████████████| 2975/2975 [01:22<00:00, 36.12it/s]


In [8]:
# Sentence embeddings for the CoAID training split, with the labels attached as a column
embedded_train_coaid = fasttext.generate_sentence_embeddings(train_coaid['text'])
embedded_train_coaid['label'] = train_coaid['label'].to_list()

# Same treatment for the validation split
embedded_val_coaid = fasttext.generate_sentence_embeddings(val_coaid['text'])
embedded_val_coaid['label'] = val_coaid['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3273/3273 [00:27<00:00, 117.06it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 819/819 [00:06<00:00, 119.05it/s]


In [10]:
# Sentence embeddings for the LIAR training split, with the labels attached as a column.
# NOTE(review): train_liar / val_liar are not defined by any active code in the cells
# visible here (the data_splitter call for 'liar' is commented out) - confirm before re-running.
embedded_train_liar = fasttext.generate_sentence_embeddings(train_liar['text'])
embedded_train_liar['label'] = train_liar['label'].to_list()

# Same treatment for the validation split
embedded_val_liar = fasttext.generate_sentence_embeddings(val_liar['text'])
embedded_val_liar['label'] = val_liar['label'].to_list()

Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████| 10269/10269 [00:23<00:00, 435.15it/s]


Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 1284/1284 [00:02<00:00, 436.19it/s]


## Hyperopt

In [9]:
import yaml
#Load the template yaml sweep config file for the random forest (config/rf_sweep.yaml)
#If the value range of a hyperparameter needs to be changed, it is better to do it in the .yaml file than in a notebook
with open("config/rf_sweep.yaml", 'r') as stream:
    sweep_config = yaml.safe_load(stream)

In [10]:
#Display the loaded sweep configuration (a nested dictionary)
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'n_estimators': {'min': 10,
   'max': 200,
   'distribution': 'int_uniform'},
  'max_features': {'values': ['log2', 'sqrt']},
  'random_state': {'value': 42}}}

####  politifact   Tf-Idf

#### gossipcop

In [14]:
#Name this sweep instance; `name` is also read by the training function below as the W&B run group
name = 'rf_tfidf_gossipcop' #change here
sweep_config['name'] =  name

#Register the sweep with W&B and keep its id for wandb.agent.
#NOTE(review): the loaded sweep config declares the metric 'loss' (goal: minimize), but the
#training function only logs accuracy, f1 macro, AUC-PC and AUC - confirm which metric is intended.
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_gossipcop, #Change here
          val=val_gossipcop): #change here
    '''
    Run one W&B sweep trial: fit a Tf-Idf + random forest pipeline and log validation metrics.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. The hyperparameters actually used (n_estimators,
        max_features, random_state) are read back from wandb.config, which the
        sweep controller fills in when this function is launched by wandb.agent.
    train, val :
        Training / validation data exposing 'text' and 'label' columns.
        The defaults are bound once, when this cell is executed.

    Logs "accuracy" and "f1 macro" for every run. "AUC-PC" and "AUC" are computed
    from the second column of predict_proba only when the training labels take at
    most two distinct values; otherwise the placeholder '-' is logged for both.
    The run is grouped under the global `name` of the current sweep.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer',TfidfVectorizer()),
            ('clf',RandomForestClassifier(n_estimators=config.n_estimators,
                                          max_features=config.max_features,
                                          random_state=config.random_state)),
        ])
        #Fit the pipeline on the raw training text
        pipe.fit(train['text'],train['label'])

        #Hard predictions drive accuracy and macro F1
        pred_val = pipe.predict(val['text'])
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Probabilities are only needed for the binary ranking metrics, so the
            #extra predict_proba pass is skipped for multi-class data
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions of the whole sweep (output_file points at output/emissions_hyperopt.csv)
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent: run OPT_ITER trials of train_tfidf for this sweep
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    #Always stop the tracker, even if the sweep is interrupted or fails
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: gpm3oikj
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/gpm3oikj


[34m[1mwandb[0m: Agent Starting Run: 37epbru4 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 35
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.8111
AUC-PC,0.88986
accuracy,0.81277
f1 macro,0.71678


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 0y5d8t0j with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 63
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78704
AUC-PC,0.87372
accuracy,0.78084
f1 macro,0.63539


[34m[1mwandb[0m: Agent Starting Run: 9mchv3aw with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 147
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.79528
AUC-PC,0.87664
accuracy,0.7805
f1 macro,0.63406


[34m[1mwandb[0m: Agent Starting Run: rotzmjc8 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 102
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78702
AUC-PC,0.87344
accuracy,0.7795
f1 macro,0.63107


[34m[1mwandb[0m: Agent Starting Run: 6v1grmkc with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 44
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.81758
AUC-PC,0.89343
accuracy,0.81378
f1 macro,0.71946


[34m[1mwandb[0m: Agent Starting Run: 1fzoagdr with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 50
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.77576
AUC-PC,0.86883
accuracy,0.78319
f1 macro,0.64208


[34m[1mwandb[0m: Agent Starting Run: cf0cminu with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 74
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.7854
AUC-PC,0.87257
accuracy,0.78218
f1 macro,0.63814


[34m[1mwandb[0m: Agent Starting Run: w8c8tl5j with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 176
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.82388
AUC-PC,0.89563
accuracy,0.81513
f1 macro,0.71748


[34m[1mwandb[0m: Agent Starting Run: nmvi79gg with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 113
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78823
AUC-PC,0.87386
accuracy,0.7805
f1 macro,0.63302


[34m[1mwandb[0m: Agent Starting Run: nszco1rl with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 153
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.7955
AUC-PC,0.87655
accuracy,0.7805
f1 macro,0.63406


0.004469255448202312

In [15]:
#Name this sweep instance; `name` is also read by the training function below as the W&B run group
name = 'rf_ft_gossipcop' #CHANGE HERE
sweep_config['name'] = name
#Register the sweep with W&B and keep its id for wandb.agent.
#NOTE(review): the loaded sweep config declares the metric 'loss' (goal: minimize), but the
#training function only logs accuracy, f1 macro, AUC-PC and AUC - confirm which metric is intended.
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_gossipcop, #CHANGE HERE
          val=embedded_val_gossipcop): #CHANGE HERE
    '''
    Run one W&B sweep trial: fit a random forest on embedded data and log validation metrics.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. The hyperparameters actually used (n_estimators,
        max_features, random_state) are read back from wandb.config.
    train, val :
        Tables holding the feature columns plus a 'label' column.
        The defaults are bound once, when this cell is executed.

    Missing values are replaced by 0 and the 'label' column is dropped to build
    the feature matrices. Logs "accuracy" and "f1 macro" for every run. "AUC-PC"
    and "AUC" are computed from the second column of predict_proba only when the
    training labels take at most two distinct values; otherwise the placeholder
    '-' is logged for both. The run is grouped under the global `name`.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = RandomForestClassifier(n_estimators=config.n_estimators,
                                max_features=config.max_features,
                                 random_state=config.random_state)
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once instead of once per use
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Hard predictions drive accuracy and macro F1
        pred_val = pipe.predict(features_val)
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Probabilities are only needed for the binary ranking metrics, so the
            #extra predict_proba pass is skipped for multi-class data
            pred_prob_val = pipe.predict_proba(features_val)[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions of the whole sweep (output_file points at output/emissions_hyperopt.csv)
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent: run OPT_ITER trials of train_fasttext for this sweep
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    #Always stop the tracker, even if the sweep is interrupted or fails
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: 8730l14f
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/8730l14f


[34m[1mwandb[0m: Agent Starting Run: pmlwwr6c with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 17
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.75566
AUC-PC,0.85717
accuracy,0.78387
f1 macro,0.68089


[34m[1mwandb[0m: Agent Starting Run: wsjf4g4a with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 188
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78946
AUC-PC,0.87709
accuracy,0.79496
f1 macro,0.68202


[34m[1mwandb[0m: Agent Starting Run: fykfsape with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 113
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78483
AUC-PC,0.87492
accuracy,0.79697
f1 macro,0.68631


[34m[1mwandb[0m: Agent Starting Run: 6jsf4igh with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 77
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78166
AUC-PC,0.87395
accuracy,0.79563
f1 macro,0.68578


[34m[1mwandb[0m: Agent Starting Run: 2n1o25jy with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 65
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.77911
AUC-PC,0.87272
accuracy,0.79697
f1 macro,0.68709


[34m[1mwandb[0m: Agent Starting Run: 7p9xc5da with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 10
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.73076
AUC-PC,0.84186
accuracy,0.76773
f1 macro,0.68241


[34m[1mwandb[0m: Agent Starting Run: e35qcwxc with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 109
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78603
AUC-PC,0.87559
accuracy,0.79597
f1 macro,0.68456


[34m[1mwandb[0m: Agent Starting Run: 347e7nh1 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 137
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.78787
AUC-PC,0.87581
accuracy,0.79664
f1 macro,0.68521


[34m[1mwandb[0m: Agent Starting Run: tbc9styn with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 140
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.79023
AUC-PC,0.87943
accuracy,0.79966
f1 macro,0.69677


[34m[1mwandb[0m: Agent Starting Run: td31rsle with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 94
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.7887
AUC-PC,0.87906
accuracy,0.79966
f1 macro,0.6982


0.0014062016857814795

#### CoAID



In [11]:
#Name this sweep instance; `name` is also read by the training function below as the W&B run group
name = 'rf_tfidf_coaid' #change here
sweep_config['name'] =  name

#Register the sweep with W&B and keep its id for wandb.agent.
#NOTE(review): the loaded sweep config declares the metric 'loss' (goal: minimize), but the
#training function only logs accuracy, f1 macro, AUC-PC and AUC - confirm which metric is intended.
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_coaid, #Change here
          val=val_coaid): #change here
    '''
    Run one W&B sweep trial: fit a Tf-Idf + random forest pipeline and log validation metrics.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. The hyperparameters actually used (n_estimators,
        max_features, random_state) are read back from wandb.config, which the
        sweep controller fills in when this function is launched by wandb.agent.
    train, val :
        Training / validation data exposing 'text' and 'label' columns.
        The defaults are bound once, when this cell is executed.

    Logs "accuracy" and "f1 macro" for every run. "AUC-PC" and "AUC" are computed
    from the second column of predict_proba only when the training labels take at
    most two distinct values; otherwise the placeholder '-' is logged for both.
    The run is grouped under the global `name` of the current sweep.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        # If called by wandb.agent, this config is set by the Sweep Controller
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer',TfidfVectorizer()),
            ('clf',RandomForestClassifier(n_estimators=config.n_estimators,
                                          max_features=config.max_features,
                                          random_state=config.random_state)),
        ])
        #Fit the pipeline on the raw training text
        pipe.fit(train['text'],train['label'])

        #Hard predictions drive accuracy and macro F1
        pred_val = pipe.predict(val['text'])
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Probabilities are only needed for the binary ranking metrics, so the
            #extra predict_proba pass is skipped for multi-class data
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions of the whole sweep (output_file points at output/emissions_hyperopt.csv)
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent: run OPT_ITER trials of train_tfidf for this sweep
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    #Always stop the tracker, even if the sweep is interrupted or fails
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: n7viahvr
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/n7viahvr


[34m[1mwandb[0m: Agent Starting Run: rwal7yu3 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 197
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.97007
AUC-PC,0.99392
accuracy,0.93407
f1 macro,0.87255


[34m[1mwandb[0m: Agent Starting Run: uau7pxw3 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 186
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.97714
AUC-PC,0.99543
accuracy,0.93895
f1 macro,0.88276


[34m[1mwandb[0m: Agent Starting Run: ai3g4iae with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 191
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.9769
AUC-PC,0.99537
accuracy,0.93651
f1 macro,0.87807


[34m[1mwandb[0m: Agent Starting Run: ggsogi91 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 47
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.96983
AUC-PC,0.9935
accuracy,0.92796
f1 macro,0.86029


[34m[1mwandb[0m: Agent Starting Run: ug4ipg98 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 98
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.9764
AUC-PC,0.99514
accuracy,0.94017
f1 macro,0.88547


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: kw1542dp with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 45
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.96976
AUC-PC,0.9935
accuracy,0.9304
f1 macro,0.86502


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: vamrbf8b with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 133
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.97629
AUC-PC,0.99521
accuracy,0.93529
f1 macro,0.87612


[34m[1mwandb[0m: Agent Starting Run: ubeu36jx with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 167
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.97671
AUC-PC,0.99533
accuracy,0.94017
f1 macro,0.88621


[34m[1mwandb[0m: Agent Starting Run: ly0c34se with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 27
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.96826
AUC-PC,0.99276
accuracy,0.92796
f1 macro,0.8612


[34m[1mwandb[0m: Agent Starting Run: b0h7tqs1 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 161
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.97664
AUC-PC,0.9953
accuracy,0.93895
f1 macro,0.88351


0.0016385528600349343

In [12]:
#Name this sweep instance; `name` is also read by the training function below as the W&B run group
name = 'rf_ft_coaid' #CHANGE HERE
sweep_config['name'] = name
#Register the sweep with W&B and keep its id for wandb.agent.
#NOTE(review): the loaded sweep config declares the metric 'loss' (goal: minimize), but the
#training function only logs accuracy, f1 macro, AUC-PC and AUC - confirm which metric is intended.
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_coaid, #CHANGE HERE
          val=embedded_val_coaid): #CHANGE HERE
    '''
    Run one W&B sweep trial: fit a random forest on embedded data and log validation metrics.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. The hyperparameters actually used (n_estimators,
        max_features, random_state) are read back from wandb.config.
    train, val :
        Tables holding the feature columns plus a 'label' column.
        The defaults are bound once, when this cell is executed.

    Missing values are replaced by 0 and the 'label' column is dropped to build
    the feature matrices. Logs "accuracy" and "f1 macro" for every run. "AUC-PC"
    and "AUC" are computed from the second column of predict_proba only when the
    training labels take at most two distinct values; otherwise the placeholder
    '-' is logged for both. The run is grouped under the global `name`.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = RandomForestClassifier(n_estimators=config.n_estimators,
                                max_features=config.max_features,
                                 random_state=config.random_state)
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once instead of once per use
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Hard predictions drive accuracy and macro F1
        pred_val = pipe.predict(features_val)
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Probabilities are only needed for the binary ranking metrics, so the
            #extra predict_proba pass is skipped for multi-class data
            pred_prob_val = pipe.predict_proba(features_val)[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions of the whole sweep (output_file points at output/emissions_hyperopt.csv)
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent: run OPT_ITER trials of train_fasttext for this sweep
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    #Always stop the tracker, even if the sweep is interrupted or fails
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: c3mog94m
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/c3mog94m


[34m[1mwandb[0m: Agent Starting Run: uaqkj2g5 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 51
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95645
AUC-PC,0.99039
accuracy,0.9243
f1 macro,0.84761


[34m[1mwandb[0m: Agent Starting Run: j7ysrgad with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 134
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95898
AUC-PC,0.99126
accuracy,0.92063
f1 macro,0.84402


[34m[1mwandb[0m: Agent Starting Run: pn1wmxui with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 95
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95792
AUC-PC,0.99091
accuracy,0.92186
f1 macro,0.84485


[34m[1mwandb[0m: Agent Starting Run: 6xrdrtlc with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 99
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.96264
AUC-PC,0.99203
accuracy,0.92918
f1 macro,0.864


[34m[1mwandb[0m: Agent Starting Run: 63zwtb0a with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 144
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.959
AUC-PC,0.99126
accuracy,0.9243
f1 macro,0.85071


[34m[1mwandb[0m: Agent Starting Run: n2tj98yz with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 167
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.96381
AUC-PC,0.99237
accuracy,0.9304
f1 macro,0.86763


[34m[1mwandb[0m: Agent Starting Run: 116j011s with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 71
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.95716
AUC-PC,0.99077
accuracy,0.91819
f1 macro,0.83589


[34m[1mwandb[0m: Agent Starting Run: opa6tp6n with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 107
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.96268
AUC-PC,0.99206
accuracy,0.9304
f1 macro,0.86847


[34m[1mwandb[0m: Agent Starting Run: 5jobll0x with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 27
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.9599
AUC-PC,0.99072
accuracy,0.92796
f1 macro,0.8612


[34m[1mwandb[0m: Agent Starting Run: c3vh3ph0 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 11
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.93668
AUC-PC,0.98237
accuracy,0.90354
f1 macro,0.80913


0.0013534707882262326

#### liar

In [16]:
#Name this sweep instance; `name` is also read by the training function below as the W&B run group
name = 'rf_tfidf_liar' #change here
sweep_config['name'] =  name

#Register the sweep with W&B and keep its id for wandb.agent.
#NOTE(review): the loaded sweep config declares the metric 'loss' (goal: minimize), but the
#training function only logs accuracy, f1 macro, AUC-PC and AUC - confirm which metric is intended.
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_tfidf(config = None,
          train=train_liar, #Change here
          val=val_liar): #change here
    '''
    Run one W&B sweep trial: fit a Tf-Idf + random forest pipeline and log validation metrics.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. The hyperparameters actually used (n_estimators,
        max_features, random_state) are read back from wandb.config.
    train, val :
        Training / validation data exposing 'text' and 'label' columns.
        The defaults are bound once, when this cell is executed.

    Logs "accuracy" and "f1 macro" for every run. "AUC-PC" and "AUC" are computed
    from the second column of predict_proba only when the training labels take at
    most two distinct values; otherwise the placeholder '-' is logged for both.
    The run is grouped under the global `name` of the current sweep.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config,group=name):
        config = wandb.config
        pipe = Pipeline([
            ('vectorizer',TfidfVectorizer()),
            ('clf',RandomForestClassifier(n_estimators=config.n_estimators,
                                          max_features=config.max_features,
                                          random_state=config.random_state)),
        ])
        #Fit the pipeline on the raw training text
        pipe.fit(train['text'],train['label'])

        #Hard predictions drive accuracy and macro F1
        pred_val = pipe.predict(val['text'])
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Probabilities are only needed for the binary ranking metrics, so the
            #extra predict_proba pass is skipped for multi-class data
            pred_prob_val = pipe.predict_proba(val['text'])[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })

#Track emissions of the whole sweep (output_file points at output/emissions_hyperopt.csv)
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent: run OPT_ITER trials of train_tfidf for this sweep
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf,count=OPT_ITER)
finally:
    #Always stop the tracker, even if the sweep is interrupted or fails
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: ld27797m
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/ld27797m


[34m[1mwandb[0m: Agent Starting Run: elq26b8a with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 41
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24143
f1 macro,0.21857


[34m[1mwandb[0m: Agent Starting Run: zlc1cgb1 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 58
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.25701
f1 macro,0.23489


[34m[1mwandb[0m: Agent Starting Run: 07l67gfc with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 88
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.25545
f1 macro,0.23171


[34m[1mwandb[0m: Agent Starting Run: h2fysnp8 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 35
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.25389
f1 macro,0.23598


[34m[1mwandb[0m: Agent Starting Run: 4z0x5a83 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 100
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24455
f1 macro,0.21902


[34m[1mwandb[0m: Agent Starting Run: w8dikn5p with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 121
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.2648
f1 macro,0.24267


[34m[1mwandb[0m: Agent Starting Run: om5xs4l2 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 170
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.25701
f1 macro,0.23299


[34m[1mwandb[0m: Agent Starting Run: 0lm579tr with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 12
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22274
f1 macro,0.20336


[34m[1mwandb[0m: Agent Starting Run: 2a41ike2 with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 41
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.24143
f1 macro,0.21857


[34m[1mwandb[0m: Agent Starting Run: uzpfw0mo with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 154
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.25
f1 macro,0.2229


0.0016207617531553624

In [17]:
#Name this sweep instance; `name` is also read by the training function below as the W&B run group
name = 'rf_ft_liar' #CHANGE HERE
sweep_config['name'] = name
#Register the sweep with W&B and keep its id for wandb.agent.
#NOTE(review): the loaded sweep config declares the metric 'loss' (goal: minimize), but the
#training function only logs accuracy, f1 macro, AUC-PC and AUC - confirm which metric is intended.
sweep_id = wandb.sweep(sweep_config, project="hyperopt")

def train_fasttext(config = None,
          train=embedded_train_liar, #CHANGE HERE
          val=embedded_val_liar): #CHANGE HERE
    '''
    Run one W&B sweep trial: fit a random forest on embedded data and log validation metrics.

    Parameters
    ----------
    config : optional
        Forwarded to wandb.init. The hyperparameters actually used (n_estimators,
        max_features, random_state) are read back from wandb.config.
    train, val :
        Tables holding the feature columns plus a 'label' column.
        The defaults are bound once, when this cell is executed.

    Missing values are replaced by 0 and the 'label' column is dropped to build
    the feature matrices. Logs "accuracy" and "f1 macro" for every run. "AUC-PC"
    and "AUC" are computed from the second column of predict_proba only when the
    training labels take at most two distinct values; otherwise the placeholder
    '-' is logged for both. The run is grouped under the global `name`.
    '''
    # Initialize a new wandb run
    with wandb.init(config=config, group=name):
        config = wandb.config
        clf = RandomForestClassifier(n_estimators=config.n_estimators,
                                max_features=config.max_features,
                                 random_state=config.random_state)
        pipe = Pipeline([('clf',clf)])

        #Build each feature matrix once instead of once per use
        features_train = train.fillna(0).drop(['label'],axis=1)
        features_val = val.fillna(0).drop(['label'],axis=1)
        pipe.fit(features_train,train['label'])

        #Hard predictions drive accuracy and macro F1
        pred_val = pipe.predict(features_val)
        accuracy = accuracy_score(val['label'],pred_val)
        f1_macro = f1_score(val['label'],pred_val,average='macro')
        if train['label'].nunique() <=2:
            #Probabilities are only needed for the binary ranking metrics, so the
            #extra predict_proba pass is skipped for multi-class data
            pred_prob_val = pipe.predict_proba(features_val)[:,1]
            aucpc =  average_precision_score(val['label'],pred_prob_val)
            auc = roc_auc_score(val['label'],pred_prob_val)
        else:
            aucpc = '-'
            auc = '-'
        #Log metrics on WandB
        wandb.log({"accuracy": accuracy, "f1 macro":f1_macro, "AUC-PC":aucpc, 'AUC':auc })


#Track emissions of the whole sweep (output_file points at output/emissions_hyperopt.csv)
tracker = EmissionsTracker(project_name=name,log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent: run OPT_ITER trials of train_fasttext for this sweep
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext,count=OPT_ITER)
finally:
    #Always stop the tracker, even if the sweep is interrupted or fails
    emissions = tracker.stop()
#Show the value returned by tracker.stop() as the cell output
emissions



Create sweep with ID: jb670u0y
Sweep URL: https://wandb.ai/benchmark-nlp/hyperopt/sweeps/jb670u0y


[34m[1mwandb[0m: Agent Starting Run: ijp0rxys with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 39
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22897
f1 macro,0.19973


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: pvumfjem with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 190
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.2352
f1 macro,0.20071


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: b8bb1cdc with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 27
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.21807
f1 macro,0.18393


[34m[1mwandb[0m: Agent Starting Run: w0ph2iy4 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 119
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.23442
f1 macro,0.1957


[34m[1mwandb[0m: Agent Starting Run: 3liu1knf with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 198
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.23832
f1 macro,0.20092


[34m[1mwandb[0m: Agent Starting Run: rtuyyskh with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 175
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22586
f1 macro,0.19459


[34m[1mwandb[0m: Agent Starting Run: nptlt3kl with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 65
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22508
f1 macro,0.19336


[34m[1mwandb[0m: Agent Starting Run: g6kzfzg7 with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 198
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.23832
f1 macro,0.20092


[34m[1mwandb[0m: Agent Starting Run: qm3lsbkr with config:
[34m[1mwandb[0m: 	max_features: sqrt
[34m[1mwandb[0m: 	n_estimators: 81
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.22196
f1 macro,0.18738


[34m[1mwandb[0m: Agent Starting Run: owfhn2wa with config:
[34m[1mwandb[0m: 	max_features: log2
[34m[1mwandb[0m: 	n_estimators: 94
[34m[1mwandb[0m: 	random_state: 42


0,1
accuracy,▁
f1 macro,▁

0,1
AUC,-
AUC-PC,-
accuracy,0.2352
f1 macro,0.2021


0.001799031014590331