In [1]:
# Connect to Weights & Biases: authenticate, then open the notebook-level run
import wandb

wandb.login()

# Settings of the notebook-level run; adapt `name` when reusing the notebook  #CHANGE
run_settings = {
    "project": "xgb",
    "entity": "benchmark-nlp",
    "name": 'sarcasm datasets',
}
wandb.init(**run_settings)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjtonglet[0m ([33mbenchmark-nlp[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [2]:
import os

# Move three directory levels up; the later cells rely on paths relative to that
# directory ('config/...', 'fasttext/...', 'output/...') and on local package imports.
# A bare os.chdir('../../..') climbs three MORE levels every time the cell is re-run,
# so the starting directory is remembered on first execution and the target is always
# derived from it. This keeps the cell idempotent within a kernel session.
if 'NOTEBOOK_DIR' not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, '..', '..', '..'))

In [3]:
#Load packages
import warnings
import io
import numpy as np
import pandas as pd
from codecarbon import EmissionsTracker
import yaml
from util.dataloader import DataLoader
from preprocessing.preprocessor import Preprocessor
from util.datasplitter import data_splitter
from preprocessing.fasttext_embeddings import FastTextEmbeddings
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, average_precision_score
warnings.filterwarnings("ignore")

In [4]:
# Notebook-wide constants
SEED = 42      # random seed handed to data_splitter
OPT_ITER = 20  # number of runs each wandb.agent call launches per sweep

## Load data

In [8]:
# Load the 'sarcasm' datasets
dl = DataLoader(['sarcasm'])
data = dl.load()

# Tweet-specific preprocessing for the two datasets split below;
# the default preprocessor is used for the sarc split in a later cell
tweet_preprocessor = Preprocessor(is_tweet=True)
preprocessor = Preprocessor()

# Options shared by both splits: no validation set is provided, so one is created
split_options = dict(create_val_set=True, seed=SEED)

# The third returned element is discarded:
# we are not interested in the test sets for hyperparameter optimization
train_semeval_A, val_semeval_A, _ = data_splitter(
    data['SemEval_A'], tweet_preprocessor, **split_options
)
train_isarcasm, val_isarcasm, _ = data_splitter(
    data['iSarcasm'], tweet_preprocessor, **split_options
)

3817 rows preprocessed in 2.9911282062530518 seconds
784 rows preprocessed in 0.16309666633605957 seconds
3468 rows preprocessed in 0.7899963855743408 seconds
1400 rows preprocessed in 0.283416748046875 seconds


In [5]:
# Split the sarc dataset with the default (non-tweet) preprocessor.
# No validation set is provided, so one is created; the third element is not used here.
sarc_splits = data_splitter(
    data['sarc'],
    preprocessor,
    create_val_set=True,
    seed=SEED,
)
train_sarc, val_sarc, _ = sarc_splits

KeyboardInterrupt: 

In [9]:
%%time
#fasttext 
fasttext = FastTextEmbeddings()
fasttext.load_model('fasttext/cc.en.300.bin')



CPU times: total: 17.8 s
Wall time: 36.2 s


In [10]:
# Embed the train split first, then the validation split (same order as the progress bars)
embedded_train_semeval_A, embedded_val_semeval_A = (
    fasttext.generate_sentence_embeddings(split['text'])
    for split in (train_semeval_A, val_semeval_A)
)

# Re-attach the labels so features and target travel together in one frame
embedded_train_semeval_A['label'] = train_semeval_A['label'].to_list()
embedded_val_semeval_A['label'] = val_semeval_A['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 3053/3053 [00:04<00:00, 700.10it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 764/764 [00:01<00:00, 697.92it/s]


In [11]:
# Embed the train split first, then the validation split (same order as the progress bars)
embedded_train_isarcasm, embedded_val_isarcasm = (
    fasttext.generate_sentence_embeddings(split['text'])
    for split in (train_isarcasm, val_isarcasm)
)

# Re-attach the labels so features and target travel together in one frame
embedded_train_isarcasm['label'] = train_isarcasm['label'].to_list()
embedded_val_isarcasm['label'] = val_isarcasm['label'].to_list()

Starting to generate sentence embeddings


100%|█████████████████████████████████████████████████████████████████████████████| 2774/2774 [00:04<00:00, 668.45it/s]


Starting to generate sentence embeddings


100%|███████████████████████████████████████████████████████████████████████████████| 694/694 [00:01<00:00, 618.07it/s]


In [None]:
# Embed the train split first, then the validation split
embedded_train_sarc, embedded_val_sarc = (
    fasttext.generate_sentence_embeddings(split['text'])
    for split in (train_sarc, val_sarc)
)

# Re-attach the labels so features and target travel together in one frame
embedded_train_sarc['label'] = train_sarc['label'].to_list()
embedded_val_sarc['label'] = val_sarc['label'].to_list()

## Hyperopt

In [12]:
# Load the template sweep configuration for the XGBoost classifier.
# To change the value range of a hyperparameter, edit the .yaml file rather than this notebook.
with open("config/xgb_sweep.yaml", 'r') as config_file:
    sweep_config = yaml.safe_load(config_file)

In [13]:
#The config is displayed as a nested dictionary
#NOTE(review): the displayed 'metric' is named 'loss', but the training functions in this
#notebook only log "accuracy", "f1 macro", "AUC-PC" and "AUC" - confirm which metric the
#sweep is meant to track.
#NOTE(review): 'max_depth' is sampled from 0 upwards; the logged runs with max_depth 0
#report an AUC of 0.5 - consider raising the lower bound in the .yaml file.
sweep_config

{'method': 'random',
 'entity': 'benchmark-nlp',
 'project': 'hyperopt',
 'metric': {'name': 'loss', 'goal': 'minimize'},
 'parameters': {'gamma': {'min': 0, 'max': 1, 'distribution': 'uniform'},
  'n_estimators': {'min': 10, 'max': 150, 'distribution': 'int_uniform'},
  'learning_rate': {'min': 0.001, 'max': 0.1, 'distribution': 'uniform'},
  'max_depth': {'min': 0, 'max': 10, 'distribution': 'int_uniform'},
  'random_state': {'value': 42}}}

#### SemEval A: TF-IDF

In [None]:
#Don't forget to name the sweep instance
name = 'xgb_tfidf_sem_eval_A' #change here
sweep_config['name'] = name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_tfidf(config = None,
          train=train_semeval_A, #Change here
          val=val_semeval_A): #change here
    '''
    Run one sweep trial: fit a TF-IDF + XGBoost pipeline and log validation metrics to W&B.

    config : optional hyperparameters forwarded to wandb.init. When the function is
             called by wandb.agent, as below, the Sweep Controller sets the config.
    train  : frame with a 'text' and a 'label' column, used for fitting.
    val    : frame with a 'text' and a 'label' column, used for evaluation.

    The default values of train/val are bound when this cell is executed, so the cell
    has to be re-run after the underlying data changes.
    '''
    # Initialize a new wandb run, grouped under the sweep name
    with wandb.init(config=config, group=name):
        # Read the hyperparameters back from the run (set by the Sweep Controller)
        config = wandb.config
        vec = TfidfVectorizer()
        clf = XGBClassifier(gamma=config.gamma,
                            n_estimators=config.n_estimators,
                            learning_rate=config.learning_rate,
                            max_depth=config.max_depth,
                            random_state=config.random_state) #set the hyperparams here

        #Create and fit the pipeline
        pipe = Pipeline([('vectorizer', vec), ('clf', clf)])
        pipe.fit(train['text'], train['label'])

        #Make predictions
        pred_val = pipe.predict(val['text'])
        metrics = {"accuracy": accuracy_score(val['label'], pred_val),
                   "f1 macro": f1_score(val['label'], pred_val, average='macro')}
        if train['label'].nunique() <= 2:
            # Probability of the second class (column 1) feeds the ranking metrics
            pred_prob_val = pipe.predict_proba(val['text'])[:, 1]
            metrics["AUC-PC"] = average_precision_score(val['label'], pred_prob_val)
            metrics['AUC'] = roc_auc_score(val['label'], pred_prob_val)
        # With more than two classes the AUC metrics are not computed. They are left out
        # instead of being logged as the placeholder string '-', so that these metric
        # keys only ever hold numbers on WandB.

        #Log metrics on WandB
        wandb.log(metrics)

#Track emissions
tracker = EmissionsTracker(project_name=name, log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf, count=OPT_ITER) #Count : number of iterations
finally:
    # Always stop the tracker, even when the sweep is interrupted or fails
    emissions = tracker.stop()
# Display the value returned by tracker.stop()
emissions

#### SemEval A: fastText

In [None]:
#Don't forget to name the sweep instance
name = 'xgb_fasttext_sem_eval_A' #change here
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_fasttext(config = None,
          train=embedded_train_semeval_A, #Change here
          val=embedded_val_semeval_A): #change here
    '''
    Run one sweep trial: fit XGBoost on fastText sentence embeddings and log validation
    metrics to W&B.

    config : optional hyperparameters forwarded to wandb.init. When the function is
             called by wandb.agent, as below, the Sweep Controller sets the config.
    train  : embedding frame with an added 'label' column, used for fitting.
    val    : embedding frame with an added 'label' column, used for evaluation.

    The default values of train/val are bound when this cell is executed, so the cell
    has to be re-run after the underlying data changes.
    '''
    # Initialize a new wandb run, grouped under the sweep name
    with wandb.init(config=config, group=name):
        # Read the hyperparameters back from the run (set by the Sweep Controller)
        config = wandb.config
        clf = XGBClassifier(gamma=config.gamma,
                            n_estimators=config.n_estimators,
                            learning_rate=config.learning_rate,
                            max_depth=config.max_depth,
                            random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf', clf)])

        # Feature matrices: missing values replaced by 0, label column removed.
        # Built once per split and reused for fit / predict / predict_proba.
        X_train = train.fillna(0).drop(['label'], axis=1)
        X_val = val.fillna(0).drop(['label'], axis=1)
        pipe.fit(X_train, train['label'])

        #Make predictions
        pred_val = pipe.predict(X_val)
        metrics = {"accuracy": accuracy_score(val['label'], pred_val),
                   "f1 macro": f1_score(val['label'], pred_val, average='macro')}
        if train['label'].nunique() <= 2:
            # Probability of the second class (column 1) feeds the ranking metrics
            pred_prob_val = pipe.predict_proba(X_val)[:, 1]
            metrics["AUC-PC"] = average_precision_score(val['label'], pred_prob_val)
            metrics['AUC'] = roc_auc_score(val['label'], pred_prob_val)
        # With more than two classes the AUC metrics are not computed. They are left out
        # instead of being logged as the placeholder string '-', so that these metric
        # keys only ever hold numbers on WandB.

        #Log metrics on WandB
        wandb.log(metrics)


#Track emissions
tracker = EmissionsTracker(project_name=name, log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    # Always stop the tracker, even when the sweep is interrupted or fails
    emissions = tracker.stop()
# Display the value returned by tracker.stop()
emissions

#### iSarcasm


In [16]:
#Don't forget to name the sweep instance
name = 'xgb_tfidf_isarcasm' #change here
sweep_config['name'] = name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_tfidf(config = None,
          train=train_isarcasm, #Change here
          val=val_isarcasm): #change here
    '''
    Run one sweep trial: fit a TF-IDF + XGBoost pipeline and log validation metrics to W&B.

    config : optional hyperparameters forwarded to wandb.init. When the function is
             called by wandb.agent, as below, the Sweep Controller sets the config.
    train  : frame with a 'text' and a 'label' column, used for fitting.
    val    : frame with a 'text' and a 'label' column, used for evaluation.

    The default values of train/val are bound when this cell is executed, so the cell
    has to be re-run after the underlying data changes.
    '''
    # Initialize a new wandb run, grouped under the sweep name
    with wandb.init(config=config, group=name):
        # Read the hyperparameters back from the run (set by the Sweep Controller)
        config = wandb.config
        vec = TfidfVectorizer()
        clf = XGBClassifier(gamma=config.gamma,
                            n_estimators=config.n_estimators,
                            learning_rate=config.learning_rate,
                            max_depth=config.max_depth,
                            random_state=config.random_state) #set the hyperparams here

        #Create and fit the pipeline
        pipe = Pipeline([('vectorizer', vec), ('clf', clf)])
        pipe.fit(train['text'], train['label'])

        #Make predictions
        pred_val = pipe.predict(val['text'])
        metrics = {"accuracy": accuracy_score(val['label'], pred_val),
                   "f1 macro": f1_score(val['label'], pred_val, average='macro')}
        if train['label'].nunique() <= 2:
            # Probability of the second class (column 1) feeds the ranking metrics
            pred_prob_val = pipe.predict_proba(val['text'])[:, 1]
            metrics["AUC-PC"] = average_precision_score(val['label'], pred_prob_val)
            metrics['AUC'] = roc_auc_score(val['label'], pred_prob_val)
        # With more than two classes the AUC metrics are not computed. They are left out
        # instead of being logged as the placeholder string '-', so that these metric
        # keys only ever hold numbers on WandB.

        #Log metrics on WandB
        wandb.log(metrics)

#Track emissions
tracker = EmissionsTracker(project_name=name, log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
finally:
    # Always stop the tracker, even when the sweep is interrupted or fails
    emissions = tracker.stop()
# Display the value returned by tracker.stop()
emissions

Create sweep with ID: yjjhh0kd
Sweep URL: https://wandb.ai/benchmark-nlp/xgb/sweeps/yjjhh0kd


[34m[1mwandb[0m: Agent Starting Run: 8mptc1md with config:
[34m[1mwandb[0m: 	gamma: 0.4808179108082832
[34m[1mwandb[0m: 	learning_rate: 0.09486955485991944
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 96
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.56627
AUC-PC,0.34224
accuracy,0.71326
f1 macro,0.43053


[34m[1mwandb[0m: Agent Starting Run: xxcvg5t5 with config:
[34m[1mwandb[0m: 	gamma: 0.9495070256331224
[34m[1mwandb[0m: 	learning_rate: 0.03283234873044642
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 59
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5
AUC-PC,0.28674
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: usp0xwz0 with config:
[34m[1mwandb[0m: 	gamma: 0.10106824332665998
[34m[1mwandb[0m: 	learning_rate: 0.05237409379224264
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 79
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.54666
AUC-PC,0.31425
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: jjpt1msu with config:
[34m[1mwandb[0m: 	gamma: 0.31687354130783274
[34m[1mwandb[0m: 	learning_rate: 0.033085009571974695
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 63
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.54823
AUC-PC,0.31623
accuracy,0.71182
f1 macro,0.41582


[34m[1mwandb[0m: Agent Starting Run: whcop6a3 with config:
[34m[1mwandb[0m: 	gamma: 0.6021633424130931
[34m[1mwandb[0m: 	learning_rate: 0.021380034532905528
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 128
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58138
AUC-PC,0.34607
accuracy,0.70893
f1 macro,0.44623


[34m[1mwandb[0m: Agent Starting Run: p43blsiz with config:
[34m[1mwandb[0m: 	gamma: 0.3018490478054957
[34m[1mwandb[0m: 	learning_rate: 0.03502830178322199
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 69
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5846
AUC-PC,0.342
accuracy,0.70893
f1 macro,0.44623


[34m[1mwandb[0m: Agent Starting Run: vnvjdf3t with config:
[34m[1mwandb[0m: 	gamma: 0.5279677416916099
[34m[1mwandb[0m: 	learning_rate: 0.07272353908369285
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	n_estimators: 148
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58224
AUC-PC,0.35311
accuracy,0.70461
f1 macro,0.46769


[34m[1mwandb[0m: Agent Starting Run: p42yae07 with config:
[34m[1mwandb[0m: 	gamma: 0.279048017095323
[34m[1mwandb[0m: 	learning_rate: 0.07191790091176915
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 93
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58468
AUC-PC,0.35532
accuracy,0.71182
f1 macro,0.45581


[34m[1mwandb[0m: Agent Starting Run: t0ol49fp with config:
[34m[1mwandb[0m: 	gamma: 0.17224418599516078
[34m[1mwandb[0m: 	learning_rate: 0.0472064088091314
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 133
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57342
AUC-PC,0.34211
accuracy,0.71902
f1 macro,0.44643


[34m[1mwandb[0m: Agent Starting Run: rgx9cykz with config:
[34m[1mwandb[0m: 	gamma: 0.6788796783887925
[34m[1mwandb[0m: 	learning_rate: 0.06510568289163796
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 116
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58107
AUC-PC,0.34966
accuracy,0.71182
f1 macro,0.45171


[34m[1mwandb[0m: Agent Starting Run: aea4b8v7 with config:
[34m[1mwandb[0m: 	gamma: 0.0654480310216079
[34m[1mwandb[0m: 	learning_rate: 0.044639145399196695
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 56
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57531
AUC-PC,0.33931
accuracy,0.70893
f1 macro,0.44201


[34m[1mwandb[0m: Agent Starting Run: mtc96n3e with config:
[34m[1mwandb[0m: 	gamma: 0.30658526286084753
[34m[1mwandb[0m: 	learning_rate: 0.06097059784629158
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 57
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.56004
AUC-PC,0.33349
accuracy,0.71326
f1 macro,0.42589


[34m[1mwandb[0m: Agent Starting Run: 0jsv0axp with config:
[34m[1mwandb[0m: 	gamma: 0.9210616066891384
[34m[1mwandb[0m: 	learning_rate: 0.042311503433603966
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 62
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57664
AUC-PC,0.34469
accuracy,0.71037
f1 macro,0.44264


[34m[1mwandb[0m: Agent Starting Run: irwqzv26 with config:
[34m[1mwandb[0m: 	gamma: 0.7436404181360473
[34m[1mwandb[0m: 	learning_rate: 0.06424614757776372
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 50
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57331
AUC-PC,0.3444
accuracy,0.71182
f1 macro,0.44753


[34m[1mwandb[0m: Agent Starting Run: llzdjgas with config:
[34m[1mwandb[0m: 	gamma: 0.04073700317171747
[34m[1mwandb[0m: 	learning_rate: 0.03626184406607943
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 113
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58496
AUC-PC,0.34679
accuracy,0.71182
f1 macro,0.43892


[34m[1mwandb[0m: Agent Starting Run: 7hfmsf1b with config:
[34m[1mwandb[0m: 	gamma: 0.7436512978210327
[34m[1mwandb[0m: 	learning_rate: 0.07550625010582028
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 108
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59321
AUC-PC,0.35404
accuracy,0.70605
f1 macro,0.4647


[34m[1mwandb[0m: Agent Starting Run: fegm5tz2 with config:
[34m[1mwandb[0m: 	gamma: 0.08154056326461845
[34m[1mwandb[0m: 	learning_rate: 0.042329070277996955
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 34
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.50087
AUC-PC,0.29231
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: 1czvf4ee with config:
[34m[1mwandb[0m: 	gamma: 0.0941212955066182
[34m[1mwandb[0m: 	learning_rate: 0.006738929046722315
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 42
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.50546
AUC-PC,0.29097
accuracy,0.70317
f1 macro,0.44363


[34m[1mwandb[0m: Agent Starting Run: ibhiquyb with config:
[34m[1mwandb[0m: 	gamma: 0.9334088158667438
[34m[1mwandb[0m: 	learning_rate: 0.06611383767949078
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 43
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57186
AUC-PC,0.3335
accuracy,0.71037
f1 macro,0.44688


[34m[1mwandb[0m: Agent Starting Run: 1631y1vl with config:
[34m[1mwandb[0m: 	gamma: 0.2102759142809254
[34m[1mwandb[0m: 	learning_rate: 0.049800778527698514
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	n_estimators: 35
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.55937
AUC-PC,0.32872
accuracy,0.70749
f1 macro,0.43709


0.0020891967381379477

In [17]:
#Don't forget to name the sweep instance
name = 'xgb_ft_isarcasm' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_fasttext(config = None,
          train=embedded_train_isarcasm, #CHANGE HERE
          val=embedded_val_isarcasm): #CHANGE HERE
    '''
    Run one sweep trial: fit XGBoost on fastText sentence embeddings and log validation
    metrics to W&B.

    config : optional hyperparameters forwarded to wandb.init. When the function is
             called by wandb.agent, as below, the Sweep Controller sets the config.
    train  : embedding frame with an added 'label' column, used for fitting.
    val    : embedding frame with an added 'label' column, used for evaluation.

    The default values of train/val are bound when this cell is executed, so the cell
    has to be re-run after the underlying data changes.
    '''
    # Initialize a new wandb run, grouped under the sweep name
    with wandb.init(config=config, group=name):
        # Read the hyperparameters back from the run (set by the Sweep Controller)
        config = wandb.config
        clf = XGBClassifier(gamma=config.gamma,
                            n_estimators=config.n_estimators,
                            learning_rate=config.learning_rate,
                            max_depth=config.max_depth,
                            random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf', clf)])

        # Feature matrices: missing values replaced by 0, label column removed.
        # Built once per split and reused for fit / predict / predict_proba.
        X_train = train.fillna(0).drop(['label'], axis=1)
        X_val = val.fillna(0).drop(['label'], axis=1)
        pipe.fit(X_train, train['label'])

        #Make predictions
        pred_val = pipe.predict(X_val)
        metrics = {"accuracy": accuracy_score(val['label'], pred_val),
                   "f1 macro": f1_score(val['label'], pred_val, average='macro')}
        if train['label'].nunique() <= 2:
            # Probability of the second class (column 1) feeds the ranking metrics
            pred_prob_val = pipe.predict_proba(X_val)[:, 1]
            metrics["AUC-PC"] = average_precision_score(val['label'], pred_prob_val)
            metrics['AUC'] = roc_auc_score(val['label'], pred_prob_val)
        # With more than two classes the AUC metrics are not computed. They are left out
        # instead of being logged as the placeholder string '-', so that these metric
        # keys only ever hold numbers on WandB.

        #Log metrics on WandB
        wandb.log(metrics)


#Track emissions
tracker = EmissionsTracker(project_name=name, log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    # Always stop the tracker, even when the sweep is interrupted or fails
    emissions = tracker.stop()
# Display the value returned by tracker.stop()
emissions



Create sweep with ID: pznefzco
Sweep URL: https://wandb.ai/benchmark-nlp/xgb/sweeps/pznefzco


[34m[1mwandb[0m: Agent Starting Run: n6jl8eg1 with config:
[34m[1mwandb[0m: 	gamma: 0.3573933324788726
[34m[1mwandb[0m: 	learning_rate: 0.07136936766899564
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 92
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57797
AUC-PC,0.37145
accuracy,0.71758
f1 macro,0.45014


[34m[1mwandb[0m: Agent Starting Run: zzp7tw68 with config:
[34m[1mwandb[0m: 	gamma: 0.6329361642699473
[34m[1mwandb[0m: 	learning_rate: 0.030453974056918538
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 43
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58108
AUC-PC,0.35875
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: 3ofkkzvj with config:
[34m[1mwandb[0m: 	gamma: 0.30883156252289345
[34m[1mwandb[0m: 	learning_rate: 0.02277246871499335
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 128
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58982
AUC-PC,0.36899
accuracy,0.71182
f1 macro,0.43449


[34m[1mwandb[0m: Agent Starting Run: qq0v7hhb with config:
[34m[1mwandb[0m: 	gamma: 0.6396979446072553
[34m[1mwandb[0m: 	learning_rate: 0.05439692831040006
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 129
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.61355
AUC-PC,0.42676
accuracy,0.7147
f1 macro,0.43109


[34m[1mwandb[0m: Agent Starting Run: 6zh97ov0 with config:
[34m[1mwandb[0m: 	gamma: 0.354694830281701
[34m[1mwandb[0m: 	learning_rate: 0.003808842238119524
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 44
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5
AUC-PC,0.28674
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: w3ynagx6 with config:
[34m[1mwandb[0m: 	gamma: 0.405930266670432
[34m[1mwandb[0m: 	learning_rate: 0.08380986234166986
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 132
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59357
AUC-PC,0.37497
accuracy,0.71902
f1 macro,0.45508


[34m[1mwandb[0m: Agent Starting Run: l288ukie with config:
[34m[1mwandb[0m: 	gamma: 0.5153422906858495
[34m[1mwandb[0m: 	learning_rate: 0.05519430720988509
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 78
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5
AUC-PC,0.28674
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: 92b2bhoz with config:
[34m[1mwandb[0m: 	gamma: 0.9117619839726
[34m[1mwandb[0m: 	learning_rate: 0.0258534789033194
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 111
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58011
AUC-PC,0.36887
accuracy,0.70893
f1 macro,0.43331


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: r25t3j0l with config:
[34m[1mwandb[0m: 	gamma: 0.2741115705770528
[34m[1mwandb[0m: 	learning_rate: 0.08340111064957444
[34m[1mwandb[0m: 	max_depth: 8
[34m[1mwandb[0m: 	n_estimators: 83
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5819
AUC-PC,0.36425
accuracy,0.71182
f1 macro,0.44753


[34m[1mwandb[0m: Agent Starting Run: yvhoynzi with config:
[34m[1mwandb[0m: 	gamma: 0.571949752924806
[34m[1mwandb[0m: 	learning_rate: 0.07986688827993672
[34m[1mwandb[0m: 	max_depth: 4
[34m[1mwandb[0m: 	n_estimators: 96
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59342
AUC-PC,0.38464
accuracy,0.71902
f1 macro,0.45508


[34m[1mwandb[0m: Agent Starting Run: dj88rw1w with config:
[34m[1mwandb[0m: 	gamma: 0.13194132318624108
[34m[1mwandb[0m: 	learning_rate: 0.04692012109480288
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	n_estimators: 144
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59042
AUC-PC,0.38067
accuracy,0.7219
f1 macro,0.4477


[34m[1mwandb[0m: Agent Starting Run: 7wmvzzuc with config:
[34m[1mwandb[0m: 	gamma: 0.5154825619943819
[34m[1mwandb[0m: 	learning_rate: 0.015597327418232065
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 109
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59111
AUC-PC,0.36107
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: 71ovudpp with config:
[34m[1mwandb[0m: 	gamma: 0.2562280365184074
[34m[1mwandb[0m: 	learning_rate: 0.049323569865115095
[34m[1mwandb[0m: 	max_depth: 7
[34m[1mwandb[0m: 	n_estimators: 54
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59673
AUC-PC,0.38861
accuracy,0.71182
f1 macro,0.43449


[34m[1mwandb[0m: Agent Starting Run: pgslb8hu with config:
[34m[1mwandb[0m: 	gamma: 0.7014667765311771
[34m[1mwandb[0m: 	learning_rate: 0.0672343671966863
[34m[1mwandb[0m: 	max_depth: 0
[34m[1mwandb[0m: 	n_estimators: 37
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.5
AUC-PC,0.28674
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: g5r2uuvu with config:
[34m[1mwandb[0m: 	gamma: 0.35048322127231757
[34m[1mwandb[0m: 	learning_rate: 0.02546421634391623
[34m[1mwandb[0m: 	max_depth: 2
[34m[1mwandb[0m: 	n_estimators: 127
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59446
AUC-PC,0.36147
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: xjsf2cqc with config:
[34m[1mwandb[0m: 	gamma: 0.47342213470786376
[34m[1mwandb[0m: 	learning_rate: 0.0035454896694220583
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 93
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.56214
AUC-PC,0.33019
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: v0eix8bn with config:
[34m[1mwandb[0m: 	gamma: 0.814265485295665
[34m[1mwandb[0m: 	learning_rate: 0.008586616539483075
[34m[1mwandb[0m: 	max_depth: 1
[34m[1mwandb[0m: 	n_estimators: 92
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.57292
AUC-PC,0.34145
accuracy,0.71326
f1 macro,0.41632


[34m[1mwandb[0m: Agent Starting Run: r4cgzatw with config:
[34m[1mwandb[0m: 	gamma: 0.6163887917313163
[34m[1mwandb[0m: 	learning_rate: 0.09706535199721636
[34m[1mwandb[0m: 	max_depth: 10
[34m[1mwandb[0m: 	n_estimators: 101
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59556
AUC-PC,0.39244
accuracy,0.71902
f1 macro,0.45508


[34m[1mwandb[0m: Agent Starting Run: dbpjr2cs with config:
[34m[1mwandb[0m: 	gamma: 0.8764105497183883
[34m[1mwandb[0m: 	learning_rate: 0.0810929316558163
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 49
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.58933
AUC-PC,0.37732
accuracy,0.71182
f1 macro,0.43449


[34m[1mwandb[0m: Agent Starting Run: l5ei73re with config:
[34m[1mwandb[0m: 	gamma: 0.5615850141182016
[34m[1mwandb[0m: 	learning_rate: 0.09383609798968869
[34m[1mwandb[0m: 	max_depth: 5
[34m[1mwandb[0m: 	n_estimators: 117
[34m[1mwandb[0m: 	random_state: 42


0,1
AUC,▁
AUC-PC,▁
accuracy,▁
f1 macro,▁

0,1
AUC,0.59088
AUC-PC,0.40401
accuracy,0.7219
f1 macro,0.47289


0.002386086947879153

#### sarc

In [None]:
#Don't forget to name the sweep instance
name = 'xgb_tfidf_sarc' #change here
sweep_config['name'] = name

#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_tfidf(config = None,
          train=train_sarc, #Change here
          val=val_sarc): #change here
    '''
    Run one sweep trial: fit a TF-IDF + XGBoost pipeline and log validation metrics to W&B.

    config : optional hyperparameters forwarded to wandb.init. When the function is
             called by wandb.agent, as below, the Sweep Controller sets the config.
    train  : frame with a 'text' and a 'label' column, used for fitting.
    val    : frame with a 'text' and a 'label' column, used for evaluation.

    The default values of train/val are bound when this cell is executed, so the cell
    has to be re-run after the underlying data changes.
    '''
    # Initialize a new wandb run, grouped under the sweep name
    with wandb.init(config=config, group=name):
        # Read the hyperparameters back from the run (set by the Sweep Controller)
        config = wandb.config
        vec = TfidfVectorizer()
        clf = XGBClassifier(gamma=config.gamma,
                            n_estimators=config.n_estimators,
                            learning_rate=config.learning_rate,
                            max_depth=config.max_depth,
                            random_state=config.random_state) #set the hyperparams here

        #Create and fit the pipeline
        pipe = Pipeline([('vectorizer', vec), ('clf', clf)])
        pipe.fit(train['text'], train['label'])

        #Make predictions
        pred_val = pipe.predict(val['text'])
        metrics = {"accuracy": accuracy_score(val['label'], pred_val),
                   "f1 macro": f1_score(val['label'], pred_val, average='macro')}
        if train['label'].nunique() <= 2:
            # Probability of the second class (column 1) feeds the ranking metrics
            pred_prob_val = pipe.predict_proba(val['text'])[:, 1]
            metrics["AUC-PC"] = average_precision_score(val['label'], pred_prob_val)
            metrics['AUC'] = roc_auc_score(val['label'], pred_prob_val)
        # With more than two classes the AUC metrics are not computed. They are left out
        # instead of being logged as the placeholder string '-', so that these metric
        # keys only ever hold numbers on WandB.

        #Log metrics on WandB
        wandb.log(metrics)

#Track emissions
tracker = EmissionsTracker(project_name=name, log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
try:
    wandb.agent(sweep_id, train_tfidf, count=OPT_ITER)
finally:
    # Always stop the tracker, even when the sweep is interrupted or fails
    emissions = tracker.stop()
# Display the value returned by tracker.stop()
emissions

In [None]:
#Don't forget to name the sweep instance
name = 'xgb_ft_sarc' #CHANGE HERE
sweep_config['name'] = name
#Generate a sweep_id
sweep_id = wandb.sweep(sweep_config, project="xgb")

def train_fasttext(config = None,
          train=embedded_train_sarc, #CHANGE HERE
          val=embedded_val_sarc): #CHANGE HERE
    '''
    Run one sweep trial: fit XGBoost on fastText sentence embeddings and log validation
    metrics to W&B.

    config : optional hyperparameters forwarded to wandb.init. When the function is
             called by wandb.agent, as below, the Sweep Controller sets the config.
    train  : embedding frame with an added 'label' column, used for fitting.
    val    : embedding frame with an added 'label' column, used for evaluation.

    The default values of train/val are bound when this cell is executed, so the cell
    has to be re-run after the underlying data changes.
    '''
    # Initialize a new wandb run, grouped under the sweep name
    with wandb.init(config=config, group=name):
        # Read the hyperparameters back from the run (set by the Sweep Controller)
        config = wandb.config
        clf = XGBClassifier(gamma=config.gamma,
                            n_estimators=config.n_estimators,
                            learning_rate=config.learning_rate,
                            max_depth=config.max_depth,
                            random_state=config.random_state) #set the hyperparams here
        pipe = Pipeline([('clf', clf)])

        # Feature matrices: missing values replaced by 0, label column removed.
        # Built once per split and reused for fit / predict / predict_proba.
        X_train = train.fillna(0).drop(['label'], axis=1)
        X_val = val.fillna(0).drop(['label'], axis=1)
        pipe.fit(X_train, train['label'])

        #Make predictions
        pred_val = pipe.predict(X_val)
        metrics = {"accuracy": accuracy_score(val['label'], pred_val),
                   "f1 macro": f1_score(val['label'], pred_val, average='macro')}
        if train['label'].nunique() <= 2:
            # Probability of the second class (column 1) feeds the ranking metrics
            pred_prob_val = pipe.predict_proba(X_val)[:, 1]
            metrics["AUC-PC"] = average_precision_score(val['label'], pred_prob_val)
            metrics['AUC'] = roc_auc_score(val['label'], pred_prob_val)
        # With more than two classes the AUC metrics are not computed. They are left out
        # instead of being logged as the placeholder string '-', so that these metric
        # keys only ever hold numbers on WandB.

        #Log metrics on WandB
        wandb.log(metrics)


#Track emissions
tracker = EmissionsTracker(project_name=name, log_level='warning', measure_power_secs=300,
                           output_file='output/emissions_hyperopt.csv')
#Launch the agent
tracker.start()
try:
    wandb.agent(sweep_id, train_fasttext, count=OPT_ITER)
finally:
    # Always stop the tracker, even when the sweep is interrupted or fails
    emissions = tracker.stop()
# Display the value returned by tracker.stop()
emissions