In [None]:
import pickle 
import numpy as np
import pandas as pd 
import xgboost as xgb
from sklearn.metrics import classification_report, f1_score

In [None]:
# Load the pre-computed training features; the cells below index `train` by feature-group name.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('../data/features_training_irony_twitter_semeval.p', 'rb') as handle:
    train = pickle.load(handle)

In [None]:
# Building blocks shared by several feature matrices.
_train_bert = train['bert_embed']
_train_emoji = train['emoji']
# The two emoji polarity vectors are expanded to column vectors so they can be stacked on axis 1.
_train_emoji_cols = [_train_emoji['emoji'],
                     np.expand_dims(_train_emoji['emoji_positive'], axis = 1),
                     np.expand_dims(_train_emoji['emoji_negative'], axis = 1)]
_train_surface_cols = [train['punc'], train['onom'], train['init']]

# BERT embeddings only.
X_train_embed = _train_bert

# BERT + emoji + punctuation / onomatopoeia / initial features ("PP").
X_train_pp = np.concatenate([_train_bert, *_train_emoji_cols, *_train_surface_cols], axis = 1)

# POS features + BERT.
X_train_pos = np.concatenate([train['pos'], _train_bert], axis = 1)

# PP + POS + BERT.
X_train_pp_pos = np.concatenate([*_train_emoji_cols, train['pos'], *_train_surface_cols, _train_bert],
                                axis = 1)

# PP + POS + BERT + polarity.
X_train_pp_pos_pol = np.concatenate([*_train_emoji_cols, train['pos'], *_train_surface_cols, _train_bert,
                                     train['polarity']], axis = 1)

y_train = train['label']

In [None]:
# BERT + polarity.
X_train_pol = np.concatenate([train['bert_embed'], train['polarity']], axis =1)
# BERT + polarity + POS.
X_train_pos_pol = np.concatenate([train['bert_embed'], train['polarity'], train['pos']], axis = 1)
# BERT + PP (emoji, punctuation, onomatopoeia, initial) + polarity.
# The polarity block was previously missing, which made this matrix identical to X_train_pp.
X_train_pp_pol = np.concatenate([train['bert_embed'], train['emoji']['emoji'],
                             np.expand_dims(train['emoji']['emoji_positive'], axis = 1), np.expand_dims(train['emoji']['emoji_negative'], axis = 1),
                         train['punc'], train['onom'], train['init'], train['polarity']], axis = 1)

In [None]:
from warnings import filterwarnings
filterwarnings('ignore')
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import uniform
from imblearn.pipeline import Pipeline

def get_val_metrics(clf):
    """Return the per-fold test scores of the best configuration of a fitted search.

    Parameters
    ----------
    clf : fitted search object
        Must expose ``cv_results_`` (mapping with ``split<k>_test_score`` entries)
        and ``best_index_``.

    Returns
    -------
    list
        ``[split0, split1, ...]`` test scores at ``best_index_``, one entry per
        cross-validation fold, however many folds were used.

    Raises
    ------
    KeyError
        If ``cv_results_`` has no ``split0_test_score`` entry.
    """
    results = clf.cv_results_
    best = clf.best_index_

    fold_results = []
    fold = 0
    # Walk the consecutive split<k>_test_score keys instead of assuming exactly six folds.
    while 'split{}_test_score'.format(fold) in results:
        fold_results.append(results['split{}_test_score'.format(fold)][best])
        fold += 1

    if not fold_results:
        raise KeyError('split0_test_score')

    return fold_results

# Random Search

In [None]:
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_validate
from scipy.stats import randint as sp_randint
from sklearn.model_selection import StratifiedKFold 
# Define dictionary with performance metrics
# One scorer per metric name, each wrapping the matching sklearn metric function.
scoring = {metric_name: make_scorer(metric_fn)
           for metric_name, metric_fn in (('accuracy', accuracy_score),
                                          ('precision', precision_score),
                                          ('recall', recall_score),
                                          ('f1_score', f1_score))}
from imblearn.pipeline import Pipeline
# Import required libraries for machine learning classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.experimental import enable_hist_gradient_boosting 
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from scipy import stats
# Instantiate the machine learning classifiers
# Module-level estimator instances; models_evaluation reads them by these exact names.
log_model = LogisticRegression(max_iter=500)
dtr_model = HistGradientBoostingClassifier()  # NOTE: despite the name, a histogram gradient-boosting model
rfc_model = RandomForestClassifier()
gnb_model = AdaBoostClassifier()  # NOTE: despite the name, this is AdaBoost, not Gaussian naive Bayes
xgb_model = xgb.XGBClassifier(objective = 'binary:logistic') #tree_method = 'gpu_hist'
nb_model = GaussianNB()  # NOTE(review): not referenced elsewhere in the visible notebook
from imblearn.under_sampling import RandomUnderSampler

# Define the models evaluation function
def models_evaluation(X, y, folds, epoch, metric = 'accuracy', n_jobs = 6, random_state = None):
    '''
    Run a randomized hyper-parameter search for every candidate classifier.

    Each classifier is placed after a RandomUnderSampler step in an imblearn
    Pipeline and tuned with RandomizedSearchCV over a stratified K-fold split.
    The best configuration of each search is printed together with its mean
    and standard deviation of the test score.

    X : data set features
    y : data set target
    folds : number of cross-validation folds
    epoch : number of parameter settings sampled per classifier (n_iter)
    metric : scoring passed to RandomizedSearchCV (default 'accuracy')
    n_jobs : parallel jobs used by each search (default 6, the previous fixed value)
    random_state : optional seed for the sampler and the parameter sampling
                   (default None keeps the previous unseeded behaviour)

    Returns a dict mapping 'Hgboost', 'RandomForest', 'XgBoost', 'AdaBoost'
    and 'Logistic' to the per-fold test scores of the best configuration,
    as returned by get_val_metrics.
    '''
    # (result key, label used in the printed report, estimator, search space)
    candidates = [
        ('Hgboost', 'Hist G.Boost', dtr_model,
         {'max_depth': stats.randint(3, 30),
          'min_samples_leaf': stats.randint(1, 20),
          'learning_rate': stats.uniform(0.001, 0.1)}),
        ('RandomForest', 'Random Forest', rfc_model,
         {'max_depth': stats.randint(3, 30),
          'min_samples_leaf': stats.randint(1, 20),
          # An integer min_samples_split must be at least 2; a lower bound of 1
          # produced candidates whose fit failed and scored NaN.
          'min_samples_split': stats.randint(2, 20),
          'n_estimators': stats.randint(50, 500)}),
        ('XgBoost', 'XG.Boost', xgb_model,
         {'n_estimators': stats.randint(200, 500),
          'learning_rate': stats.uniform(0.01, 0.6),
          # scipy's uniform(loc, scale) spans [loc, loc + scale]: (0.3, 0.6)
          # gives [0.3, 0.9]; the former (0.3, 0.9) reached 1.2, outside the
          # valid (0, 1] range of subsample.
          'subsample': stats.uniform(0.3, 0.6),
          'max_depth': stats.randint(3, 30),
          'min_child_weight': stats.randint(1, 20)}),
        ('AdaBoost', 'Ada Boost', gnb_model,
         {'n_estimators': stats.randint(50, 500),
          'learning_rate': stats.uniform(0.001, 0.1)}),
        ('Logistic', 'Logistic Regression', log_model,
         {'C': stats.uniform(0.1, 10),
          'penalty': ['l2'],
          'solver': ['liblinear', 'saga']}),
    ]

    strat = StratifiedKFold(n_splits = folds, random_state = None)

    diz = {}
    for key, label, model, space in candidates:
        pipeline = Pipeline([('sampling', RandomUnderSampler(random_state = random_state)),
                             ('classification', model)])
        # Parameters of the second pipeline step are addressed as classification__<name>.
        random_grid = {'classification__' + name: dist for name, dist in space.items()}
        search = RandomizedSearchCV(pipeline, random_grid, n_iter = epoch, cv = strat, scoring = metric,
                                    n_jobs = n_jobs, verbose = 2, random_state = random_state)
        search.fit(X, y)

        diz[key] = get_val_metrics(search)
        std = search.cv_results_['std_test_score'][search.best_index_]
        print("Best configuration for {}: {}\n Metric score: {} and std score: {}".format(
            label, search.best_params_, search.best_score_, std))

    return diz

In [None]:
# Random search on the BERT embeddings only: 6 CV folds, 10 sampled configurations per classifier.
print('Embedding features')
score = models_evaluation(X_train_embed, y_train, 6, 10)

In [None]:
# Random search on the PP feature matrix; NaNs are replaced with 0 by np.nan_to_num before fitting.
print("PP features")
score2 = models_evaluation(np.nan_to_num(X_train_pp), y_train, 6, 10)


In [None]:
# Random search on the POS + BERT feature matrix (6 folds, 10 sampled configurations).
print("Pos features")
score3 = models_evaluation(X_train_pos, y_train, 6, 10)

In [None]:
# Random search on the PP + POS + BERT feature matrix; NaNs are replaced with 0 first.
print("Pos + PP features")
score4 = models_evaluation(np.nan_to_num(X_train_pp_pos), y_train, 6, 10)

In [None]:
# Random search on the PP + POS + BERT + polarity feature matrix; NaNs are replaced with 0 first.
print("Pos + PP features + Polarity")
score5 = models_evaluation(np.nan_to_num(X_train_pp_pos_pol), y_train, 6, 10)

In [None]:
# Random search on the two polarity-augmented matrices (6 folds, 10 sampled configurations each).
print('Polarity')
score6 = models_evaluation(X_train_pol, y_train, 6, 10)
print('Pos and Polarity')
score7 = models_evaluation(X_train_pos_pol, y_train, 6, 10)


In [None]:
# Random search on X_train_pp_pol; NaNs are replaced with 0 first.
print('PP and Polarity')
score8 = models_evaluation(np.nan_to_num(X_train_pp_pol), y_train, 6, 10)

In [None]:
# Labels and scores for the three polarity feature sets.
# NOTE(review): on a top-to-bottom run both names are reassigned by the next two cells before
# diz_scores is built, so score6-score8 never reach the saved results - confirm the intended cell order.
features_name = ['POL', 'POS+POL', 'PP+POL']
list_score = [score6, score7, score8]

In [None]:
# Random-search results for the five main feature sets; the order must match features_name.
list_score = [score,score2,score3, score4, score5]

In [None]:
# Labels for the five main feature sets, used as the keys of diz_scores.
features_name = ["Embedding","PP","POS","POS+PP","POS+PP+POL"]

In [None]:
# Pair every score dict with the feature-set label at the same position.
diz_scores = {'{}'.format(features_name[pos]): fold_scores
              for pos, fold_scores in enumerate(list_score)}

In [None]:
import pickle

# Persist the per-fold scores collected in diz_scores.
# NOTE(review): the file name starts with "bayes_search" although this cell sits in the
# Random Search section - confirm the intended name.
with open('bayes_search_irony_semeval_otherfeat.p', 'wb') as fp:
    pickle.dump(diz_scores, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# One row per model (its name ends up in the 'index' column) and one column per feature set.
data = pd.DataFrame(diz_scores).reset_index()

In [None]:
def bootstrap(samples, n_bootstrap, size_samples, ic = 0.95, random_state = None):
    """Bootstrap confidence interval around the mean of `samples`.

    Parameters
    ----------
    samples : iterable of numbers, or of equally long sequences of numbers
        Observed scores; everything is flattened into one 1-D pool.
    n_bootstrap : int
        Number of bootstrap resamples.
    size_samples : int
        Size of each resample (the notebook passes the number of CV splits).
    ic : float
        Confidence level, e.g. 0.95.
    random_state : int or None
        Seed for a private RandomState. With None (default) the global
        ``np.random`` state is used, as before.

    Returns
    -------
    dict
        ``{'Mean': ..., 'Lower': ..., 'Upper': ...}`` where the bounds are
        ``mean - percentile(deviations, p)`` of the resampled means' deviations
        from the observed mean.

    Raises
    ------
    ValueError
        If `samples` is empty.
    """
    # ravel (rather than squeeze) so a single value and multi-row input both give a 1-D pool.
    values = np.array(list(samples)).ravel()
    if values.size == 0:
        raise ValueError("bootstrap() needs at least one sample")

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    x_mean = np.mean(values)
    # One draw per resample, in a loop, so the global random stream is consumed as before.
    samples_boot = np.array([np.mean(rng.choice(values, size_samples)) for _ in range(n_bootstrap)])
    scarti = samples_boot - x_mean  # deviations of the resampled means from the observed mean

    v = 100 - ic*100
    pinf = 100 - v/2    # upper percentile; it yields the lower bound after the sign flip
    psup = 100 - pinf   # lower percentile; it yields the upper bound

    diz = {}
    diz['Mean'] = x_mean
    diz['Lower'] = x_mean - np.percentile(scarti, pinf)
    diz['Upper'] = x_mean - np.percentile(scarti, psup)
    return diz

In [None]:
def compute_boostrap(df, column):
    """Bootstrap the per-fold scores of every model for one feature-set column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an 'index' column holding the model names and a `column`
        whose cells hold each model's per-fold scores.
    column : str
        Name of the feature-set column to summarise.

    Returns
    -------
    pandas.DataFrame
        One row per model ('Hgboost', 'RandomForest', 'XgBoost', 'AdaBoost',
        'Logistic') with the 'Mean', 'Lower' and 'Upper' values returned by
        bootstrap (50 resamples of size 6, 95% level).
    """
    # Output label -> spellings accepted in df['index']. The random search stores
    # AdaBoost results as 'AdaBoost' and the Bayesian search as 'Adaboost';
    # matching only one spelling left the other run with an empty selection.
    model_aliases = {
        'Hgboost': ('Hgboost',),
        'RandomForest': ('RandomForest',),
        'XgBoost': ('XgBoost',),
        'AdaBoost': ('AdaBoost', 'Adaboost'),
        'Logistic': ('Logistic',),
    }

    diz = {}
    for label, aliases in model_aliases.items():
        rows = df[df['index'].isin(aliases)]
        diz[label] = bootstrap(rows[column], 50, 6, 0.95)
    return pd.DataFrame(diz).T

In [None]:
# Bootstrap summaries for the three polarity feature sets, each tagged with its label.
# NOTE(review): requires `data` to contain 'POL', 'POS+POL' and 'PP+POL' columns, i.e. diz_scores
# built from the polarity feature-name list - confirm the cell order before running.
embed = compute_boostrap(data, "POL").reset_index()
embed2 = compute_boostrap(data, "POS+POL").reset_index()
embed3 = compute_boostrap(data, "PP+POL").reset_index()
embed['Features'] = 'POL'
embed2['Features'] = 'POS+POL'
embed3['Features'] = 'PP+POL'

In [None]:
# Bootstrap summaries (one frame per feature set) for the five main feature sets.
# These reassign embed, embed2 and embed3 from the previous cell.
embed = compute_boostrap(data, "Embedding").reset_index()
embed2 = compute_boostrap(data, "PP").reset_index()
embed3 = compute_boostrap(data, "POS").reset_index()
embed4 = compute_boostrap(data, "POS+PP").reset_index()
embed5 = compute_boostrap(data, "POS+PP+POL").reset_index()

In [None]:
# Tag every summary frame with the feature set it was computed on.
embed['Features'] = 'Embedding'
embed2['Features'] = 'PP'
embed3['Features'] = 'POS'
embed4['Features'] = 'POS+PP'
embed5['Features'] = 'POS+PP+POL'

In [None]:
# Stack the first three summaries.
# NOTE(review): the next cell reassigns final_score with all five frames, so this value is discarded on a linear run.
final_score = pd.concat([embed,embed2,embed3])

In [None]:
# Stack the five per-feature-set summaries into one long table.
final_score = pd.concat([embed, embed2, embed3, embed4, embed5])

In [None]:
# Write the random-search summary table to disk (row index not written).
final_score.to_csv('../data/Optimization/Irony/random_search_semeval_otherfeat.csv', index = False)

# Bayesian Optimization

In [None]:
from skopt import BayesSearchCV
def bayes_opt(X_train, y_train, fold, n_iter):
    """Tune five classifiers with BayesSearchCV and collect their per-fold scores.

    A fresh estimator of each kind is searched with ``scoring='accuracy'``,
    ``random_state=1``, ``n_jobs=6`` and ``refit=True``. After every search the
    best parameters, best score and the standard deviation of the test score
    at the best index are printed.

    Parameters
    ----------
    X_train : feature matrix
    y_train : target vector
    fold : value passed as ``cv`` to BayesSearchCV
    n_iter : number of search iterations per classifier

    Returns
    -------
    dict
        Maps 'Logistic', 'XgBoost', 'Hgboost', 'RandomForest' and 'Adaboost'
        to the per-fold test scores of the best configuration, as returned by
        get_val_metrics.

    Notes
    -----
    The report is printed once each search has finished fitting, instead of
    from a fit callback driven by a module-level counter, so it no longer
    depends on how often the callback fires or on the search exposing its
    results while it is still running.
    """
    # Value lists shared by several search spaces.
    n_estimators_values = [int(x) for x in np.linspace(50, 300, num=251)]
    learning_rate_values = np.linspace(1e-3, 1, num=500)
    max_depth_values = list(range(3,21))
    leaf_values = list(range(1,21))

    # The np.random.uniform draws stay in their original order (XGBoost subsample,
    # XGBoost colsample_bytree, logistic C) so a seeded global state yields the same values.
    rand_list_xgb = {'n_estimators': n_estimators_values,
              'learning_rate': learning_rate_values,
              'subsample': [np.random.uniform(0.3, 0.9) for _ in range(200)],
              'max_depth': max_depth_values,
              'colsample_bytree': [np.random.uniform(0.5, 0.9) for _ in range(200)],
              'min_child_weight': leaf_values
             }
    rand_list_reg = {'C': [np.random.uniform(0.1, 10) for _ in range(200)], 'penalty' : ['l2']}
    rand_list_hist = {'max_depth': max_depth_values, 'min_samples_leaf': leaf_values,
                      'learning_rate': learning_rate_values}
    rand_list_rf = {'max_depth': max_depth_values, 'min_samples_leaf': leaf_values,
                    'min_samples_split': list(range(2,21)), 'n_estimators': n_estimators_values}
    rand_list_ada = {'n_estimators': n_estimators_values, 'learning_rate': learning_rate_values}

    # (result key, label printed before the score, estimator, search space), in fitting order.
    searches = [
        ('Logistic', " Logistic regression score:", LogisticRegression(max_iter=500), rand_list_reg),
        ('XgBoost', " Xgboost score:", xgb.XGBClassifier(objective = 'binary:logistic'), rand_list_xgb),
        ('Hgboost', "Hist GB score:", HistGradientBoostingClassifier(), rand_list_hist),
        ('RandomForest', " Random Forest score:", RandomForestClassifier(), rand_list_rf),
        ('Adaboost', " Ada Boost score:", AdaBoostClassifier(), rand_list_ada),
    ]

    diz = {}
    for key, label, model, space in searches:
        search = BayesSearchCV(model, space, n_iter=n_iter, cv=fold,
                               random_state=1, n_jobs=6, refit=True, scoring = 'accuracy')
        search.fit(X_train, y_train)

        std = search.cv_results_['std_test_score'][search.best_index_]
        print("Params:", search.best_params_, label, search.best_score_, ' standard dev: ', std)
        diz[key] = get_val_metrics(search)

    return diz

In [None]:
# Bayesian search on the BERT embeddings only: cv=6, 10 search iterations per classifier.
# Reassigns `score` from the random-search section.
print('Embedding features')
score = bayes_opt(X_train_embed, y_train, 6, 10)

In [None]:
# Bayesian search on the PP feature matrix; NaNs are replaced with 0 first.
print("PP features")
score2 = bayes_opt(np.nan_to_num(X_train_pp), y_train, 6, 10)

In [None]:
# Bayesian search on the POS + BERT feature matrix.
print("Pos features")
score3 = bayes_opt(X_train_pos, y_train, 6, 10)

In [None]:
# Bayesian search on the PP + POS + BERT feature matrix; NaNs are replaced with 0 first.
print("Pos + PP features")
score4 = bayes_opt(np.nan_to_num(X_train_pp_pos), y_train, 6, 10)

In [None]:
# Bayesian search on the PP + POS + BERT + polarity feature matrix; NaNs are replaced with 0 first.
print("Pos + PP features + Polarity")
score5 = bayes_opt(np.nan_to_num(X_train_pp_pos_pol), y_train, 6, 10)

In [None]:
# Bayesian search on the three polarity-augmented matrices.
# NOTE(review): score6-score8 are not included in the list_score assembled in the next cell.
score6 = bayes_opt(X_train_pol, y_train, 6, 10)
score7 = bayes_opt(X_train_pos_pol, y_train, 6, 10)
score8 = bayes_opt(np.nan_to_num(X_train_pp_pol), y_train, 6, 10)

In [None]:
# Feature-set labels and the Bayesian-search results, listed in the same order.
features_name = ["Embedding","PP","POS","POS+PP","POS+PP+POL"]
list_score = [score,score2,score3, score4, score5]
# Key each result dict by the label found at the same position.
diz_scores = {'{}'.format(features_name[pos]): fold_scores
              for pos, fold_scores in enumerate(list_score)}

In [None]:
import pickle

# Persist the Bayesian-search per-fold scores collected in diz_scores.
with open('bayesian_search_irony_semeval.p', 'wb') as fp:
    pickle.dump(diz_scores, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# One row per model (its name ends up in the 'index' column) and one column per feature set.
# Reassigns `data` from the random-search section.
data = pd.DataFrame(diz_scores).reset_index()

In [None]:
# Bootstrap summaries of the Bayesian-search scores, one frame per feature set.
embed = compute_boostrap(data, "Embedding").reset_index()
embed2 = compute_boostrap(data, "PP").reset_index()
embed3 = compute_boostrap(data, "POS").reset_index()
embed4 = compute_boostrap(data, "POS+PP").reset_index()
embed5 = compute_boostrap(data, "POS+PP+POL").reset_index()

In [None]:
# Tag every summary frame with the feature set it was computed on.
embed['Features'] = 'Embedding'
embed2['Features'] = 'PP'
embed3['Features'] = 'POS'
embed4['Features'] = 'POS+PP'
embed5['Features'] = 'POS+PP+POL'

In [None]:
# Stack the five per-feature-set summaries into one long table.
final_score = pd.concat([embed, embed2, embed3, embed4, embed5])

In [None]:
# Write the Bayesian-search summary table to disk (row index not written).
final_score.to_csv('../data/Optimization/Irony/bayes_search_semeval_otherfeat.csv', index = False)

# Evaluation

### BMA JAVA INPUT

In [None]:
def normalize_lab(x):
    """Translate labels into the 'index:label' strings used in the BMA prediction files.

    A label equal to 0 becomes '1:1'; any other label becomes '2:0'.
    Returns a list with one string per element of `x`.
    """
    return ['1:1' if label == 0 else '2:0' for label in x]

In [None]:
def mark_error(actual, predicted):
    """Flag the positions where the prediction differs from the ground truth.

    Returns a list, as long as the shorter input, holding '+' where the two
    values differ and ``np.nan`` where they agree.
    """
    return ['+' if truth != guess else np.nan for truth, guess in zip(actual, predicted)]

In [None]:
def get_proba_distrib(clf_proba):
    """Format two-column class probabilities as strings, starring the larger one.

    Each probability is rounded to 5 decimals and converted to text. The first
    column gets a leading '*' only when it is strictly greater than the second;
    otherwise (ties included) the second column is starred.

    Returns a numpy array of strings with one [first, second] row per input row.
    """
    starred_rows = []
    for p_first, p_second in zip(clf_proba[:,0], clf_proba[:,1]):
        first_txt = str(p_first.round(5))
        second_txt = str(p_second.round(5))
        if p_first > p_second:
            first_txt = '*{}'.format(first_txt)
        else:
            second_txt = '*{}'.format(second_txt)
        starred_rows.append([first_txt, second_txt])

    return np.array(starred_rows)

In [None]:
def get_outpupt_bma(clf, x, ground_truth):
    """Build the prediction table exported for the BMA step.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict`` and ``predict_proba``
    x : feature matrix to score
    ground_truth : true labels for the rows of `x`

    Returns
    -------
    pandas.DataFrame
        Columns 'inst#', 'actual', 'predicted', 'error', 'distribution' and ''
        (the empty-named column holds the second class probability).
    """
    pred = clf.predict(x)

    actual = normalize_lab(ground_truth)
    predicted = normalize_lab(pred)
    error = mark_error(actual, predicted)

    # Instance numbers run 1..cycle and then restart, where cycle = int(len(x)/10) + 1.
    cycle = int(len(x)/10) + 1
    instanc = [(position % cycle) + 1 for position in range(len(x))]

    distribution = get_proba_distrib(clf.predict_proba(x))

    final_df = pd.DataFrame(instanc, columns=['inst#'])
    remaining_columns = (('actual', actual),
                         ('predicted', predicted),
                         ('error', error),
                         ('distribution', distribution[:, 0]),
                         ('', distribution[:,1]))
    for column_name, column_values in remaining_columns:
        final_df[column_name] = column_values

    return final_df

### Training models for irony with the best hyperparameters identified

In [None]:
# Final models, instantiated with the best hyper-parameters found by the searches above.
xgb_model = xgb.XGBClassifier(objective = 'binary:logistic', colsample_bytree = 0.8020639316513152, learning_rate = 0.063062124248497, max_depth= 9, min_child_weight= 4, n_estimators= 173, subsample= 0.838253733300651) #bs, pos, pp, polarity
randomf_model = RandomForestClassifier(max_depth = 15, min_samples_leaf = 9, min_samples_split = 4,n_estimators = 13) #bs, pos, pp, pp
hist_model =  HistGradientBoostingClassifier(learning_rate = 0.11711623246492987,max_depth = 14, min_samples_leaf = 13) #bs, pos, polarity, pp 
# max_iter=500 matches the estimator that was used while tuning C; the default iteration
# budget would refit a different (possibly unconverged) model.
logi_model = LogisticRegression(C = 0.888809044392379, penalty = 'l2', max_iter = 500)  #bs, pos, polarity, pp 
ada_model = AdaBoostClassifier(learning_rate = 0.683683366733467, n_estimators = 134) #bs, pos pp pola

In [None]:
# Fit every tuned classifier on the POS + BERT training matrix.
# NOTE(review): the trailing comments on the model definitions mention pp / polarity features,
# while all five models are fitted on X_train_pos - confirm this is the intended feature set.
xgb_model.fit(X_train_pos, y_train)
randomf_model.fit(X_train_pos, y_train)
hist_model.fit(X_train_pos, y_train)
logi_model.fit(X_train_pos, y_train)
ada_model.fit(X_train_pos, y_train)

### Test Irony data

In [None]:
# Load the pre-computed test features; the next cell indexes `semeval` by feature-group name.
# NOTE(review): pickle.load can execute arbitrary code - only open files from a trusted source.
with open('../data/semeval_test3a_irony_onlysemeval.p', 'rb') as handle:
    semeval = pickle.load(handle)

In [None]:
# Building blocks shared by the test-set feature matrices.
_sem_emoji = semeval['emoji']
# The two emoji polarity vectors are expanded to column vectors so they can be stacked on axis 1.
_sem_emoji_cols = [_sem_emoji['emoji'],
                   np.expand_dims(_sem_emoji['emoji_positive'], axis = 1),
                   np.expand_dims(_sem_emoji['emoji_negative'], axis = 1)]
_sem_surface_cols = [semeval['punc'], semeval['onom'], semeval['init']]

# POS + BERT (same column order as X_train_pos).
semeval_pos = np.concatenate([semeval['pos'], semeval['bert_embed']], axis = 1)

# PP + POS + BERT + polarity.
semeval_pp_pos_pol = np.concatenate([*_sem_emoji_cols, semeval['pos'], *_sem_surface_cols,
                                     semeval['bert_embed'], semeval['polarity']], axis = 1)

# PP + POS + BERT.
semeval_pp_pos = np.concatenate([*_sem_emoji_cols, semeval['pos'], *_sem_surface_cols,
                                 semeval['bert_embed']], axis = 1)

y_semeval = semeval['label']

In [None]:
# Build one BMA prediction table per fitted model on the POS + BERT test matrix.
xgb_output = get_outpupt_bma(xgb_model, semeval_pos, y_semeval)
hist_output = get_outpupt_bma(hist_model, semeval_pos, y_semeval)
rf_output = get_outpupt_bma(randomf_model, semeval_pos, y_semeval)
ada_output = get_outpupt_bma(ada_model, semeval_pos, y_semeval)
logi_output = get_outpupt_bma(logi_model, semeval_pos, y_semeval)

In [None]:
# Export each prediction table as a CSV input file for the BMA step (row index not written).
xgb_output.to_csv('../Code/BMA/results_semeval/input/prediction_file/xgb_labels_semeval.csv', index = False)
hist_output.to_csv('../Code/BMA/results_semeval/input/prediction_file/hist_labels_semeval.csv', index = False)
rf_output.to_csv('../Code/BMA/results_semeval/input/prediction_file/rf_labels_semeval.csv', index = False)
ada_output.to_csv('../Code/BMA/results_semeval/input/prediction_file/ada_labels_semeval.csv', index = False)
logi_output.to_csv('../Code/BMA/results_semeval/input/prediction_file/logi_labels_semeval.csv', index = False)

In [None]:

# classification_report expects (y_true, y_pred); with the arguments swapped the
# per-class precision and recall columns are exchanged.
print(classification_report(y_semeval, randomf_model.predict(semeval_pos)))

In [None]:
# f1_score expects (y_true, y_pred); the binary F1 value is the same either way,
# but the documented argument order keeps this safe if the averaging options change.
f1_score(y_semeval, logi_model.predict(semeval_pos))