In [1]:
import warnings
# Install a global 'ignore' filter: every warning category is suppressed for
# all cells that run after this one.
# NOTE(review): this is a blanket filter, so warnings raised while fitting
# models further down are hidden as well - confirm that is intended.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.metrics import f1_score, accuracy_score

In [3]:
def load_predictions(dataset_name, partition):
    """Load the saved per-method probabilities of one dataset partition.

    Reads ``./Saved Predict and Proba/<DATASET>/prob_<partition>_<dataset>.csv``
    (directory name upper-cased, file suffix lower-cased) and splits the
    label column from the remaining columns.

    Parameters
    ----------
    dataset_name : str
        Dataset key, matched case-insensitively. For ``'zw'`` the label
        column is ``'norm'``; for every other dataset it is ``'class'``.
    partition : str
        Partition name inserted into the file name.

    Returns
    -------
    label : pandas.Series
        The label column of the CSV.
    methods : pandas.DataFrame
        Every other column, with the ``'Unnamed: 0'`` column removed when
        the CSV contains one.

    Raises
    ------
    KeyError
        If the expected label column is missing from the CSV.
    """
    # Normalize once so the label-column choice agrees with the path, which
    # was already case-insensitive through upper()/lower().
    dataset_key = dataset_name.lower()
    path = './Saved Predict and Proba/{}/prob_{}_{}.csv'.format(
        dataset_name.upper(), partition, dataset_key)
    table_pred = pd.read_csv(path)

    label_column = 'norm' if dataset_key == 'zw' else 'class'
    label = table_pred[label_column]

    # 'Unnamed: 0' only exists when the CSV was written with its index;
    # errors='ignore' keeps files saved with index=False loadable.
    methods = table_pred.drop(columns=[label_column, 'Unnamed: 0'], errors='ignore')
    return label, methods

def load_dataset(dataset_name):
    """Load the train, val and test prediction tables of a dataset.

    Each partition is read through ``load_predictions``, in the order
    train, val, test.

    Returns
    -------
    tuple
        ``(label_train, probas_train, label_test, probas_test,
        label_val, probas_val)`` - note that the test pair comes before
        the validation pair.
    """
    train_pair, val_pair, test_pair = (
        load_predictions(dataset_name, split) for split in ('train', 'val', 'test')
    )
    # Flatten the (label, probas) pairs into the 6-tuple callers unpack.
    return (*train_pair, *test_pair, *val_pair)

def filter_df_train_test(train_df, test_df, name, filter_first=True):
    """Keep only the columns whose label matches the regex ``name``.

    The same ``DataFrame.filter(regex=name, axis=1)`` selection is applied
    to both frames.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames to filter column-wise.
    name : str
        Regular expression matched against the column labels.
    filter_first : bool, optional
        Accepted but not used by this function.

    Returns
    -------
    tuple
        ``(filtered_train, filtered_test)``.
    """
    return tuple(frame.filter(regex=name, axis=1) for frame in (train_df, test_df))

# Get dataset

In [81]:
# Per-dataset regex alternation ('A|B|...') of the prediction columns that
# are passed to the stacking step as the column filter.
names_Z = {
    'td': 'CNN-W2V|NB-TFIDF|KNN-FAST|NB-GLOVE|LR-W2V|NB-FAST|CNN-TFIDF|KNN-TFIDF|NB-W2V|KNN-CV',
    'zw': 'NB-GLOVE|LR-GLOVE|NB-W2V|SVM-CV|MLP-TFIDF|SVM-FAST|MLP-W2V|NB-CV|SVM-GLOVE|KNN-FAST|SVM-W2V|RF-CV',
    'td_zw': 'NB-W2V|KNN-FAST|LR-FAST|KNN-GLOVE|CNN-CV|EXTRA-W2V|CNN-GLOVE|KNN-CV|NB-GLOVE|CNN-TFIDF|LR-W2V|KNN-TFIDF|NB-TFIDF',
}

In [78]:
def stacking(dataset_name):
    """Fit the stacking meta-classifiers on validation data and score them on test.

    The validation and test probability tables of ``dataset_name`` are
    reduced to the columns matched by ``names_Z[dataset_name]``. Each
    classifier in ``all_stacking`` is fitted on the validation columns and
    evaluated on the test columns with the macro-averaged F1 score.

    Parameters
    ----------
    dataset_name : str
        Key into ``names_Z``; also forwarded to ``load_dataset``.

    Returns
    -------
    pandas.DataFrame
        One row indexed by ``dataset_name`` and one column per entry of
        ``all_stacking_names``, holding the macro F1 on the test partition.

    Raises
    ------
    KeyError
        If ``dataset_name`` has no entry in ``names_Z``.
    """
    # The train partition is loaded by load_dataset but not used here.
    _, _, label_test, probas_test, label_val, probas_val = load_dataset(dataset_name)
    all_stacking = [LogisticRegressionCV(class_weight='balanced', cv=10, scoring='f1_macro', n_jobs=5)]
    all_stacking_names = ['Stacking LR']

    names = names_Z[dataset_name]
    results_Z = np.zeros(len(all_stacking))
    X_val, X_test = filter_df_train_test(probas_val, probas_test, names)
    for idx_clf, clf in enumerate(all_stacking):
        # The meta-classifier is trained on validation predictions only.
        clf.fit(X_val, label_val)
        y_pred = clf.predict(X_test)
        results_Z[idx_clf] = f1_score(label_test, y_pred, average='macro')

    # reshape(1, -1) gives one row with as many columns as classifiers; the
    # former reshape(1, 1) failed once all_stacking held more than one entry.
    group_Z_df = pd.DataFrame(results_Z.reshape(1, -1), columns=all_stacking_names, index=[dataset_name])
    return group_Z_df


In [82]:
# Run the stacking evaluation once per dataset key, in the order td, zw, td_zw.
group_TD_df, group_ZW_df, group_TD_ZW_df = (
    stacking(dataset_key) for dataset_key in ('td', 'zw', 'td_zw')
)

In [83]:
# Stack the three one-row result frames into a single table (one row per dataset).
pd.concat((group_TD_df, group_ZW_df, group_TD_ZW_df), axis=0)

Unnamed: 0,Stacking LR
td,0.711346
zw,0.782641
td_zw,0.87865
