In [67]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from pathlib import Path
import os
import optuna
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, recall_score
import warnings
# Silence every UserWarning for the rest of the notebook session.
warnings.filterwarnings("ignore", category=UserWarning)
# Render floats in pandas output with six decimal places.
pd.set_option('display.float_format', '{:.6f}'.format)
# Base directory for all relative data paths below; Path('') is the current working directory.
root = Path('')

In [68]:
# Integer code assigned to each label.
encoder = dict(zip(('low', 'mixed', 'high'), range(3)))
# Label names in encoder insertion order; used below as the probability column names.
classes = list(encoder)

### Best model for articles: BERT (bert-base-uncased)

In [69]:
# Load the pickled train/test prediction frames of the article model (bert-base-uncased).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
X_train_article = pd.read_pickle(root / 'probability/bert-base-uncased_train_prob_fact_article.pkl')
X_test_article = pd.read_pickle(root / 'probability/bert-base-uncased_test_prob_fact_article.pkl')

### Best model for descriptions: DeBERTa v3 (deberta-v3-base)

In [70]:
# Load the pickled train/test prediction frames of the description model (deberta-v3-base).
# NOTE(review): unpickling can execute arbitrary code — only load files from a trusted source.
X_train_des = pd.read_pickle(root / 'probability/deberta-v3-base_train_prob_fact_description.pkl')
X_test_des = pd.read_pickle(root / 'probability/deberta-v3-base_test_prob_fact_description.pkl')

In [71]:
def merge_dataset(df, df_1):
    """Average the class probabilities of two prediction frames.

    The frames are joined on ``website`` and ``target`` (pandas' default
    inner join), so their shared ``low``/``mixed``/``high`` columns arrive
    with the default ``_x``/``_y`` suffixes. Each suffixed pair is replaced
    by its arithmetic mean.

    Args:
        df: first prediction frame (its columns receive the ``_x`` suffix).
        df_1: second prediction frame (its columns receive the ``_y`` suffix).

    Returns:
        A new frame holding ``target``, ``website`` and one averaged column
        per entry of the notebook-level ``classes`` list. The inputs are not
        modified.
    """
    joined = df.merge(df_1, on=['website', 'target'])
    averaged = joined.assign(
        low=(joined['low_x'] + joined['low_y']) / 2,
        mixed=(joined['mixed_x'] + joined['mixed_y']) / 2,
        high=(joined['high_x'] + joined['high_y']) / 2,
    )
    return averaged[['target', 'website'] + classes]

In [72]:
# Combine the article and description model outputs for each split.
X_train, X_test = (
    merge_dataset(X_train_article, X_train_des),
    merge_dataset(X_test_article, X_test_des),
)
# Label column of each split.
y_train, y_test = X_train['target'], X_test['target']

In [73]:
# Display the merged training frame.
X_train

Unnamed: 0,target,website,low,mixed,high
0,high,inthesetimes.com,0.041575,0.041129,0.917296
1,high,alreporter.com,0.129161,0.184921,0.685918
2,high,nymag.com,0.016432,0.040486,0.943082
3,high,investopedia.com,0.027289,0.040707,0.932004
4,high,newpol.org,0.030245,0.040960,0.928795
...,...,...,...,...,...
768,mixed,theamericanmirror.com,0.153471,0.436085,0.410444
769,high,americanmilitarynews.com,0.098004,0.308399,0.593597
770,low,researchantisemitism.ca,0.550774,0.306654,0.142572
771,mixed,taiwannews.com.tw,0.140939,0.570158,0.288903


In [85]:
# Keyword arguments passed to LogisticRegression in the cells below.
# NOTE(review): with 'penalty': None scikit-learn fits an unregularised model
# and ignores 'C' — confirm this combination is intended.
best_params = {
    'C': 0.025920931494754832,
    'tol': 6.802290436130853e-05,
    'fit_intercept': True,
    'solver': 'sag',
    'max_iter': 251,
    'class_weight': 'balanced',
    'penalty': None,
    'random_state': 42,
}

In [86]:
# Fit the logistic-regression model on the averaged class probabilities.
clf = LogisticRegression(**best_params).fit(X_train[classes], X_train['target'])

# Predict the held-out split.
y_test = X_test['target']
y_pred = clf.predict(X_test[classes])

# Accuracy plus macro-averaged F1 and recall.
accuracy = accuracy_score(y_test, y_pred)
macro_f1 = f1_score(y_test, y_pred, average='macro')
avg_recall = recall_score(y_test, y_pred, average='macro')

print("Accuracy:", accuracy)
print("Macro-F1 Score:", macro_f1)
print("Average Recall:", avg_recall)

Accuracy: 0.8421052631578947
Macro-F1 Score: 0.7972049261160651
Average Recall: 0.7654135338345865


### Step 3

In [87]:
def read_models_file(X_train, num_model='M1', base_dir=None):
    """Build one augmented training frame per CSV file of a step-3 model.

    Every file whose name ends in ``csv`` inside
    ``<base_dir>/step_3/<num_model>`` is read, its ``logit_1`` / ``logit_2`` /
    ``logit_3`` columns are renamed to ``low`` / ``high`` / ``mixed``, and a
    ``target`` column is added that holds, for each row, the name of the
    column with the largest value among all columns after the first one.
    The labelled rows are then appended below ``X_train``.

    Args:
        X_train: frame the labelled rows are appended to; it is not modified.
        num_model: name of the sub-directory of ``step_3`` to read.
        base_dir: directory that contains ``step_3``. Defaults to the
            notebook-level ``root``.

    Returns:
        A list with one DataFrame per CSV file, ordered by sorted file name
        so that positions in the list are reproducible between runs.
    """
    renames = {'logit_1': 'low', 'logit_2': 'high', 'logit_3': 'mixed'}
    model_dir = Path(root if base_dir is None else base_dir) / 'step_3' / num_model

    files = []
    # os.listdir returns entries in arbitrary order; sort for a stable result.
    for file in sorted(os.listdir(model_dir)):
        if not file.endswith('csv'):
            continue
        df = pd.read_csv(model_dir / file).rename(columns=renames)
        # The first column is skipped when picking the label
        # (presumably the website identifier — TODO confirm against the CSVs).
        scores = df.iloc[:, 1:]
        # Row-wise arg-max over the score columns, returned as column names.
        df['target'] = scores.idxmax(axis=1)
        files.append(pd.concat([X_train, df]))

    return files

def evaluate(files: list[pd.DataFrame], X_test: pd.DataFrame, params_log_reg=None):
    """Fit and score one LogisticRegression per training frame.

    For every frame in ``files`` a fresh model is fitted on the columns named
    in the notebook-level ``classes`` list against the ``target`` column, then
    scored on ``X_test``. Accuracy, macro-F1 and macro-averaged recall are
    printed per frame, followed by the standard deviation of each metric
    across all frames.

    Args:
        files: training frames; each needs the ``classes`` columns and ``target``.
        X_test: evaluation frame with the same columns.
        params_log_reg: keyword arguments for ``LogisticRegression``;
            ``None`` (the default) means no extra arguments.

    Returns:
        None. Results are only printed.
    """
    # A None sentinel avoids sharing one mutable default dict between calls.
    params = {} if params_log_reg is None else params_log_reg

    # The evaluation split is the same for every run, so slice it once.
    X_eval = X_test[classes]
    y_test = X_test['target']

    accuracy_list = []
    macro_f1_list = []
    avg_recall_list = []
    for index, X_train in enumerate(files):
        clf = LogisticRegression(**params)
        clf.fit(X_train[classes], X_train['target'])
        y_pred = clf.predict(X_eval)

        accuracy = accuracy_score(y_test, y_pred)
        macro_f1 = f1_score(y_test, y_pred, average='macro')
        avg_recall = recall_score(y_test, y_pred, average='macro')
        accuracy_list.append(accuracy)
        macro_f1_list.append(macro_f1)
        avg_recall_list.append(avg_recall)

        print(f'Index: {index}')
        print("Accuracy:", accuracy)
        print("Macro-F1 Score:", macro_f1)
        print("Average Recall:", avg_recall)

    accuracy_std = np.std(accuracy_list)
    macro_f1_std = np.std(macro_f1_list)
    avg_recall_std = np.std(avg_recall_list)

    print(f"Standard Deviation of Accuracy: {accuracy_std}")
    print(f"Standard Deviation of Macro-F1 Score: {macro_f1_std}")
    print(f"Standard Deviation of Average Recall: {avg_recall_std}")
    print()
        

#### Model m1

In [91]:
# Build one augmented training set per CSV under step_3/M1 and score each on X_test.
files = read_models_file(X_train, num_model='M1')
evaluate(files, X_test, best_params)

Index: 0
Accuracy: 0.7602339181286549
Macro-F1 Score: 0.7151446683299434
Average Recall: 0.6947786131996659
Index: 1
Accuracy: 0.7953216374269005
Macro-F1 Score: 0.713952401276345
Average Recall: 0.6708437761069339
Index: 2
Accuracy: 0.8362573099415205
Macro-F1 Score: 0.7827733961509938
Average Recall: 0.7479114452798662
Index: 3
Accuracy: 0.8421052631578947
Macro-F1 Score: 0.7940487572774568
Average Recall: 0.7566833751044276
Index: 4
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.7783883679847806
Average Recall: 0.7416040100250626
Standard Deviation of Accuracy: 0.030944459778533228
Standard Deviation of Macro-F1 Score: 0.03492623457819747
Standard Deviation of Average Recall: 0.03351404627396947



#### Model m2

In [89]:
# Build one augmented training set per CSV under step_3/M2 and score each on X_test.
files = read_models_file(X_train, num_model='M2')
evaluate(files, X_test, best_params)

Index: 0
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.784319899799776
Average Recall: 0.7503341687552213
Index: 1
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.784319899799776
Average Recall: 0.7503341687552213
Index: 2
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.784319899799776
Average Recall: 0.7503341687552213
Index: 3
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.784319899799776
Average Recall: 0.7503341687552213
Index: 4
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.784319899799776
Average Recall: 0.7503341687552213
Standard Deviation of Accuracy: 0.0
Standard Deviation of Macro-F1 Score: 0.0
Standard Deviation of Average Recall: 0.0



#### Model m3

In [90]:
# Build one augmented training set per CSV under step_3/M3 and score each on X_test.
files = read_models_file(X_train, num_model='M3')
evaluate(files, X_test, best_params)

Index: 0
Accuracy: 0.8245614035087719
Macro-F1 Score: 0.7771583107967547
Average Recall: 0.7471595655806181
Index: 1
Accuracy: 0.8245614035087719
Macro-F1 Score: 0.7771583107967547
Average Recall: 0.7471595655806181
Index: 2
Accuracy: 0.8245614035087719
Macro-F1 Score: 0.7771583107967547
Average Recall: 0.7471595655806181
Index: 3
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.784319899799776
Average Recall: 0.7503341687552213
Index: 4
Accuracy: 0.8304093567251462
Macro-F1 Score: 0.782002754278969
Average Recall: 0.7503341687552213
Standard Deviation of Accuracy: 0.002864900283956906
Standard Deviation of Macro-F1 Score: 0.00303077635246345
Standard Deviation of Average Recall: 0.0015552315827194727

