In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from pathlib import Path
import os
import numpy as np
import sys
from sklearn.metrics import accuracy_score, f1_score, recall_score
root = Path('')
encoder = {'left': 0, 'center': 1, 'right': 2}
classes = list(encoder.keys())

In [7]:
def change_prob_to_zero(X_train, X_test):
    df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'bias', 'train.csv'))
    df_2 = pd.read_pickle(os.path.join(os.path.dirname(os.getcwd()), 'Data', 'article.pkl'))
    df_2['website'] = df_2['source_url'].str.strip('https://').str.strip('www.')

    to_zero_train = df[~df['website'].isin(df_2['website'])][['website', 'target']]
    mask_train = X_train['website'].isin(to_zero_train['website']) & X_train['target'].isin(to_zero_train['target'])

    df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'bias', 'test.csv'))
    df_2 = pd.read_pickle(os.path.join(os.path.dirname(os.getcwd()), 'Data', 'article.pkl'))
    df_2['website'] = df_2['source_url'].str.strip('https://').str.strip('www.')

    to_zero_test = df[~df['website'].isin(df_2['website'])][['website', 'target']]
    mask_test = X_test['website'].isin(to_zero_test['website']) & X_test['target'].isin(to_zero_test['target'])
    X_train.loc[mask_train, classes] = 0
    X_test.loc[mask_test, classes] = 0
    return X_train, X_test

def log_reg(X_train, X_test):
    clf = LogisticRegression()
    clf.fit(X_train[classes], X_train['target'])
    y_pred = clf.predict(X_test[classes])
    y_test = X_test['target']
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    avg_recall = recall_score(y_test, y_pred, average='macro')
    print("Accuracy:", accuracy)
    print("Macro-F1 Score:", macro_f1)
    print("Average Recall:", avg_recall)

### Bert - article

In [8]:
X_train = pd.read_pickle(root / 'probability/bert-base-uncased_train_prob_bias_article.pkl')
X_test = pd.read_pickle(root / 'probability/bert-base-uncased_test_prob_bias_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6569767441860465
Macro-F1 Score: 0.633422994689289
Average Recall: 0.6241247505327605


### Bert description

In [9]:
X_train = pd.read_pickle(root / 'probability/bert-base-uncased_train_prob_bias_description.pkl')
X_test = pd.read_pickle(root / 'probability/bert-base-uncased_test_prob_bias_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6337209302325582
Macro-F1 Score: 0.5965277777777778
Average Recall: 0.5914752887456353


### RoBerta - article

In [13]:
X_train = pd.read_pickle(root / 'probability/berta-base_train_prob_bias_article.pkl')
X_test = pd.read_pickle(root / 'probability/berta-base_test_prob_bias_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6744186046511628
Macro-F1 Score: 0.653750111676941
Average Recall: 0.6391134563850459


### RoBerta description

In [14]:
X_train = pd.read_pickle(root / 'probability/berta-base_train_prob_bias_description.pkl')
X_test = pd.read_pickle(root / 'probability/berta-base_test_prob_bias_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.627906976744186
Macro-F1 Score: 0.5870340946175122
Average Recall: 0.5869090786999731


### Distil Bert - article

In [16]:
X_train = pd.read_pickle(root / 'probability/distilbert-base-uncased_train_prob_bias_article.pkl')
X_test = pd.read_pickle(root / 'probability/distilbert-base-uncased_test_prob_bias_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6802325581395349
Macro-F1 Score: 0.6503927008709816
Average Recall: 0.6401113253428647


### Distil Bert description

In [17]:
X_train = pd.read_pickle(root / 'probability/distilbert-base-uncased_train_prob_bias_description.pkl')
X_test = pd.read_pickle(root / 'probability/distilbert-base-uncased_test_prob_bias_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.627906976744186
Macro-F1 Score: 0.5872171563660925
Average Recall: 0.5869090786999731


### DeBerta v3 - article

In [19]:
X_train_article = pd.read_pickle(root / 'probability/deberta-v3-base_train_prob_bias_article.pkl')
X_test_article = pd.read_pickle(root / 'probability/deberta-v3-base_test_prob_bias_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.627906976744186
Macro-F1 Score: 0.5872171563660925
Average Recall: 0.5869090786999731


### DeBerta v3 description

In [20]:
X_train_article = pd.read_pickle(root / 'probability/deberta-v3-base_train_prob_bias_description.pkl')
X_test_article = pd.read_pickle(root / 'probability/deberta-v3-base_test_prob_bias_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.627906976744186
Macro-F1 Score: 0.5872171563660925
Average Recall: 0.5869090786999731


### Best model merge

In [22]:
X_train_article = pd.read_pickle(root / 'probability/deberta-v3-base_train_prob_bias_article.pkl')
X_test_article = pd.read_pickle(root / 'probability/deberta-v3-base_test_prob_bias_article.pkl')

In [23]:
X_train_des = pd.read_pickle(root / 'probability/bert-base-uncased_train_prob_bias_description.pkl')
X_test_des = pd.read_pickle(root / 'probability/bert-base-uncased_test_prob_bias_description.pkl')

In [24]:
def merge_dataset(df, df_1):
    df = df.merge(df_1, on=['website', 'target'])
    df['left'] = df['left_x'] + df['left_y']
    df['right'] = df['right_x'] + df['right_y']
    df['center'] = df['center_x'] + df['center_y']
    return df[['target', 'website'] + classes]

In [25]:
X_train = merge_dataset(X_train_article, X_train_des)
X_test = merge_dataset(X_test_article, X_test_des)

In [26]:
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.7076023391812866
Macro-F1 Score: 0.6824352678056765
Average Recall: 0.6685430608656558
