In [5]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from pathlib import Path
import os
import numpy as np
import sys
from sklearn.metrics import accuracy_score, f1_score, recall_score
root = Path(os.getcwd())
encoder = {'low': 0, 'mixed': 1, 'high': 2}
classes = list(encoder.keys())

In [6]:
def change_prob_to_zero(X_train, X_test):
    df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'fact', 'train.csv'))
    df_2 = pd.read_pickle(os.path.join(os.path.dirname(os.getcwd()), 'Data', 'article.pkl'))
    df_2['website'] = df_2['source_url'].str.strip('https://').str.strip('www.')

    to_zero_train = df[~df['website'].isin(df_2['website'])][['website', 'target']]
    mask_train = X_train['website'].isin(to_zero_train['website']) & X_train['target'].isin(to_zero_train['target'])

    df = pd.read_csv(os.path.join(os.path.dirname(os.getcwd()), 'fact', 'test.csv'))
    df_2 = pd.read_pickle(os.path.join(os.path.dirname(os.getcwd()), 'Data', 'article.pkl'))
    df_2['website'] = df_2['source_url'].str.strip('https://').str.strip('www.')

    to_zero_test = df[~df['website'].isin(df_2['website'])][['website', 'target']]
    mask_test = X_test['website'].isin(to_zero_test['website']) & X_test['target'].isin(to_zero_test['target'])
    X_train.loc[mask_train, classes] = 0
    X_test.loc[mask_test, classes] = 0
    return X_train, X_test

def log_reg(X_train, X_test):
    clf = LogisticRegression()
    clf.fit(X_train[classes], X_train['target'])
    y_pred = clf.predict(X_test[classes])
    y_test = X_test['target']
    accuracy = accuracy_score(y_test, y_pred)
    macro_f1 = f1_score(y_test, y_pred, average='macro')
    avg_recall = recall_score(y_test, y_pred, average='macro')
    print("Accuracy:", accuracy)
    print("Macro-F1 Score:", macro_f1)
    print("Average Recall:", avg_recall)

### Bert - article

In [7]:
X_train = pd.read_pickle(root / 'probability/bert-base-uncased_train_prob_fact_article.pkl')
X_test = pd.read_pickle(root / 'probability/bert-base-uncased_test_prob_fact_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.622093023255814
Macro-F1 Score: 0.3527239511972336
Average Recall: 0.3765073060008512


### Bert description

In [8]:
X_train = pd.read_pickle(root / 'probability/bert-base-uncased_train_prob_fact_description.pkl')
X_test = pd.read_pickle(root / 'probability/bert-base-uncased_test_prob_fact_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.5930232558139535
Macro-F1 Score: 0.3463523061046281
Average Recall: 0.37978279030910606


### RoBerta - article

In [9]:
X_train = pd.read_pickle(root / 'probability/berta-base_train_prob_fact_article.pkl')
X_test = pd.read_pickle(root / 'probability/berta-base_test_prob_fact_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6337209302325582
Macro-F1 Score: 0.3355311355311355
Average Recall: 0.3765309500165508


### RoBerta description

In [10]:
X_train = pd.read_pickle(root / 'probability/berta-base_train_prob_fact_description.pkl')
X_test = pd.read_pickle(root / 'probability/berta-base_test_prob_fact_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.5930232558139535
Macro-F1 Score: 0.4081011657775144
Average Recall: 0.40746406245498806


### Distil Bert - article

In [11]:
X_train = pd.read_pickle(root / 'probability/distilbert-base-uncased_train_prob_fact_article.pkl')
X_test = pd.read_pickle(root / 'probability/distilbert-base-uncased_test_prob_fact_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.627906976744186
Macro-F1 Score: 0.3826945977174118
Average Recall: 0.39653378729843475


### Distil Bert description

In [12]:
X_train = pd.read_pickle(root / 'probability/distilbert-base-uncased_train_prob_fact_description.pkl')
X_test = pd.read_pickle(root / 'probability/distilbert-base-uncased_test_prob_fact_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6104651162790697
Macro-F1 Score: 0.2527075812274368
Average Recall: 0.3333333333333333


### DeBerta v3 - article

In [13]:
X_train_article = pd.read_pickle(root / 'probability/deberta-v3-base_train_prob_fact_article.pkl')
X_test_article = pd.read_pickle(root / 'probability/deberta-v3-base_test_prob_fact_article.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6104651162790697
Macro-F1 Score: 0.2527075812274368
Average Recall: 0.3333333333333333


### DeBerta v3 description

In [14]:
X_train_article = pd.read_pickle(root / 'probability/deberta-v3-base_train_prob_fact_description.pkl')
X_test_article = pd.read_pickle(root / 'probability/deberta-v3-base_test_prob_fact_description.pkl')
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6104651162790697
Macro-F1 Score: 0.2527075812274368
Average Recall: 0.3333333333333333


### Best model merge

In [15]:
X_train_article = pd.read_pickle(root / 'probability/bert-base-uncased_train_prob_fact_article.pkl')
X_test_article = pd.read_pickle(root / 'probability/bert-base-uncased_test_prob_fact_article.pkl')

In [16]:
X_train_des = pd.read_pickle(root / 'probability/deberta-v3-base_train_prob_fact_description.pkl')
X_test_des = pd.read_pickle(root / 'probability/deberta-v3-base_test_prob_fact_description.pkl')

In [17]:
def merge_dataset(df, df_1):
    df = df.merge(df_1, on=['website', 'target'])
    df['low'] = df['low_x'] + df['low_y']
    df['mixed'] = df['mixed_x'] + df['mixed_y']
    df['high'] = df['high_x'] + df['high_y']
    df['low'] /=2
    df['mixed'] /=2
    df['high'] /=2
    return df[['target', 'website'] + classes]

In [18]:
X_train = merge_dataset(X_train_article, X_train_des)
X_test = merge_dataset(X_test_article, X_test_des)

In [19]:
X_train, X_test = change_prob_to_zero(X_train, X_test)
log_reg(X_train, X_test)

Accuracy: 0.6257309941520468
Macro-F1 Score: 0.364957264957265
Average Recall: 0.38512949039264827
