In [1]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, 
                             confusion_matrix, 
                             ConfusionMatrixDisplay, roc_curve, auc, PrecisionRecallDisplay, RocCurveDisplay, mean_squared_error)
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
df = pd.read_csv('./../../datasets/oal_doc_dataset_extended.csv', 
                 names=['doi', 'pm_grouptype', 'type', 'abstract', 'title', 'page', 'author_count',
                          'has_license', 'is_referenced_by_count',
                          'references_count', 'has_funder', 'country_count', 'inst_count', 'has_oa_url'],
                 dtype={'doi': str,
                        'pm_grouptype': str,
                        'type': str,
                        'abstract': str,
                        'title': str,
                        'page': str,
                        'author_count': int,
                        'has_license': int,
                        'is_referenced_by_count': int,
                        'references_count': int,
                        'has_funder': int,
                        'country_count': int,
                        'inst_count': int,
                        'has_oa_url': int
                 }, sep=',', quotechar='"', header=0)

In [3]:
def page_counter(page_str):
    page_int = 1
    if '-' in str(page_str):
        try:
            page_str = re.sub(r'(\.e)[\d]*', '', page_str)
            page_str = re.sub(r'(\.)[\d]*', '', page_str)
            page_str = re.sub(r'(?<=\d)(e)(\d)*', '', page_str)
            page_str = re.sub(r'[^\d-]', '', page_str)
            page_int = int(abs(eval(page_str)))
            page_int += 1
        except:
            pass
        
    return page_int

In [4]:
df['page_count'] = df.page.apply(page_counter)
df['page_count'] = df['page_count'].astype(int)

In [5]:
df['title_word_length'] = df['title'].str.split().str.len()

In [6]:
def has_abstract(abstract_str):
    if pd.isna(abstract_str):
        return 0
    else:
        return 1

In [7]:
df['title_word_length'] = df['title_word_length'].fillna(0)

In [8]:
df['has_abstract'] = df.abstract.apply(has_abstract)
df['has_abstract'] = df['has_abstract'].astype(int)

In [9]:
df = df[df['type'] != 'not assigned']
df['type'] = df['type'].replace(to_replace='research_discourse', value='1')
df['type'] = df['type'].replace(to_replace='editorial_discourse', value='0')
df['type'] = df['type'].astype(int)
df = df.reset_index(drop=True)

In [10]:
df_publisher = pd.read_csv('./../../datasets/cr_publisher.csv', sep=',')

df_with_publisher = df.merge(df_publisher, on=['doi'])
df_pub_n = df_with_publisher.groupby(['publisher'])['doi'].count().reset_index().sort_values(by=['doi'], ascending=False)
df_pub_n.columns = ['publisher', 'n']
df_pub_n = df_pub_n[df_pub_n.n > 5000]
df = df_with_publisher.merge(df_pub_n, on=['publisher'])

In [11]:
X = df[['author_count', 'has_license', 'is_referenced_by_count',
        'references_count', 'has_funder', 'page_count', 'has_abstract', 'title_word_length', 'inst_count', 'has_oa_url']].values
y = df[['type']].values.ravel()

In [12]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [13]:
classifier = LogisticRegression(max_iter=2000, 
                                class_weight='balanced', 
                                random_state=42, 
                                n_jobs=-1)
classifier.fit(X_train, y_train)
 
y_pred = classifier.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.3219    0.9215    0.4771     67883
 research_discourse     0.9926    0.8447    0.9127    848479

           accuracy                         0.8504    916362
          macro avg     0.6572    0.8831    0.6949    916362
       weighted avg     0.9429    0.8504    0.8804    916362



In [14]:
y_pred = classifier.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.3209    0.9193    0.4757     67884
 research_discourse     0.9924    0.8443    0.9124    848478

           accuracy                         0.8499    916362
          macro avg     0.6566    0.8818    0.6941    916362
       weighted avg     0.9427    0.8499    0.8801    916362



In [15]:
clf = RandomForestClassifier(criterion='gini', 
                             max_depth=None, 
                             max_features='sqrt', 
                             class_weight='balanced', 
                             n_estimators=200, 
                             n_jobs=-1, 
                             random_state=42)
clf.fit(X_train, y_train)
 
y_pred = clf.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1, 
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5995    0.7059    0.6483     67883
 research_discourse     0.9761    0.9623    0.9692    848479

           accuracy                         0.9433    916362
          macro avg     0.7878    0.8341    0.8087    916362
       weighted avg     0.9482    0.9433    0.9454    916362



In [16]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5967    0.7062    0.6469     67884
 research_discourse     0.9761    0.9618    0.9689    848478

           accuracy                         0.9429    916362
          macro avg     0.7864    0.8340    0.8079    916362
       weighted avg     0.9480    0.9429    0.9451    916362



In [17]:
knn = KNeighborsClassifier(n_neighbors=50, 
                           weights='uniform', 
                           algorithm='auto',
                           leaf_size=30, 
                           p=1, 
                           n_jobs=-1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.6580    0.6244    0.6408     67883
 research_discourse     0.9701    0.9740    0.9721    848479

           accuracy                         0.9481    916362
          macro avg     0.8140    0.7992    0.8064    916362
       weighted avg     0.9470    0.9481    0.9475    916362



In [18]:
y_pred = knn.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.6546    0.6228    0.6383     67884
 research_discourse     0.9699    0.9737    0.9718    848478

           accuracy                         0.9477    916362
          macro avg     0.8123    0.7983    0.8051    916362
       weighted avg     0.9466    0.9477    0.9471    916362



In [19]:
abc = AdaBoostClassifier(n_estimators=50,
                         algorithm='SAMME',
                         learning_rate=1,
                         random_state=42)

abc.fit(X_train, y_train)

y_pred = abc.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1, 
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5902    0.4082    0.4826     67883
 research_discourse     0.9538    0.9773    0.9654    848479

           accuracy                         0.9352    916362
          macro avg     0.7720    0.6928    0.7240    916362
       weighted avg     0.9269    0.9352    0.9297    916362



In [20]:
y_pred = abc.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5847    0.4065    0.4796     67884
 research_discourse     0.9536    0.9769    0.9651    848478

           accuracy                         0.9346    916362
          macro avg     0.7692    0.6917    0.7224    916362
       weighted avg     0.9263    0.9346    0.9292    916362



In [21]:
dummy_clf = DummyClassifier(strategy='uniform',
                            random_state=42)

dummy_clf.fit(X_train, y_train)

y_pred = dummy_clf.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1, 
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.0744    0.5021    0.1295     67883
 research_discourse     0.9262    0.4999    0.6493    848479

           accuracy                         0.5000    916362
          macro avg     0.5003    0.5010    0.3894    916362
       weighted avg     0.8631    0.5000    0.6108    916362



In [22]:
y_pred = dummy_clf.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.0740    0.4998    0.1289     67884
 research_discourse     0.9258    0.4997    0.6491    848478

           accuracy                         0.4997    916362
          macro avg     0.4999    0.4997    0.3890    916362
       weighted avg     0.8627    0.4997    0.6105    916362

