In [50]:
import os
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, 
                             confusion_matrix, 
                             ConfusionMatrixDisplay, roc_curve, auc, PrecisionRecallDisplay, RocCurveDisplay, mean_squared_error)
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier

In [2]:
df = pd.read_csv('./../../datasets/oal_doc_dataset_extended.csv', 
                 names=['doi', 'pm_grouptype', 'type', 'abstract', 'title', 'page', 'author_count',
                          'has_license', 'is_referenced_by_count',
                          'references_count', 'has_funder', 'country_count', 'inst_count', 'has_oa_url'],
                 dtype={'doi': str,
                        'pm_grouptype': str,
                        'type': str,
                        'abstract': str,
                        'title': str,
                        'page': str,
                        'author_count': int,
                        'has_license': int,
                        'is_referenced_by_count': int,
                        'references_count': int,
                        'has_funder': int,
                        'country_count': int,
                        'inst_count': int,
                        'has_oa_url': int
                 }, sep=',', quotechar='"', header=0)

In [3]:
def page_counter(page_str):
    page_int = 1
    if '-' in str(page_str):
        try:
            page_str = re.sub(r'(\.e)[\d]*', '', page_str)
            page_str = re.sub(r'(\.)[\d]*', '', page_str)
            page_str = re.sub(r'(?<=\d)(e)(\d)*', '', page_str)
            page_str = re.sub(r'[^\d-]', '', page_str)
            page_int = int(abs(eval(page_str)))
            page_int += 1
        except:
            pass
        
    return page_int

In [4]:
df['page_count'] = df.page.apply(page_counter)
df['page_count'] = df['page_count'].astype(int)

In [5]:
df['title_word_length'] = df['title'].str.split().str.len()

In [6]:
def has_abstract(abstract_str):
    if pd.isna(abstract_str):
        return 0
    else:
        return 1

In [7]:
df['has_abstract'] = df.abstract.apply(has_abstract)
df['has_abstract'] = df['has_abstract'].astype(int)

In [8]:
df['title_word_length'] = df['title_word_length'].fillna(0)

In [9]:
df = df[df['type'] != 'not assigned']
df['type'] = df['type'].replace(to_replace='research_discourse', value='1')
df['type'] = df['type'].replace(to_replace='editorial_discourse', value='0')
df['type'] = df['type'].astype(int)
df = df.reset_index(drop=True)

In [10]:
X = df[['author_count', 'has_license', 'is_referenced_by_count',
        'references_count', 'has_funder', 'page_count', 'has_abstract', 'title_word_length', 'inst_count', 'has_oa_url']].values
y = df[['type']].values.ravel()

In [34]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25, random_state=42)

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [36]:
classifier = LogisticRegression(max_iter=2000, 
                                class_weight='balanced', 
                                random_state=42, 
                                n_jobs=-1)
classifier.fit(X_train, y_train)
 
y_pred = classifier.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.3085    0.9163    0.4616     70846
 research_discourse     0.9920    0.8353    0.9069    883468

           accuracy                         0.8413    954314
          macro avg     0.6503    0.8758    0.6843    954314
       weighted avg     0.9413    0.8413    0.8739    954314



In [37]:
y_pred = classifier.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.3080    0.9182    0.4613     70846
 research_discourse     0.9922    0.8346    0.9066    883468

           accuracy                         0.8408    954314
          macro avg     0.6501    0.8764    0.6840    954314
       weighted avg     0.9414    0.8408    0.8736    954314



In [38]:
clf = RandomForestClassifier(criterion='gini', 
                             max_depth=None, 
                             max_features='sqrt', 
                             class_weight='balanced', 
                             n_estimators=200, 
                             n_jobs=-1, 
                             random_state=42)
clf.fit(X_train, y_train)
 
y_pred = clf.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1, 
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5814    0.7017    0.6359     70846
 research_discourse     0.9757    0.9595    0.9675    883468

           accuracy                         0.9404    954314
          macro avg     0.7786    0.8306    0.8017    954314
       weighted avg     0.9464    0.9404    0.9429    954314



In [39]:
y_pred = clf.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5811    0.7057    0.6374     70846
 research_discourse     0.9760    0.9592    0.9675    883468

           accuracy                         0.9404    954314
          macro avg     0.7786    0.8324    0.8024    954314
       weighted avg     0.9467    0.9404    0.9430    954314



In [40]:
knn = KNeighborsClassifier(n_neighbors=50, 
                           weights='uniform', 
                           algorithm='auto',
                           leaf_size=30, 
                           p=1, 
                           n_jobs=-1)
knn.fit(X_train, y_train)

y_pred = knn.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.6515    0.6037    0.6267     70846
 research_discourse     0.9684    0.9741    0.9712    883468

           accuracy                         0.9466    954314
          macro avg     0.8100    0.7889    0.7990    954314
       weighted avg     0.9449    0.9466    0.9457    954314



In [48]:
y_pred = knn.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.6509    0.6036    0.6263     70846
 research_discourse     0.9684    0.9740    0.9712    883468

           accuracy                         0.9465    954314
          macro avg     0.8096    0.7888    0.7988    954314
       weighted avg     0.9448    0.9465    0.9456    954314



In [42]:
abc = AdaBoostClassifier(n_estimators=50,
                         algorithm='SAMME',
                         learning_rate=1,
                         random_state=42)

abc.fit(X_train, y_train)

y_pred = abc.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1, 
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5705    0.4143    0.4800     70846
 research_discourse     0.9540    0.9750    0.9644    883468

           accuracy                         0.9334    954314
          macro avg     0.7623    0.6946    0.7222    954314
       weighted avg     0.9256    0.9334    0.9284    954314



In [46]:
y_pred = abc.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.5726    0.4143    0.4807     70846
 research_discourse     0.9540    0.9752    0.9645    883468

           accuracy                         0.9336    954314
          macro avg     0.7633    0.6947    0.7226    954314
       weighted avg     0.9257    0.9336    0.9286    954314



In [44]:
dummy_clf = DummyClassifier(strategy='uniform',
                            random_state=42)

dummy_clf.fit(X_train, y_train)

y_pred = dummy_clf.predict(X_val)

print(classification_report(y_val, 
                            y_pred, 
                            zero_division=1, 
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.0746    0.5023    0.1298     70846
 research_discourse     0.9261    0.5000    0.6494    883468

           accuracy                         0.5001    954314
          macro avg     0.5003    0.5011    0.3896    954314
       weighted avg     0.8629    0.5001    0.6108    954314



In [47]:
y_pred = dummy_clf.predict(X_test)

print(classification_report(y_test, 
                            y_pred, 
                            zero_division=1,
                            digits=4,
                            target_names=['editorial_discourse', 'research_discourse']))

                     precision    recall  f1-score   support

editorial_discourse     0.0740    0.4984    0.1288     70846
 research_discourse     0.9255    0.4997    0.6490    883468

           accuracy                         0.4996    954314
          macro avg     0.4997    0.4990    0.3889    954314
       weighted avg     0.8623    0.4996    0.6103    954314

