In [2]:
import os
from urllib.parse import quote

import matplotlib.dates as mdates
import matplotlib.image as mpimg
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
import pandas as pd
import seaborn as sns
from matplotlib.lines import Line2D
from sqlalchemy import create_engine

In [3]:
# Global figure styling: seaborn's white grid theme first, then a uniform
# 9 pt Arial font for every text element. The seaborn call must stay first
# because it resets font-related rcParams itself.
sns.set_style('whitegrid')
plt.rcParams.update({
    'font.family': 'Arial',
    'font.size': 9,
    'axes.titlesize': 9,
    'axes.labelsize': 9,
    'xtick.labelsize': 9,
    'ytick.labelsize': 9,
    'legend.fontsize': 9,
})

In [4]:
# Read the PostgreSQL connection settings from the environment so that no
# credentials are stored in the notebook. A missing variable raises KeyError.
host = os.environ['KB_HOST']
database = os.environ['KB_DATABASE']
user = os.environ['KB_USER']
pw = os.environ['KB_PASSWORD']
port = os.environ['KB_PORT']

# Percent-encode user name and password before embedding them in the URL:
# characters such as '@', ':', '/' or '%' would otherwise be parsed as URL
# delimiters and yield a wrong host or a failed login. `safe=''` makes sure
# '/' is encoded as well.
engine = create_engine(
    f"postgresql://{quote(user, safe='')}:{quote(pw, safe='')}@{host}:{port}/{database}"
)

In [6]:
# Number of distinct classified DOIs per `is_research` flag among items in
# scp_b_202407.items (presumably a Scopus snapshot - confirm) whose item_type
# contains 'Article' or 'Review' and whose pubyear lies in 2012-2021.
# DOIs are matched case-insensitively (LOWER on both sides of the join).
scp_cla_n = pd.read_sql(
    sql="""
                        SELECT COUNT(DISTINCT(dt.doi)) AS n, is_research
                        FROM  kb_project_openbib.classification_article_reviews_october_2024 AS dt
                        JOIN scp_b_202407.items scp
                            ON LOWER(dt.doi) = LOWER(scp.doi)
                        WHERE ('Article' = ANY(item_type) OR 'Review' = ANY(item_type)) AND (scp.pubyear BETWEEN 2012 AND 2021)
                        GROUP BY is_research
                        """,
    con=engine,
)

In [8]:
# Share of DOIs with is_research == False among all matched articles and reviews.
scp_cla_n.loc[scp_cla_n['is_research'].eq(False), 'n'] / scp_cla_n['n'].sum()

0    0.01161
Name: n, dtype: float64

In [18]:
# Same count as scp_cla_n, restricted to items whose item_type contains
# 'Article' (pubyear 2012-2021, case-insensitive DOI join).
scp_cla_a_n = pd.read_sql(
    sql="""
                          SELECT COUNT(DISTINCT(dt.doi)) AS n, is_research
                          FROM  kb_project_openbib.classification_article_reviews_october_2024 AS dt
                          JOIN scp_b_202407.items scp
                              ON LOWER(dt.doi) = LOWER(scp.doi)
                          WHERE 'Article' = ANY(item_type) AND scp.pubyear BETWEEN 2012 AND 2021
                          GROUP BY is_research
                          """,
    con=engine,
)

In [19]:
# Non-research 'Article' DOIs as a share of the combined Article + Review
# total: the denominator is scp_cla_n, not scp_cla_a_n.
scp_cla_a_n.loc[scp_cla_a_n['is_research'].eq(False), 'n'] / scp_cla_n['n'].sum()

0    0.010532
Name: n, dtype: float64

In [16]:
# Same count as scp_cla_n, restricted to items whose item_type contains
# 'Review' (pubyear 2012-2021, case-insensitive DOI join).
scp_cla_r_n = pd.read_sql(
    sql="""
                          SELECT COUNT(DISTINCT(dt.doi)) AS n, is_research
                          FROM  kb_project_openbib.classification_article_reviews_october_2024 AS dt
                          JOIN scp_b_202407.items scp
                              ON LOWER(dt.doi) = LOWER(scp.doi)
                          WHERE 'Review' = ANY(item_type) AND scp.pubyear BETWEEN 2012 AND 2021
                          GROUP BY is_research
                          """,
    con=engine,
)

In [17]:
# Non-research 'Review' DOIs as a share of the combined Article + Review
# total: the denominator is scp_cla_n, not scp_cla_r_n.
scp_cla_r_n.loc[scp_cla_r_n['is_research'].eq(False), 'n'] / scp_cla_n['n'].sum()

0    0.00108
Name: n, dtype: float64

In [20]:
# Absolute number of matched DOIs with is_research == False.
scp_cla_n.loc[scp_cla_n['is_research'].eq(False), 'n']

0    231420
Name: n, dtype: int64

In [21]:
# Total number of matched DOIs across both is_research groups.
scp_cla_n['n'].sum()

19933320

In [24]:
# Number of distinct DOIs per `is_research` flag among items in
# fiz_openalex_bdb_20240831_openbib.items with pubyear 2012-2021
# (no item_type filter).
# NOTE(review): unlike the scp_* queries, this join compares DOIs without
# LOWER() - confirm both tables store DOIs in the same case.
df_doi_2012_2021 = pd.read_sql(
    sql="""
                               SELECT COUNT(DISTINCT(oal.doi)) AS n, is_research
                               FROM kb_project_openbib.classification_article_reviews_october_2024 AS dt
                               JOIN fiz_openalex_bdb_20240831_openbib.items AS oal
                                   ON oal.doi = dt.doi
                               WHERE oal.pubyear BETWEEN 2012 AND 2021
                               GROUP BY is_research
                               """,
    con=engine,
)

In [27]:
# Share of DOIs with is_research == False among all matched DOIs (2012-2021).
df_doi_2012_2021.loc[df_doi_2012_2021['is_research'].eq(False), 'n'] / df_doi_2012_2021['n'].sum()

0    0.096226
Name: n, dtype: float64

In [26]:
# Same count as df_doi_2012_2021, restricted to items whose item_type
# contains 'article' (lower-case label in this table).
df_doi_2012_2021_a = pd.read_sql(
    sql="""
                                 SELECT COUNT(DISTINCT(oal.doi)) AS n, is_research
                                 FROM kb_project_openbib.classification_article_reviews_october_2024 AS dt
                                 JOIN fiz_openalex_bdb_20240831_openbib.items AS oal
                                     ON oal.doi = dt.doi
                                 WHERE 'article' = ANY(item_type) AND oal.pubyear BETWEEN 2012 AND 2021
                                 GROUP BY is_research
                                 """,
    con=engine,
)

In [31]:
# Non-research 'article' DOIs as a share of the overall total: the
# denominator is df_doi_2012_2021, not df_doi_2012_2021_a.
df_doi_2012_2021_a.loc[df_doi_2012_2021_a['is_research'].eq(False), 'n'] / df_doi_2012_2021['n'].sum()

0    0.095279
Name: n, dtype: float64

In [29]:
# Same count as df_doi_2012_2021, restricted to items whose item_type
# contains 'review' (lower-case label in this table).
df_doi_2012_2021_r = pd.read_sql(
    sql="""
                                 SELECT COUNT(DISTINCT(oal.doi)) AS n, is_research
                                 FROM kb_project_openbib.classification_article_reviews_october_2024 AS dt
                                 JOIN fiz_openalex_bdb_20240831_openbib.items AS oal
                                     ON oal.doi = dt.doi
                                 WHERE 'review' = ANY(item_type) AND oal.pubyear BETWEEN 2012 AND 2021
                                 GROUP BY is_research
                                 """,
    con=engine,
)

In [32]:
# Non-research 'review' DOIs as a share of the overall total: the
# denominator is df_doi_2012_2021, not df_doi_2012_2021_r.
df_doi_2012_2021_r.loc[df_doi_2012_2021_r['is_research'].eq(False), 'n'] / df_doi_2012_2021['n'].sum()

0    0.000938
Name: n, dtype: float64

In [19]:
# Load the evaluated classifier sample (semicolon-separated CSV, path
# relative to the notebook's working directory).
df = pd.read_csv(
    filepath_or_buffer='./../datasets/classifier_sample_evaluated.csv',
    sep=';',
)

In [20]:
# Relative frequency of each value in the 'validation' column.
df.loc[:, 'validation'].value_counts(normalize=True)

validation
True       0.736
unknown    0.222
False      0.042
Name: proportion, dtype: float64

In [22]:
# Print the frequency of every distinct 'comment' value; the row limit is
# lifted only inside this context so the full list is shown untruncated.
with pd.option_context('display.max_rows', None):
    print(
        df.loc[:, 'comment'].value_counts()
    )

comment
not found                            207
?                                    139
abstract                              80
paratext                              62
paywall                               62
book review                           48
not loading                           37
case report                           37
news                                  27
conference                            22
no fulltext found                     22
article                               21
no document provided                  18
comment                               15
editorial                             14
note                                  12
no document found                     11
article?                              11
case study                             8
letter to the editor                   8
paratext?                              7
abstract?                              6
interview                              5
meeting abstract                       4
note?   