In [1]:
from models import *
from utility import *
import logging
import os
import yaml
from pathlib import Path
import plotly
from adapter import OstiMongoAdapter
import pandas as pd
from typing import Union, List, Dict
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

  from tqdm.autonotebook import tqdm


In [2]:
# configuration stuff
# The config file is expected at ../files/config.yaml relative to the current working directory.
config_file = Path(os.path.abspath('')).parent / "files" / "config.yaml"
# Read inside a context manager so the file handle is closed as soon as parsing finishes.
with config_file.open('r') as config_handle:
    config = yaml.load(config_handle, Loader=yaml.SafeLoader)

# prepare
# Build the Mongo adapter and the connection models from the parsed configuration.
oma = OstiMongoAdapter.from_config(config)
elink = ConnectionModel.parse_obj(config["osti"]["elink"])
explorer = ConnectionModel.parse_obj(config["osti"]["explorer"])
elsevier = ConnectionModel.parse_obj(config["elsevier"])
osti = OSTIModel(elink=elink, explorer=explorer, elsevier=elsevier)

# Open both stores up front; the cells below query them directly.
oma.materials_store.connect()
oma.doi_store.connect()

In [30]:
def mongo_to_panda(mongo_store,
                   criteria:Union[dict, None] = None,
                   properties:Union[Dict, List, None]=None,
                   skip:int=0, limit:int=0, no_id=True):
    """Run a query against ``mongo_store`` and return the matching documents as a DataFrame.

    Args:
        mongo_store: object exposing ``query(criteria=, properties=, skip=, limit=)`` that
            yields dict-like documents.
        criteria: query filter forwarded to the store; ``None`` (the default) is sent as ``{}``.
        properties: projection forwarded to the store unchanged.
        skip: number of documents to skip, forwarded unchanged.
        limit: maximum number of documents, forwarded unchanged.
        no_id: when true, the ``_id`` column is removed from the result if it is present.

    Returns:
        pandas.DataFrame with one row per returned document (empty if nothing matched).
    """
    # A fresh dict per call avoids sharing one mutable default between invocations.
    effective_criteria = {} if criteria is None else criteria
    documents = list(mongo_store.query(criteria=effective_criteria, properties=properties,
                                       skip=skip, limit=limit))
    df = pd.DataFrame(documents)
    # An empty result or a projection that excludes "_id" has no such column to remove.
    if no_id and '_id' in df.columns:
        df = df.drop(columns=['_id'])
    return df

In [44]:
def make_doi_status():
    """Build a donut chart with one slice per distinct ``status`` value in the DOI store.

    Returns:
        plotly Figure holding a single pie trace (hole=0.5) with a centred "Status" annotation.
    """
    df = mongo_to_panda(oma.doi_store)
    # value_counts() carries each label with its count, so any number of distinct
    # statuses works (the chart used to assume exactly two).
    counts = df.status.value_counts()
    fig=go.Figure(data=[go.Pie(labels=counts.index.tolist(), values=counts.tolist(), hole=.5, name="status")])
    fig.update_layout(annotations=[dict(text="Status", font_size=20, showarrow=False)])
    return fig
def make_doi_valid():
    """Build a donut chart with one slice per distinct ``valid`` value in the DOI store.

    Returns:
        plotly Figure holding a single pie trace (hole=0.5) with a centred "Valid" annotation.
    """
    df = mongo_to_panda(oma.doi_store)
    # unique() lists values in order of appearance while value_counts() sorts by frequency;
    # taking labels and values from the same Series keeps each slice correctly labelled.
    counts = df.valid.value_counts()
    fig=go.Figure(data=[go.Pie(labels=counts.index.tolist(), values=counts.tolist(), hole=.5, name="valid")])
    fig.update_layout(annotations=[dict(text="Valid", font_size=20, showarrow=False)])
    return fig
def make_doi_last_updated():
    """Build a donut chart of DOI records grouped by the calendar day of ``last_updated``.

    Each ``last_updated`` value is reduced to a ``year/month/day`` label (no zero padding)
    and the records per label are counted.

    Returns:
        plotly Figure holding a single pie trace (hole=0.5) with a centred "Last Updated"
        annotation.
    """
    df = mongo_to_panda(oma.doi_store)
    days = pd.Series([f"{t.year}/{t.month}/{t.day}" for t in df.last_updated]) # only get the days
    # Labels and values both come from value_counts() so they cannot drift out of order.
    counts = days.value_counts()
    fig=go.Figure(data=[go.Pie(labels=counts.index.tolist(), values=counts.tolist(), hole=.5, name="last_updated")])
    fig.update_layout(annotations=[dict(text="Last Updated", font_size=20, showarrow=False)])
    return fig
def make_doi_bibtex():
    """Build a donut chart splitting DOI records into those with and without a bibtex entry.

    A record counts as "Does not have Bibtex" when its ``bibtex`` value is missing
    (``None``, or NaN when the document lacked the field); everything else is "Has Bibtex".

    Returns:
        plotly Figure holding a single pie trace (hole=0.5) with a centred "bibtex" annotation.
    """
    df = mongo_to_panda(oma.doi_store)
    # notna() treats both None and NaN as missing, unlike an `== None` comparison.
    bibtex_bool = df.bibtex.notna().map({True: "Has Bibtex", False: "Does not have Bibtex"})
    # Labels and values both come from value_counts() so they stay aligned.
    counts = bibtex_bool.value_counts()
    fig=go.Figure(data=[go.Pie(labels=counts.index.tolist(), values=counts.tolist(), hole=.5, name="bibtex")])
    fig.update_layout(annotations=[dict(text="bibtex", font_size=20, showarrow=False)])
    return fig
def make_doi_citation_created():
    """Build a donut chart of DOI citation progress across the materials store.

    Slices, in order:
        * "DOI Citation Not Created": materials count minus DOI-store count
        * "DOI Citation Pending":     DOI records whose status is "PENDING"
        * "DOI Citation Completed":   DOI-store count minus the pending ones

    Returns:
        plotly Figure holding a single pie trace (hole=0.5) with a centred "DOI Citation"
        annotation.
    """
    labels = ["DOI Citation Not Created", "DOI Citation Pending", "DOI Citation Completed"]
    # Count the DOI store once so both slices derived from it use the same number.
    doi_total = oma.doi_store.count()
    # Tally matches lazily instead of building a list just to measure it.
    pending_len = sum(1 for _ in oma.doi_store.query(criteria={"status":"PENDING"}))
    values = [oma.materials_store.count() - doi_total,
              pending_len,
              doi_total - pending_len
             ]
    fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.5)])
    fig.update_layout(annotations=[dict(text="DOI Citation", font_size=20, showarrow=False)])
    return fig

In [45]:
# Build the status donut chart from the DOI store and render it inline.
fig = make_doi_status()
fig.show()

In [15]:
# DOI records whose bibtex field is set.
cursor = list(oma.doi_store.query({"bibtex":{"$ne":None}}))
# errors="ignore" skips only the "column is absent" case (e.g. an empty result);
# the previous bare `except: pass` silenced every other failure as well.
df = pd.DataFrame(cursor).drop(columns=["_id"], errors="ignore")
df.head()

len(df)

199

## Draw time-dependent graphs

In [8]:
# Report how many distinct values each timestamp field takes in the DOI store.
for timestamp_field in ("last_updated", "created_at", "last_validated_on", "elsevier_updated_on"):
    print(timestamp_field, len(oma.doi_store.distinct(timestamp_field)))

last_updated 20
created_at 8
last_validated_on 27
elsevier_updated_on 1138


In [20]:
from pathlib import Path
import json
import os
# Default location of the log file: ../files/log.txt relative to the current working directory.
default_log_path = Path(os.getcwd()).parent / "files" / "log.txt"
def read_data(file_path:Path=default_log_path) -> List[LogContent]:
    """Load a line-delimited JSON log file as a list of ``LogContent`` records.

    Every non-blank line is decoded with ``json.loads`` and passed to
    ``LogContent.parse_obj``.

    Args:
        file_path: log file to read; defaults to ``default_log_path``.

    Returns:
        One ``LogContent`` per non-blank line, in file order, or an empty list when the
        file does not exist.
    """
    # `exists` is a method: without the call parentheses the check was always truthy,
    # so a missing file raised instead of returning [].
    if not file_path.exists():
        return []
    # The context manager closes the file even if a line fails to parse.
    with file_path.open('r') as log_file:
        return [LogContent.parse_obj(json.loads(line)) for line in log_file if line.strip()]
    
# Turn the log entries into plottable series. X holds each entry's date.
# material_data_base_count, doi_store_count and bibtex_count are copied as logged;
# last_updated, elsevier_updated_on and created_at are accumulated into running totals.
log_contents:List[LogContent] = read_data()
X = []
last_updated_count = []
created_at_count=[]
elsevier_updated_on_count = []
material_data_base_count = []
doi_store_count = []
bibtex_count = []

last_updated_total = 0
elsevier_updated_on_total = 0
created_at_total = 0
for entry in log_contents:
    X.append(entry.date)
    material_data_base_count.append(entry.material_data_base_count)
    doi_store_count.append(entry.doi_store_count)
    bibtex_count.append(entry.bibtex_count)

    last_updated_total += entry.last_updated_count
    last_updated_count.append(last_updated_total)

    elsevier_updated_on_total += entry.elsevier_updated_on_count
    elsevier_updated_on_count.append(elsevier_updated_on_total)

    created_at_total += entry.created_at_count
    created_at_count.append(created_at_total)

# (series, legend name) pairs, listed in the order the traces are drawn.
trace_specs = [
    (last_updated_count, 'last_updated'),
    (elsevier_updated_on_count, 'elsevier_updated_on_count'),
    (created_at_count, 'created_at_count'),
    (material_data_base_count, 'material_data_base_count'),
    (doi_store_count, 'doi_store_count'),
    (bibtex_count, 'bibtex_count'),
]

# One lines+markers trace per series, all sharing the log dates as the x axis.
fig = go.Figure()
for series, legend_name in trace_specs:
    fig.add_trace(go.Scatter(x=X, y=series, mode='lines+markers', name=legend_name))

layout_font = dict(family="Franklin Gothic", size=14, color="#0d0d0d")
fig.update_layout(title="MPCite Status",
                  xaxis_title="Time",
                  yaxis_title="# Submission",
                  font=layout_font)

fig.show()

In [9]:
# strftime patterns that truncate a datetime to the requested resolution.
# Every pattern is zero-padded, so sorting the formatted strings sorts chronologically.
date_formatter = {
    "year": "%Y",
    "month": "%Y-%m",
    "day": "%Y-%m-%d",
    # Minutes are pinned to 00 so that all timestamps within one hour share a bucket
    # (this pattern used to be identical to the "minute" one).
    "hour": "%Y-%m-%d %H:00",
    "minute": "%Y-%m-%d %H:%M",
    "second": "%Y-%m-%d %H:%M:%S",
    # Key spelling kept because callers pass it verbatim; note %f is microseconds.
    "milisecond": "%Y-%m-%d %H:%M:%S.%f",
}
def group_datetime_by(store, col_name, until:str):
    """
        Count documents per time bucket and return the running (cumulative) total.

        @param store: object with a groupby(col_name) method yielding (key, docs) pairs,
            where key[col_name] is a datetime-like value (or None) and docs is a sized
            collection of the documents in that group
        @param col_name: name of the datetime field to group on
        @param until: resolution to truncate to, case-insensitive; one of the keys of
            date_formatter ("year", "month", "day", "hour", "minute", "second", "milisecond")
        @return:
            DataFrame with columns "date" (formatted bucket, ascending), "type"
            (col_name repeated on every row) and "CC" (cumulative count up to and
            including that bucket)
    """
    until = until.lower()
    assert until in date_formatter, f"until must be one of {list(date_formatter)}"
    pattern = date_formatter[until]

    counts = dict() # formatted date -> number of documents in that bucket
    for key, docs in store.groupby(col_name):
        stamp = key[col_name]
        if stamp is None:
            # Documents without this timestamp cannot be placed on the time axis.
            continue
        bucket = stamp.strftime(pattern)
        counts[bucket] = counts.get(bucket, 0) + len(docs)

    # Walk the buckets chronologically, accumulating the running total.
    dates = sorted(counts)
    cumulative = []
    running_total = 0
    for bucket in dates:
        running_total += counts[bucket]
        cumulative.append(running_total)

    return pd.DataFrame(data={"date": dates, "type": col_name, "CC": cumulative})


In [10]:
def graph_data_simple(until="day"):
    """Plot the cumulative counts of the four DOI-store timestamp fields as line charts.

    Args:
        until: time resolution forwarded to ``group_datetime_by`` for every field.

    The figure is shown directly; nothing is returned.
    """
    # Fetch order and plot order differ, so the frames are keyed by field name.
    fetch_order = ("last_updated", "created_at", "last_validated_on", "elsevier_updated_on")
    frames = {field: group_datetime_by(oma.doi_store, field, until) for field in fetch_order}

    plot_order = ("last_updated", "last_validated_on", "created_at", "elsevier_updated_on")
    combined = pd.concat([frames[field] for field in plot_order])

    px.line(combined, x="date", y="CC", color="type", line_group="type").show()

graph_data_simple(until="hour")

In [11]:
until="hour"
# Query last_updated once and take both the x axis and its y series from the same frame
# (it used to be grouped twice, doubling the store round-trips).
last_updated_hourly = group_datetime_by(oma.doi_store, "last_updated", until)
X = last_updated_hourly.date.to_list()
y_last_updated = last_updated_hourly.CC.to_list()
y_created_at = group_datetime_by(oma.doi_store, "created_at", until).CC.to_list()
y_last_validated_on = group_datetime_by(oma.doi_store, "last_validated_on", until).CC.to_list()
y_elsevier_updated_on = group_datetime_by(oma.doi_store, "elsevier_updated_on", until).CC.to_list()

import plotly.graph_objects as go


# Create traces
# fig = go.Figure()
# fig.add_trace(go.Scatter(x=X, y=y_last_updated,
#                     mode='lines+markers',
#                     name='last_updated'))
# fig.add_trace(go.Scatter(x=X, y=y_created_at,
#                     mode='lines+markers',
#                     name='created_at'))
# fig.add_trace(go.Scatter(x=X, y=y_last_validated_on,
#                     mode='lines+markers',
#                     name='last_validated_on'))
# fig.add_trace(go.Scatter(x=X, y=y_elsevier_updated_on,
#                     mode='lines+markers',
#                     name='elsevier_updated_on'))


# fig.show()

In [12]:
until="day"

def _cumulative_column(col_name, resolution):
    """Cumulative counts for one timestamp field, with "CC" renamed to "<col_name> CC"."""
    return (
        group_datetime_by(oma.doi_store, col_name, resolution)
        .rename(columns={"CC": f"{col_name} CC"})
        .drop(["type"], axis=1)
    )

last_updated = _cumulative_column("last_updated", until)
created_at = _cumulative_column("created_at", until)
last_validated_on = _cumulative_column("last_validated_on", until)
elsevier_updated_on = _cumulative_column("elsevier_updated_on", until)

# Outer-join on date so a day that appears in only some series is kept; inner joins
# dropped every date missing from any one series.
overall = last_updated
for other in (created_at, last_validated_on, elsevier_updated_on):
    overall = overall.merge(other, on="date", how="outer")

# The values are cumulative, so a gap carries the previous total forward; only dates
# before a series' first event become 0. The date strings are zero-padded, so the
# string sort is chronological.
overall = overall.sort_values("date").set_index("date").ffill().fillna(0).reset_index()


# One lines+markers trace per timestamp field, read from its "<field> CC" column of `overall`.
fig = go.Figure()
for field in ("last_updated", "created_at", "last_validated_on", "elsevier_updated_on"):
    fig.add_trace(go.Scatter(x=overall.date, y=overall[f"{field} CC"],
                             mode='lines+markers',
                             name=field))

layout_font = dict(family="Franklin Gothic", size=14, color="#0d0d0d")
fig.update_layout(title="MPCite Status",
                  xaxis_title="Time",
                  yaxis_title="# Submission",
                  font=layout_font)

fig.show()
