In [6]:
import sqlite3
from time import time
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import itertools
from datetime import datetime


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.utils import shuffle
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

import parse_crossref

## Add New Articles to DB

Use Crossref to add the articles published over the last couple of weeks to the database.

In [2]:
# Helper object from parse_crossref that is handed to every add call below.
doi_retreiver = parse_crossref.DOIRetreiver()

# Open the SQLite database that holds the journal and article tables.
db_file = 'test.db'
conn = sqlite3.connect(db_file)

# NOTE(review): presumably the start/end dates of the Crossref query window;
# confirm against parse_crossref.add_all_journal_articles.
window_start, window_end = '2018-05-31', '2018-06-15'

# Walk every (journal name, ISSN) pair known to the database, echoing each one
# so progress is visible while the articles are being added.
tracked_journals = parse_crossref.retreive_all_journals(conn)
for journal_name, journal_issn in tracked_journals:
    print(journal_name, journal_issn)
    parse_crossref.add_all_journal_articles(
        conn, journal_issn, window_start, window_end, doi_retreiver
    )

ACS Central Science 2374-7943
ACS Medicinal Chemistry Letters 1948-5875
ACS Nano 1936-0851
Accounts of Chemical Research 0001-4842
Annual Reports in Computational Chemistry 1574-1400
Biochimica et Biophysica Acta (BBA) - Biomembranes 0005-2736
Biophysical Chemistry 0301-4622
Biophysical Journal 0006-3495
Biophysical Reviews 1867-2450
Chemical Physics 0301-0104
Chemical Physics Letters 0009-2614
Chemical Society Reviews 0306-0012
Chemistry and Physics of Lipids 0009-3084
Computational and Structural Biotechnology Journal 2001-0370
Computational and Theoretical Chemistry 2210-271X
Computer Physics Communications 0010-4655
Current Biology 0960-9822
Current Opinion in Cell Biology 0955-0674
Cytoskeleton 1949-3584
FEBS Letters 0014-5793
Journal of Biological Physics 0092-0606
Journal of Biomechanics 0021-9290
Journal of Biomolecular Structure and Dynamics 0739-1102
Journal of Chemical Information and Modeling 1549-9596
Journal of Chemical Sciences 0974-3626
Journal of Chemical Theory and Co

## Loading Model 

We can load the model that we previously built using `joblib`

In [3]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so prefer the standalone `joblib` package and only fall back to the
# vendored copy on old scikit-learn installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
grid_search = joblib.load('model.pkl')

## Retrieving Newest Articles From Database

Here, we use `sqlite3` to retrieve all of the new articles.

In [11]:
# Connect with type detection enabled for both declared column types and
# column-name annotations.
db_file = 'test.db'
db_conn = sqlite3.connect(
    db_file,
    detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES,
)
c = db_conn.cursor()

# Parallel lists: article_titles[k] and article_dois[k] describe the same row.
article_titles = []
article_dois = []

# Select rows created after the cutoff timestamp whose isVoth flag is 0.
results = c.execute(
    'SELECT title, DOI FROM Articles WHERE created_at > ? and isVoth=0',
    ('2018-05-30 00:00:00',),
)

# Rows whose title contains any of these phrases are skipped.
skipped_phrases = ('Spotlights', 'Editorial', 'News at a glance')
for row_title, row_doi in results:
    if any(phrase in row_title for phrase in skipped_phrases):
        continue
    article_titles.append(row_title)
    article_dois.append(row_doi)



## Predict Relevant Articles

Since we already have a trained model, predicting the relevant articles is straightforward. Here, I use the best parameters for the linear SVM as determined by cross-validation.

In [12]:
# Take the best estimator found by the loaded search object and label every
# retrieved title with it.
best_model = grid_search.best_estimator_
predicted = best_model.predict(article_titles)

Let's list the articles that were predicted to be relevant so I can see which ones I should read.

In [18]:
print('#title')

# Keep only the (DOI, title) pairs whose predicted label is 1.
relevant_rows = [
    (article_dois[idx], article_titles[idx])
    for idx, label in enumerate(predicted)
    if label == 1
]

# Number the hits from 1; `num` ends up holding the count of printed
# articles (0 when nothing was flagged).
num = 0
for num, (doi_suffix, headline) in enumerate(relevant_rows, start=1):
    print(str(num), 'http://dx.doi.org/' + doi_suffix, headline)

#title
1 http://dx.doi.org/10.1016/j.bpj.2018.04.022 Phospholipid Chain Interactions with Cholesterol Drive Domain Formation in Lipid Membranes
2 http://dx.doi.org/10.1016/j.bpj.2018.04.042 Lipid Configurations from Molecular Dynamics Simulations
3 http://dx.doi.org/10.1002/jcc.25348 Coarse-grained molecular dynamics simulations of polymerization with forward and backward reactions
4 http://dx.doi.org/10.1016/j.jcp.2018.06.015 An integral equation approach to calculate electrostatic interactions in many-body dielectric systems
5 http://dx.doi.org/10.1371/journal.pone.0197815 Integration of in vitro and in silico perspectives to explain chemical characterization, biological potential and anticancer effects of Hypericum salsugineum: A pharmacologically active source for functional drug formulations
6 http://dx.doi.org/10.1371/journal.pone.0198276 External release of entropy by synchronized movements of local secondary structures drives folding of a small, disulfide-bonded protein
7 http: