In [1]:
from scrape_scholar import *
import sqlite3 
import pandas as pd
import numpy as np
import networkx as nx
import altair as alt
import nx_altair as nxa
import numpy as np
import scipy.sparse as sp
import matplotlib.pyplot as plt
import pylab
from flask import Flask, render_template, request, redirect

## Função que coleta e armazena os dados

In [2]:
def scrape_and_store(author_name):

    conn = sqlite3.connect("db.db")
    cur = conn.cursor()
    
    # Coletando os dados
    data = scrape(author_name)

    # Inserindo nas tables paper e author
    for paper in range(len(data)):
        conn.execute("INSERT OR IGNORE INTO paper (paper) VALUES (?)", [data[paper].get("title")])
        for author in data[paper].get("authors"):
            conn.execute("INSERT OR IGNORE INTO author (author) VALUES (?)", [author])
            
    conn.commit()
    
    aux_paper = pd.read_sql("SELECT * from paper", conn)
    aux_paper.columns = ['paper_id', 'paper']
    
    aux_author = pd.read_sql("SELECT * from author", conn)
    aux_author.columns = ['author_id', 'author']
           
    author_paper = pd.DataFrame(columns = ["paper", "author"])
    for paper in range(len(data)):
        for author in data[paper].get("authors"):
            author_paper = author_paper.append({"paper" : data[paper].get("title"), "author" : author}, ignore_index = True)  
     
    try:    
        author_paper_old = pd.read_sql("SELECT author, paper FROM author_paper", conn)
        author_paper = pd.merge(author_paper_old, author_paper, how = "outer")

    except:
        pass 
        
    author_paper = pd.merge(pd.merge(author_paper, aux_author), aux_paper)
    author_paper.to_sql("author_paper", con = conn, if_exists = "replace")

    conn.close()

## Função que inicializa o database

In [3]:
def init_db():
    conn = sqlite3.connect("db.db")
    cur = conn.cursor()
 
    cur.executescript('''
    DROP TABLE IF EXISTS author;
    DROP TABLE IF EXISTS paper;
    DROP TABLE IF EXISTS author_paper;

    CREATE TABLE author(
    id INTEGER PRIMARY KEY NOT NULL,
    author TEXT NOT NULL UNIQUE);
    CREATE TABLE paper(
    id INTEGER PRIMARY KEY NOT NULL,
    paper VARCHAT NOT NULL UNIQUE);
    CREATE TABLE author_paper(
    id INTEGER PRIMARY KEY NOT NULL,
    author TEXT,
    paper TEXT,
    author_id INTEGER,
    paper_id INTEGER);
    ''')

    conn.commit()
    conn.close()

## Definindo as rotas do Flask

In [4]:
init_db()
app = Flask(__name__, template_folder = 'template')

@app.route('/')
def home():
    return redirect('/scrape', 302)

@app.route('/scrape') 
def scrape_(): # scrape já é o nome da função
    author_name = request.args.get("author_name")
    if author_name is None:
        return render_template("index.html", author_name = "")
    scrape_and_store(author_name)
    return render_template("index.html", author_name = author_name)

@app.route('/data/authors') # https://github.com/lemoncyb/flasked-altair
def data_authors():
    conn = sqlite3.connect("db.db")
    cur = conn.cursor()
    inpt = pd.read_sql("SELECT * FROM author_paper", conn)
    conn.close()

    inpt = inpt[["paper", "author"]]
    inpt["values"] = 1

    lst = list(inpt["author"])
    names = sorted(list(set(lst)))

    inpt = inpt.pivot_table(index = "paper", columns = "author").fillna(0)
    inpt.index.name = None

    cols = inpt.columns
    X = sp.csr_matrix(inpt.astype(int).values)
    Xc = X.T * X  
    Xc.setdiag(0)  

    inpt = pd.DataFrame(Xc.todense(), index = cols, columns = cols)
    graph = nx.from_numpy_matrix(inpt.values.astype(int))

    graph = nx.relabel_nodes(graph, dict(enumerate(names)))
    pos = nx.spring_layout(graph)

    cont = 0
    for n in graph.nodes():
        graph.nodes[n]["author"] = names[cont]
        cont += 1

    chart = nxa.draw_networkx(G = graph, pos = pos, node_tooltip = ["author"]).interactive()
    
    return chart.to_json()   
    
@app.route('/data/papers')
def data_papers():
    conn = sqlite3.connect("db.db")
    cur = conn.cursor()
    inpt = pd.read_sql("SELECT * FROM author_paper", conn)
    conn.close()

    inpt2 = inpt.copy()
    inpt = inpt[["paper", "author"]]
    inpt["values"] = 1
    
    lst = list(inpt["paper"])
    names = sorted(list(set(lst)))

    inpt = inpt.pivot_table(index = "author", columns = "paper").fillna(0)
    inpt.index.name = None

    cols = inpt.columns
    X = sp.csr_matrix(inpt.astype(int).values)
    Xc = X.T * X  
    Xc.setdiag(0)  

    inpt = pd.DataFrame(Xc.todense(), index = cols, columns = cols)
    graph = nx.from_numpy_matrix(inpt.values.astype(int))

    graph = nx.relabel_nodes(graph, dict(enumerate(names)))
    pos = nx.spring_layout(graph)

    unique_papers = list(set(inpt2["paper"].values))
    unique_papers.sort()

    authors = []
    for papers in unique_papers:
        authors.append(str(list(inpt2[inpt2['paper'] == papers]['author'].values)).replace("[", "").replace("]", "").replace("'", ""))

    cont = 0
    for n in graph.nodes():
        graph.nodes[n]["paper"] = names[cont]
        graph.nodes[n]["authors"] = authors[cont]
        cont += 1

    chart = nxa.draw_networkx(G = graph, pos = pos, node_tooltip = ["paper", "authors"]).interactive()
    
    return chart.to_json()   

## Iniciando o app

In [None]:
if __name__ == "__main__":
    app.run()

 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [28/Jul/2019 22:11:24] "GET / HTTP/1.1" 302 -
127.0.0.1 - - [28/Jul/2019 22:11:24] "GET /scrape HTTP/1.1" 200 -
[2019-07-28 22:11:24,822] ERROR in app: Exception on /data/authors [GET]
Traceback (most recent call last):
  File "C:\Users\LuizFernando\Anaconda3\lib\site-packages\flask\app.py", line 1982, in wsgi_app
    response = self.full_dispatch_request()
  File "C:\Users\LuizFernando\Anaconda3\lib\site-packages\flask\app.py", line 1614, in full_dispatch_request
    rv = self.handle_user_exception(e)
  File "C:\Users\LuizFernando\Anaconda3\lib\site-packages\flask\app.py", line 1517, in handle_user_exception
    reraise(exc_type, exc_value, tb)
  File "C:\Users\LuizFernando\Anaconda3\lib\site-packages\flask\_compat.py", line 33, in reraise
    raise value
  File "C:\Users\LuizFernando\Anaconda3\lib\site-packages\flask\app.py", line 1612, in full_dispatch_request
    rv = self.dispatch_request()
  File "C:\Users\