# EEG Scraper

This notebook queries PubMed for EEG articles that mention time-frequency analysis, supports manually recording the colour scheme each article uses in its time-frequency/topography plots, and then summarises how often each colour scheme is used.

In [None]:
import sys
# !{sys.executable} -m pip install pymed
# !{sys.executable} -m pip install pandas
# !{sys.executable} -m pip install bs4
# !{sys.executable} -m pip install matplotlib
# !{sys.executable} -m pip install squarify
# !{sys.executable} -m pip install plotly
# !{sys.executable} -m pip install psutil
# !{sys.executable} -m pip install hsluv

In [None]:
from pymed import PubMed
import pandas as pd
import datetime

In [None]:
# Create the PubMed client used to run the search.
# Note that the parameters are not required but kindly requested by PubMed Central
# https://www.ncbi.nlm.nih.gov/pmc/tools/developers/
pubmed = PubMed(tool="EEGSearchTool", email="patrick.cooper@monash.edu")

# Plain-text search expression: any of the EEG terms, combined with "time-frequency".
# The two adjacent literals are joined into a single string.
query = (
    "(electroencephalography[MeSH Terms] OR electroencephalography[All Fields] OR eeg[All Fields])"
    " AND time-frequency[All Fields]"
)

# Run the search, asking for at most 10000 matches.
# NOTE(review): the next cell loops over `results` once -- confirm whether it
# can be iterated a second time before reusing it.
results = pubmed.query(query, max_results=10000)
results

In [None]:
# One list per column of the article table; position i in every list
# describes the same article.
title, authors, year, journal, doi = [], [], [], [], []

# Each pair is (destination list, how to read that field from an article).
# The pairs are applied in this order for every article.
_field_readers = (
    (title, lambda entry: entry.title),
    (authors, lambda entry: entry.authors),
    (year, lambda entry: entry.publication_date.year),
    (journal, lambda entry: entry.journal),
    (doi, lambda entry: entry.doi),
)

for article in results:
    for _column, _read in _field_readers:
        _column.append(_read(article))

In [None]:
# Assemble the per-article lists into one table, one row per article.
_column_names = ['authors', 'year', 'title', 'journal', 'doi']
_rows = list(zip(authors, year, title, journal, doi))
df = pd.DataFrame(_rows, columns=_column_names)
df

In [None]:
# store dataframe as .json file
# The file is written relative to the working directory so that it is the
# same 'PMC.json' the loading cell opens, and so the notebook does not depend
# on one particular user's home directory.
# orient='split' stores the rows under the "data" key as lists in column order
# (authors, year, title, journal, doi).
df.to_json('PMC.json', orient='split')

In [None]:
import json
import csv
# Load the saved article table. The rows are read later as data['data'][i],
# indexed as [authors, year, title, journal, doi].
with open('PMC.json', 'r') as file:
    data = json.loads(file.read())

In [None]:
def pushAndSave(colour_list,colour_scheme,filename):
    """Append one colour-scheme label to the list and rewrite the CSV file.

    Args:
        colour_list: list of labels recorded so far; it is modified in place.
        colour_scheme: label to record, or the safe word "STOP" to pause
            recording without touching the list or the file.
        filename: path of the CSV file; it is overwritten with the whole
            list as a single row.

    Returns:
        The number of labels in ``colour_list`` after the append, or the
        string "STOPPED" when ``colour_scheme`` is "STOP".
    """
    # we use "STOP" as a safe word to pause recording
    if colour_scheme == "STOP":
        return "STOPPED"

    # push to end of list
    colour_list.append(colour_scheme)

    # newline='' lets the csv module control line endings itself; without it
    # the writer's "\r\n" terminator is translated again on Windows, leaving
    # "\r\r\n" at the end of each row.
    with open(filename, 'w', newline='') as f:
        csv.writer(f).writerow(colour_list)

    return len(colour_list)

In [None]:
# Index of the article to look at.
count = 0  # change this to current value if running over multiple sessions

# Row layout: [authors, year, title, journal, doi]
_record = data['data'][count]
suffix = _record[4]
print(_record[2], _record[1])
print(suffix)
from IPython.display import IFrame

# Show the article's DOI landing page inline.
# another website could be substituted here if needed...
url = 'https://www.doi.org/' + suffix
IFrame(url, width=800, height=200)

In [None]:
# A trailing * on a label denotes no spectrogram but a topoplot displayed
# (we'll filter these out later)
filename = 'colour_schemes.csv'

# The labels recorded so far are stored as the first row of the CSV file.
with open(filename) as f:
    my_list = list(csv.reader(f))
colour_schemes = my_list[0]
count = len(colour_schemes)

# Label for the article currently under review; edit before running the cell.
# if you want to stop for the session/day, colour_scheme = "STOP" will do the trick
colour_scheme = "greyscale"

# Record the label; count becomes the new number of labels (or "STOPPED").
count = pushAndSave(colour_schemes, colour_scheme, filename)
print(count)


In [None]:
# read in filename
filename = 'colour_schemes.csv'
with open(filename) as f:
    reader = csv.reader(f)
    my_list = list(reader)
colour_schemes = my_list[0]
from collections import Counter

# Recorded labels grouped by the category they are tallied under.
# "NA", "REMOVE" and "NOACCESS" are bookkeeping labels that are subtracted
# from the denominator; the remaining groups are colour-scheme families.
_LABELS_BY_CATEGORY = {
    "NA": ("NA",),
    "REMOVE": ("REMOVE",),
    "NOACCESS": ("NOACCESS",),
    "jet": ("jet", "jet-like"),
    "parula": ("parula",),
    "greyscale": ("greyscale",),
    "cbrewer": (
        "RdBu", "RdYlBu", "RdPu",
        "YlRdBkBu", "YlGrBu", "BuYl",
        "YlPuRd", "RdYlGr", "Cyan-Black",
        "YlRdBlackBl", "BlYlGr",
        "YlRdBlackBu", "YlRdBuPr",
        "Blues", "YlRdBu", "YlOrRd",
        "YlBlack", "YlGrPrRd",
        "YlRdPrBu", "YlRdBlBu",
        "BlRdYlBuPr", "YlRdBlBuPr",
        "RdYlBuBl", "GnBk", "RdBl",
        "PuBu", "GrBk", "RdBuGrBl",
        "GrBuRdYl", "Reds", "RdPuBuGn",
    ),
    "hot": ("hot",),
    "other": ("bone", "cool", "viridis", "plasma"),
}

# Flat label -> category lookup built once from the groups above.
_CATEGORY_BY_LABEL = {}
for _category, _labels in _LABELS_BY_CATEGORY.items():
    for _label in _labels:
        _CATEGORY_BY_LABEL[_label] = _category


def _categorise(label):
    """Return the tally category for one recorded label.

    A label ending in "*" is a topoplot-only article ("TOPOONLY").
    Labels that match no known group return None.
    """
    # endswith() is safe on an empty label, unlike indexing with [-1].
    if label.endswith("*"):
        return "TOPOONLY"
    return _CATEGORY_BY_LABEL.get(label)


def _percent(count, total, ndigits):
    """Return count as a percentage of total, rounded to ndigits places.

    Returns 0.0 when total is 0 so an empty selection does not raise
    ZeroDivisionError.
    """
    if not total:
        return 0.0
    return round((count / total) * 100, ndigits)


# extract trial counts for used data
# Only articles published from 2000 onwards are counted; the year is the
# second field of the matching row in data['data'].
article_count = 0
_tally = Counter()
for (ind, article) in enumerate(colour_schemes):
    if data['data'][ind][1] > 1999:
        article_count += 1
        _tally[_categorise(article)] += 1

# A Counter returns 0 for categories that never occurred.
NA_count       = _tally["NA"]
REMOVE_count   = _tally["REMOVE"]
NOACCESS_count = _tally["NOACCESS"]
TOPOONLY_count = _tally["TOPOONLY"]

jet_count       = _tally["jet"]
parula_count    = _tally["parula"]
cbrewer_count   = _tally["cbrewer"]
hot_count       = _tally["hot"]
greyscale_count = _tally["greyscale"]
other_count     = _tally["other"]

# Labels that matched no group. They stay in the denominator but in no
# category, so the final "Total" percentage printed below drops under 100
# whenever this is non-zero.
unrecognised_count = _tally[None]

valid_count = article_count - NA_count - REMOVE_count - NOACCESS_count - TOPOONLY_count


scheme_totals = {
    "Jet": _percent(jet_count, valid_count, 2),
    "Greyscale": _percent(greyscale_count, valid_count, 2),
    "cbrewer": _percent(cbrewer_count, valid_count, 2),
    "Parula": _percent(parula_count, valid_count, 2),
    "Hot": _percent(hot_count, valid_count, 2),
    "Other": _percent(other_count, valid_count, 2),
}


# '\b%' backspaces over the separator space print() inserts, so the percent
# sign sits directly after the number.
print("Total:\t\t",article_count,
     "\nNA:\t\t",NA_count+TOPOONLY_count,
     "\nREMOVE:\t\t",REMOVE_count,
     "\nNOACCESS:\t",NOACCESS_count,
     "\nRemaining:\t",valid_count,
      "\n---",
     "\nJet:\t\t",_percent(jet_count, valid_count, 3),'\b%',
     "\nGreyscale:\t",_percent(greyscale_count, valid_count, 3),'\b%',
     "\ncbrewer:\t",_percent(cbrewer_count, valid_count, 3),'\b%',
     "\nParula:\t\t",_percent(parula_count, valid_count, 3),'\b%',
     "\nHot:\t\t",_percent(hot_count, valid_count, 3),'\b%',
     "\nOther:\t\t",_percent(other_count, valid_count, 3),'\b%',
     "\nTotal:\t\t",_percent(jet_count + parula_count + hot_count +
                             cbrewer_count + greyscale_count + other_count,
                             valid_count, 3))
    


In [None]:
import pandas as pd
import numpy as np

# Years shown in panel b (inclusive).
first_year, last_year = 2000, 2020

# Labels counted as a rainbow colour map. Both are counted so that panel b
# agrees with the "Jet" total shown in panel a, which also includes "jet-like".
_RAINBOW_LABELS = ("jet", "jet-like")
# Labels that take an article out of the denominator.
_EXCLUDED_LABELS = ("NA", "NOACCESS", "REMOVE")

# Pair every article row with its recorded label. zip() stops at the shorter
# sequence, so articles that have not been labelled yet are skipped instead
# of raising IndexError.
_labelled = list(zip(data['data'], colour_schemes))

# extract year on year use of rainbow plot
jet_props = []
for year in range(first_year, last_year + 1):
    cols = [scheme for (record, scheme) in _labelled if record[1] == year]
    # Drop bookkeeping labels and topoplot-only labels (trailing "*");
    # endswith() is safe on an empty label, unlike indexing with [-1].
    usable = [c for c in cols
              if c not in _EXCLUDED_LABELS and not c.endswith("*")]
    if usable:
        n_rainbow = sum(1 for c in usable if c in _RAINBOW_LABELS)
        jet_props.append(round(n_rainbow / len(usable) * 100, 2))
    else:
        # No usable article this year: record a missing value instead of
        # dividing by zero.
        jet_props.append(np.nan)


# Overall percentages per colour-scheme family, taken from scheme_totals.
_scheme_names = ['Jet', 'Greyscale', 'cbrewer', 'Parula', 'Hot', 'Other']
d = {'Colour Scheme': _scheme_names,
     'Amount': [scheme_totals[name] for name in _scheme_names]}
df = pd.DataFrame(d, columns = ['Colour Scheme', 'Amount'])

# print(df)

from plotly.subplots import make_subplots
import plotly.graph_objects as go

# Copy the names so that relabelling does not alter d.
labels  = list(d['Colour Scheme'])
# rename to more generic labels
labels[labels.index("Jet")]     = "Rainbow"
labels[labels.index("cbrewer")] = "ColorBrewer"
values  = list(d['Amount'])
# Every tile is a root-level tile (empty parent).
parents = [""] * len(values)
# print(labels,values,parents)

# Panel a (left): treemap of overall use. Panel b (right): rainbow use per year.
fig = make_subplots(
    rows=1, cols=2,
    specs=[[{"type": "treemap"},{"type": "scatter"}]])

fig.add_trace(go.Treemap(
    labels = labels,
    parents = parents,
    values  = values,
    textinfo = "label",
    marker_colorscale = "spectral"),
              row=1, col=1)

# Panel letter for the treemap.
fig.add_annotation(
    xref="x domain",yref="y domain",
    x=-1.3,y=1.1,
    text="<b>a</b>",
    showarrow=False)

fig.add_trace(go.Scatter(x=np.arange(first_year, last_year + 1),
                         y=jet_props,
                         mode='lines+markers',
                         name='lines+markers',
                        marker_color='rgba(0, 0, 0, .7)'),
              row=1, col=2)

# Panel letter for the line plot.
fig.add_annotation(
    xref="x domain",yref="y domain",
    x=-0.2,y=1.1,
    text="<b>b</b>",
    showarrow=False)

# Arrow annotation with the text "Other".
fig.add_annotation(
        x=-.3,
        y=0,
        xref="x domain",
        yref="y domain",
        text="Other",
        showarrow=True,
        align="center",
        arrowhead=2,
        arrowsize=1,
        arrowwidth=2,
        arrowcolor="black",
        ax=35,
        ay=35,
        )




# Axis title spelling corrected: "Spectrogram".
fig.update_layout(font=dict(
    family="Helvetica",
    size=18,
    color="Black"),
                  yaxis_title="<b>Rainbow Spectrogram (%)</b>",
                  xaxis_title="<b>Year</b>",
                  plot_bgcolor="white",
                  xaxis=dict(showgrid=False,tickangle=360-45,tickfont=dict(family="Helvetica",size=16),
                               linewidth=2, linecolor='rgba(0, 0, 0, .7)'),
                  yaxis=dict(showgrid=False,tickfont=dict(family="Helvetica",size=16),
                             linewidth=2, linecolor='rgba(0, 0, 0, .7)',range=[0,100]))

fig.show()
import os
# Make sure an "images" folder exists next to the notebook.
if not os.path.exists("images"):
    os.mkdir("images")
print(jet_props)

In [None]:
import pandas as pd
import numpy as np
# Export one record per labelled article published 2000-2020, ordered by
# year, together with the colour map recorded for it.
alldata = {
  "author": [],
  "year": [],
  "title": [],
  "journal": [],
  "doi": [],
  "cmap": []
}

# Pair every article row with its recorded label. zip() stops at the shorter
# sequence, so articles that have not been labelled yet are left out instead
# of raising IndexError.
_labelled = list(zip(data['data'], colour_schemes))

for year in range(2000,2021):
    for (_record, cmap) in _labelled:
        # Row layout: [authors, year, title, journal, doi]
        if _record[1] != year:
            continue
        # Topoplot-only labels (trailing "*") are exported as "NA";
        # endswith() is safe on an empty label, unlike indexing with [-1].
        if cmap.endswith("*"):
            cmap = "NA"
        alldata["author"].append(_record[0])
        alldata["year"].append(year)
        alldata["title"].append(_record[2])
        alldata["journal"].append(_record[3])
        alldata["doi"].append(_record[4])
        alldata["cmap"].append(cmap)

df=pd.DataFrame(alldata)

#create JSON file
json_file = df.to_json(orient='table')

#export JSON file
with open('pubmed_data.json', 'w') as f:
    f.write(json_file)