In [89]:
import pandas as pd
import glob
import json
import github_api
import search_for_url
import numpy as np
from urllib import parse

In [49]:
# Gather every entry matched by ../git_papes/*; glob returns matches in arbitrary order.
papes = glob.glob('../git_papes/*')

In [50]:
# Sort the paths so positional lookups such as papes[5] are reproducible between runs.
papes = sorted(papes)

In [51]:
len(papes)

176

In [52]:
# Load a single input file (position 5 of the sorted listing) to inspect its JSON layout.
# NOTE(review): the index looks arbitrary -- any file would presumably do; confirm.
with open(papes[5]) as infile:
    dat = json.load(infile)

In [122]:
def standardize_url(u):
    """Reduce a URL to ``netloc + path`` with one trailing slash removed.

    The scheme, params, query string and fragment reported by
    ``urllib.parse.urlparse`` are discarded, so ``http://`` and ``https://``
    variants of the same address map to the same string.

    Parameters
    ----------
    u : str
        URL to normalize.

    Returns
    -------
    str
        ``netloc + path`` without a final ``/``. An input that has neither a
        netloc nor a path (for example ``''`` or ``'https://'``) yields ``''``.
    """
    pp = parse.urlparse(u)
    u = pp.netloc + pp.path
    # endswith() is safe on an empty string, whereas indexing u[-1] raised
    # IndexError when both netloc and path were empty.
    if u.endswith('/'):
        u = u[:-1]
    return u

In [123]:
# Collect one (pmcid, standardized_url) row for every URL containing 'github'
# that is found in the text of each paper.
pairs = []
for p in papes:
    with open(p) as infile:
        dat = json.load(infile)
        for d in dat:
            # Only the first document of each entry is read; the assumption
            # that there is exactly one document per entry is not enforced.
            doc = d['documents'][0]
            pmcid = doc['id']
            # Join all passages so the URL search sees the paper as one string.
            full_text = ' '.join(passage['text'] for passage in doc['passages'])
            urls = search_for_url.search_for_url(full_text)
            for u in map(standardize_url, urls):
                if 'github' not in u:
                    continue
                pairs.append((pmcid, u))
pairs = np.array(pairs)

In [124]:
standardize_url('https://github.com/hello/')

'github.com/hello'

In [125]:
# Split the (pmcid, url) rows into two named columns of a DataFrame.
pair_rows = np.array(pairs)
urls_in_papes = pd.DataFrame({'pmcid': pair_rows[:, 0], 'url': pair_rows[:, 1]})

In [126]:
urls_in_papes

Unnamed: 0,pmcid,url
0,4910214,github.com/lh3/wgsim
1,5071354,github.com/JAvRZ/andi-dataprocessing
2,5013563,github.com/AharoniLab/MatchWeiz
3,4958917,broadinstitute.github.io/picard
4,5107194,github.com/ijmarshall/picotron
5,5127779,github.com/awilfert/PSAP-pipeline
6,4928833,gist.github.com/darmitage/3179407
7,5140850,github.com/confunguido/PokemonDataAnalysis
8,4966778,uhkniazi.github.io/dismiss
9,5045566,github.com/alexandrovteam/spatial-corals


In [127]:
# Write the (pmcid, url) table to disk; index=False leaves the row index out of the CSV.
urls_in_papes.to_csv('../data/papes_and_github_urls.csv', index=False)

In [128]:
len(np.unique(urls_in_papes.pmcid))

16510

## Diagnose some cases

In [79]:

# Index the first document of every entry by its id so individual papers can
# be looked up directly; a repeated id overwrites the earlier document.
pmcid_to_doc = {}
for p in papes:
    with open(p) as infile:
        dat = json.load(infile)
        for d in dat:
            # The one-document-per-entry assumption is not enforced here.
            doc = d['documents'][0]
            pmcid = doc['id']
            pmcid_to_doc[pmcid] = doc

In [80]:
pmcid_to_doc['6418218']

{'passages': [{'text': '‘Genome skimming’ with the MinION hand-held sequencer identifies CITES-listed shark species in India’s exports market',
   'offset': 0,
   'relations': [],
   'infons': {'name_2': 'surname:Cantu;given-names:Vito Adrian',
    'name_3': 'surname:Fellows;given-names:Sam R.',
    'name_0': 'surname:Johri;given-names:Shaili',
    'name_1': 'surname:Solanki;given-names:Jitesh',
    'name_6': 'surname:Vyas;given-names:Asit',
    'article-id_publisher-id': '40940',
    'license': 'Open Access This article is licensed under a Creative Commons Attribution 4.0 International License, which permits use, sharing, adaptation, distribution and reproduction in any medium or format, as long as you give appropriate credit to the original author(s) and the source, provide a link to the Creative Commons license, and indicate if changes were made. The images or other third party material in this article are included in the article’s Creative Commons license, unless indicated otherwis

In [81]:
[t['text'] for t in pmcid_to_doc['6414705']['passages'] if 'github' in t['text']]

['Images were processed and analyzed with ImageJ Fiji v1.50c. Demographs and line plots (Fig.\xa01 and Fig.\xa0S6) were constructed by manually measuring the fluorescence intensity profiles in Fiji and processing the data in R (version 3.3.1 [http://www.r-project.org]), with the cell profiles script ( [http://github.com/ta-cameron/cell-profiles]) and ggplot2 package (version 2.1.0; Hadley Wickham, Department of Statistics, Rice University [https://ggplot2.tidyverse.org/]). Intensity profiles either were oriented with the fluorescent cell pole toward the left (E. coli and R. sphaeroides), or random sorting was applied (M. gryphiswaldense and R. rubrum). In the case of C. crescentus, the stalk was used as the old pole reference. To confidently and accurately identify stalks, FM4-64 staining (which in our hands did not always stain all cells) and DAPI staining (which we found to unspecifically stain stalks) were used, in addition to DIC microscopy.']

In [83]:
[t['text'] for t in pmcid_to_doc['6294587']['passages'] if 'github' in t['text']]

['The fMRI imaging analysis steps are outlined in Fig. 1 The first 5 time frames (15\u200as) were removed to allow the MR signal to achieve T1 equilibrium. Time frames were slice-timing corrected, realigned to the mean echo-planar image using SPM12 (http://www.fil.ion.ucl.ac.uk/spm/), co-registered to the subject T1 space and then normalized to the standard MNI-152 2mm-template using Advanced Normalization Tools software (http://stnava.github.io/ANTs/). Signals from subject white matter and cerebrospinal fluid (3-mm cubes centered at Montreal Neurological Institute coordinates (MNI) (26, –12, 35) and (19, –33, 18) as these locations are away from the grey matter), as well as six head motion parameters were regressed out from each dataset. Global signal regression was not performed as it is controversial in resting state fMRI data preprocessing. Correction for global signal fluctuations with regression has a significant impact on resting state functional connectivity results and can fur

## Check whether URLs mentioned only once are the authors' own repositories

In [129]:
from collections import Counter

In [130]:
# Count how many rows of urls_in_papes carry each standardized URL.
url_counts = Counter(urls_in_papes.url)

In [131]:
url_counts.most_common(50)

[('broadinstitute.github.io/picard', 1106),
 ('github.com/najoshi/sickle', 275),
 ('github.com/jstjohn/SeqPrep', 163),
 ('github.com', 141),
 ('github.com/lh3/seqtk', 139),
 ('transdecoder.github.io', 131),
 ('cole-trapnell-lab.github.io/cufflinks', 116),
 ('github.com/alexdobin/STAR', 106),
 ('stnava.github.io/ANTs', 96),
 ('github.com/taoliu/MACS', 93),
 ('github.com/lh3/wgsim', 86),
 ('trinotate.github.io', 83),
 ('github.com/vsbuffalo/scythe', 65),
 ('github.com/broadinstitute/picard', 55),
 ('networkx.github.io', 54),
 ('github.com/korseby/container-mtbls520', 49),
 ('github.com/bulik/ldsc', 47),
 ('github.com/tseemann/snippy', 40),
 ('trinityrnaseq.github.io', 38),
 ('github.com/ekg/freebayes', 36),
 ('github.com/jdstorey/qvalue', 35),
 ('github.com/trinityrnaseq/trinityrnaseq/wiki', 34),
 ('github.com/tseemann/abricate', 34),
 ('picrust.github.io/picrust', 33),
 ('github.com/arq5x/bedtools2', 33),
 ('github.com/genome/bam-readcount', 32),
 ('gephi.github.io', 31),
 ('github.com/

In [136]:
dict(url_counts)

{'github.com/lh3/wgsim': 86,
 'github.com/JAvRZ/andi-dataprocessing': 1,
 'github.com/AharoniLab/MatchWeiz': 1,
 'broadinstitute.github.io/picard': 1106,
 'github.com/ijmarshall/picotron': 1,
 'github.com/awilfert/PSAP-pipeline': 1,
 'gist.github.com/darmitage/3179407': 1,
 'github.com/confunguido/PokemonDataAnalysis': 1,
 'uhkniazi.github.io/dismiss': 1,
 'github.com/alexandrovteam/spatial-corals': 1,
 'uubram.github.io/RTCR/along': 1,
 'github.com/bulik/ldsc': 47,
 'github.com/tanghaibao/treecut/blob/master/scripts/eisen_to_newick.py': 1,
 'github.com/aboyle/F-seq': 3,
 'www.github.com/rhenley/Pythion': 1,
 'github.com/Caleydo/caleydo_clue': 1,
 'github.com/Caleydo/pathfinder': 1,
 'github.com/GauravPandeyLab': 1,
 'github.com/tdsmith/eleven': 1,
 'github.com/nmcnulty/COPRO-Seq': 1,
 'github.com/rhenley/Pyth-Ion': 4,
 'github.com/rgcgithub/clamms': 2,
 'github.com/ZWang-Lab/LSKAT': 1,
 'github.com/markus-nilsson/md-dmri': 2,
 'github.com/divara01/PSB2017_ReproducibilityOfBNs': 1,
 'g

In [138]:
# URLs that occur in exactly one row of the table.
unique_urls = [url for url, n_rows in url_counts.items() if n_rows == 1]

In [140]:
# Boolean mask (one entry per row) marking rows whose URL occurs only once.
unique_url_filt = urls_in_papes.url.isin(unique_urls).values

In [143]:
#urls_in_papes.loc[unique_url_filt, :]

In [188]:
# Sorted, de-duplicated ids of the papers that mention at least one singleton URL.
singleton_pmcids = urls_in_papes.pmcid[unique_url_filt].values
ids_to_check = np.sort(np.unique(singleton_pmcids))

In [189]:
ids_to_check

array(['2712337', '2795760', '2820491', ..., '6484778', '6485367',
       '6485654'], dtype=object)

**+ Positive examples — the URL appears to be the authors' own code or data:**

All our code is hosted on Github (http://github.com/intermine/) and contributions are welcome.

The raw data and complete source code to reproduce the results of the analysis is freely available at https://github.com/akastrin/kastrin2014large.

```
{'text': '2Software available at https://github.com/vellamike/optimizer, online documentation at http://optimizer.readthedocs.org/',
   'offset': 67424,
   'relations': [],
   'infons': {'type': 'footnote', 'section_type': 'CONCL'},
   'sentences': [],
   'annotations': []}
   
{'text': '– To handle the RRF syntax we have developed the UMLS2RDF project.1 UMLS2RDF is the set of scripts that connect to the UMLS MySQL release and transforms its content into RDF triples.',
   'offset': 17722,
   'relations': [],
   'infons': {'type': 'paragraph', 'section_type': 'INTRO'},
   'sentences': [],
   'annotations': []}   
{'text': '           https://github.com/ncbo/umls2rdf         ',
   'offset': 20138,
   'relations': [],
   'infons': {'type': 'footnote', 'section_type': 'CONCL'},
   'sentences': [],
   'annotations': []},
   
{'passages': [{'text': 'Open-Source Syringe Pump Library',
   'offset': 0,
   'relations': [],
   'infons': {'name_2': 'surname:Anzalone;given-names:Gerald C.',
    'license': 'This is an open-access article distributed under the terms of the Creative Commons Attribution License, which permits unrestricted use, distribution, and reproduction in any medium, provided the original author and source are properly credited.',
    'name_0': 'surname:Wijnen;given-names:Bas',
    'name_1': 'surname:Hunt;given-names:Emily J.',
    'alt-title': 'Open-Source Syringe Pump Library',
    'article-id_publisher-id': 'PONE-D-14-11249',
    'name_3': 'surname:Pearce;given-names:Joshua M.',
    'type': 'front',
    'elocation-id': 'e107216',
    'article-id_doi': '10.1371/journal.pone.0107216',
    'section_type': 'TITLE',
    'article-id_pmid': '25229451',
    'volume': '9',
    'article-id_pmc': '4167991',
    'year': '2014',
    'title': 'Data Availability',
    'name_4': 'surname:Gilestro;given-names:Giorgio F.',
    'issue': '9',
    'notes': 'The authors confirm that all data underlying the findings are fully available without restriction. All files needed to construct the device are held at https://github.com/mtu-most/linear-actuator All data for the paper presented within it.'},
```

To foster reproducibility and fast development of future work based upon these results, we have published functional graph analysis tools under an open source, GPLv3 license, available here: https://github.com/ssgrn/GraphInvariantsNeocortex.

Software and user manual are available at http://bioinformaticstools.mayo.edu/research/patterncnv/, and R package at https://github.com/topsoil/patternCNV/.


Availability and implementation: The software for PatternCNV is implemented using Perl and R, and can be used in Mac or Linux environments. Software and user manual are available at http://bioinformaticstools.mayo.edu/research/patterncnv/, and R package at https://github.com/topsoil/patternCNV/.


To quantify the number of c-Fos positive and β-gal positive cells, the red (c-Fos) and green (β-gal) color channels in each image were first filtered with a stringent threshold (mean +2 standard deviations of background pixel intensity level), and each color channel was then converted to a binary, black/white (BW) image using ImageBWconvertGUI- a custom-made program running in the 2013a Matlab Computing Environment with the Image Processing Toolbox (The MathWorks, Natick, MA; program available on Github: https://github.com/neuropil/ImageBWconvert/).

In addition to quantification of the number of c-Fos and β-gal positive cell bodies, we also quantified, across different experimental conditions, the number and percentage of pixels within each nTS polygon with a pixel intensity value that exceeded 2 standard deviations for each stain, which includes both histological-positive cell bodies as well as associated cellular morphology (e.g., dendrites and axons) using ROIImageAnalysis - another custom-made MATLAB program (available on Github: https://github.com/neuropil/ROIImageAnalysis/).

The complete R scripts to reproduce the analysis can be downloaded from http://sgibb.github.io/Culicoides/.

Project home page:ftp://ftp.ebi.ac.uk/pub/databases/chembl/text-mining, and https://github.com/chembl/chembl_literature_classifier

Proteome clustering results and matrix, scripts, and sequences alignments are available from GitHub (https://github.com/ocisse/Pneumocystis_comparative, last accessed July 27, 2014).

All the figures for this article were produced using matplotlib, and figure scripts are available from https://github.com/rougier/ten-rules.

New scripts written for this protocol, an example data set, and any future updates are available at https://github.com/listonlab/.

A copy of the macro code is available as open source freeware; https://github.com/graemeball/ij_scripts/blob/master/Macros/Sum_Masked_Signal.ijm.

The code is also available online at http://github.com/uygarsumbul/rgc.

Source code repository URL: http://github.com/CovertLab/WholeCellSimDB

The raw count data, script, script output, addition sample metadata, and saved Cytoscape session are available (https://github.com/brwnj/interaction_network).

The R code used for all of the analysis has been made available on an open source basis (https://github.com/SEEG-Oxford/ebola_zoonotic).

The data and source code used to generate this table are provided through http://jbloom.github.io/phyloExpCM/example_2014Analysis_lactamase.html (last accessed July 28, 2014).

Code developed for this project, as well as issue trackers and full documentation for deploying the software are all available on github at https://github.com/apfejes/epigenetics-software.

Source code can be freely obtained from https://github.com/bps10/color/tree/JOSA.

The R package can be downloaded from our website (http://genemed.uchicago.edu/~pgeeleher/pRRophetic) or GitHub (https://github.com/paulgeeleher/pRRophetic).

Phages genes were analyzed using Phamerator , an open-source program (GNU general public license) designed to compare phage genes and genomes. For this study, Phamerator was adapted and stored in a GitHub repository ( http://github.com/byuphamerator/phamerator-dev) separate from the original version.

A reference implementation of SILP2 is provided at https://github.com/jim-bo/silp2.

The code used for generating most of the reported results, including the implementation of the ALN50 metric, is available at https://github.com/jim-bo/scafathon.

ARYANA with complete source code can be obtained from http://github.com/aryana-aligner



**− Negative examples — the URL appears to be a third-party tool the authors used or cited:**

To process huge numbers of queries in parallel in the background, we use the Resque library (https://github.com/resque/resque).

Networks were constructed using the online application Population Graphs v2 (http://dyerlab.github.io/popgraph/. Accessed 2014 June 16), and the analyses were performed with the software Genetic Studio.

The phylogenetic analyses were performed using the software package phyloExpCM (phylogenetic analyses with experimental codon models, https://github.com/jbloom/phyloExpCM, last accessed July 28, 2014), which primarily serves as an interface to run HYPHY.

The PyJade module (https://github.com/SyrusAkbary/pyjade) is used to render Jade templates into HTML to generate the interface.

```
{'text': 'MGFLI. count_genes_per_transcript.pl. https://github.com/MGFLI/RNAseq/tree/master',
   'offset': 25770,
   'relations': [],
   'infons': {'type': 'ref', 'section_type': 'REF'},
   'sentences': [],
   'annotations': []}
```

In [220]:
[t['text'] for t in pmcid_to_doc[ids_to_check[521]]['passages'] if 'github' in t['text']]

['ARYANA with complete source code can be obtained from http://github.com/aryana-aligner',
 'The experiments were performed on a platform with 48 AMD Opteron Processor 6174 CPUs each having 12 cores with clock speed of 2.2 GHz. The hg19 human genome assembly was used as the reference for all test cases. We used dwgsirn (https://github.com/nh13/DWGSIM/wiki) to simulate data sets similar to real reads produced by Illumina NGS platforms.']

In [201]:
ids_to_check[505]

'4159173'

In [219]:
pmcid_to_doc[ids_to_check[520]]

{'passages': [{'text': 'ILP-based maximum likelihood genome scaffolding',
   'offset': 0,
   'relations': [],
   'infons': {'name_2': 'surname:Măndoiu;given-names:Ion',
    'name_3': 'surname:Zelikovsky;given-names:Alex',
    'name_0': 'surname:Lindsay;given-names:James',
    'name_1': 'surname:Salooti;given-names:Hamed',
    'license': 'This is an Open Access article distributed under the terms of the Creative Commons Attribution License (http://creativecommons.org/licenses/by/4.0), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work is properly cited. The Creative Commons Public Domain Dedication waiver (http://creativecommons.org/publicdomain/zero/1.0/) applies to the data made available in this article, unless otherwise stated.',
    'article-id_publisher-id': '1471-2105-15-S9-S9',
    'issue': 'Suppl 9',
    'article-id_doi': '10.1186/1471-2105-15-S9-S9',
    'section_type': 'TITLE',
    'article-id_pmid': '25253180',
    'lpage