In [1]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction import stop_words
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, f1_score, precision_score, recall_score

from imblearn.ensemble import EasyEnsembleClassifier

import nltk
from nltk.corpus import wordnet

from scipy.sparse import hstack

from urlextract import URLExtract

import re

import sys

import numpy as np

import json

import pickle

sys.path.append('..')
from src.features.build_features import syns, sep_urls, check_paren, repo_label
from src.data.make_dataset import return_passages, test_suitability



In [2]:
# Shared URL detector; the feature cells below call extract.has_urls(...) on passage text.
extract = URLExtract()

In [3]:
# Labelled passages used to train the classifier; `data_statement` is the target
# column and `text` the passage text (see the preview below).
df = pd.read_csv('/data/riddleta/data_sharing_reuse/external/combined_labels_incomplete.csv')
df.head()

Unnamed: 0,data_statement,doi,pmcid,section,text,Journal Title,Year,Volume,Issue,Page,PMCID,PMID,cv_run0,cv_run1,cv_run2,cv_run3,cv_run4,cv_sum
0,0,10.1007/s11481-008-9113-7,2581635,DISCUSS,In the Golgi-impregnation experiment we found ...,J Neuroimmune Pharmacol,2008,3,4.0,241,PMC2581635,18594991,0,0,0,0,0,0
1,0,10.1016/j.biopsych.2008.07.009,2586327,METHODS,The individuals with at least 3 usable voxels ...,Biol Psychiatry,2008,64,10.0,856,PMC2586327,18707679,0,0,0,0,0,0
2,0,10.1016/j.brainres.2008.07.008,2612637,METHODS,Although our procedure was intended to target ...,Brain Res,2008,1230,,202,PMC2612637,18662678,0,0,0,0,0,0
3,0,10.1371/journal.pone.0004156,2612746,METHODS,"low status face pictures, and included genotyp...",PLoS One,2009,4,1.0,e4156,PMC2612746,19142220,0,0,0,0,0,0
4,0,10.1016/j.neuron.2008.07.022,2614916,METHODS,"For each word, participants made a recognition...",Neuron,2008,59,4.0,547,PMC2614916,18760691,0,0,0,0,0,0


In [5]:
# Build the text and flag features for the labelled passages.
#
# Assign the filled column back rather than calling fillna(inplace=True) on the
# `df.text` accessor: the chained in-place form operates on an intermediate
# Series, emits a FutureWarning in recent pandas and does not update `df` at
# all once Copy-on-Write is enabled.
df['text'] = df['text'].fillna('')

# The three flags are computed on the raw text, before sep_urls rewrites it.
df['has_url'] = df['text'].apply(extract.has_urls)
df['has_parenth'] = df['text'].apply(check_paren)
df['repo'] = df['text'].apply(repo_label)

# Processed text, its syns() expansion, and the concatenation that is fed to
# the vectorizer.
df['text'] = df['text'].apply(sep_urls)
df['syn_text'] = df['text'].apply(syns)
df['all_text'] = df['text'] + ' ' + df['syn_text']

In [6]:
# Bag-of-words vectorizer for the passage text. 'english' selects scikit-learn's
# built-in English stop-word list (the same list as ENGLISH_STOP_WORDS) without
# going through the deprecated `sklearn.feature_extraction.stop_words` module.
cv = CountVectorizer(stop_words='english')
# One-hot encoder for the categorical columns; categories not seen during
# fitting are encoded as all zeros instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')

In [7]:
# Hold out 25% of the labelled passages, stratified on the target so both
# splits keep the same class balance; seeded for repeatability.
x_tr, x_tst, y_tr, y_tst = train_test_split(
    df['all_text'],
    df['data_statement'],
    test_size=.25,
    random_state=42,
    stratify=df['data_statement'],
)

In [8]:
# Fit the vectorizer and encoder on the training split only, then train the
# classifier and evaluate it on the held-out split.
cat_cols = ['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']

x_train = cv.fit_transform(x_tr)
one_hots_train = enc.fit_transform(df.loc[x_tr.index, cat_cols])
y_train = df.data_statement[x_tr.index]

x_test = cv.transform(df.all_text[x_tst.index])
# x_tst.index holds index *labels*, so select with .loc exactly as for the
# training rows; positional .iloc only gives the same rows while df keeps its
# default 0..n-1 index and would silently pick wrong rows otherwise.
one_hots_test = enc.transform(df.loc[x_tst.index, cat_cols])
y_test = df.data_statement[x_tst.index]

# Final design matrices: token counts followed by the one-hot columns.
x_train = hstack([x_train, one_hots_train])
x_test = hstack([x_test, one_hots_test])

# Seed the ensemble so the reported metrics (and the pool predictions made
# with this model later on) are reproducible across kernel restarts.
clf = EasyEnsembleClassifier(random_state=42)
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted']))
print(classification_report(y_test, y_pred))

Predicted    0   1
True              
0          548  33
1            3  50
              precision    recall  f1-score   support

           0       0.99      0.94      0.97       581
           1       0.60      0.94      0.74        53

    accuracy                           0.94       634
   macro avg       0.80      0.94      0.85       634
weighted avg       0.96      0.94      0.95       634



## Load all the papers

In [9]:
# Paper list whose `pmcid` column selects the papers to score.
nimh_papers = pd.read_csv('/data/riddleta/data_sharing_reuse/external/nimh_papers.csv')
# File index: for each paper, the `file` that contains it and its
# `paper_number` position inside that file, keyed by `pmcid`.
file_ix = pd.read_csv('/data/riddleta/data_sharing_reuse/external/file_index.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [10]:
# Store both pmcid columns as strings so the membership test that follows
# compares values of the same type.
file_ix['pmcid'] = file_ix['pmcid'].astype(str)
nimh_papers['pmcid'] = nimh_papers['pmcid'].astype(str)

In [11]:
# Keep only the file-index rows whose pmcid appears in the NIMH paper list.
in_nimh = file_ix['pmcid'].isin(nimh_papers['pmcid'])
target_papers = file_ix[in_nimh]
target_papers.shape

(57692, 3)

In [11]:
# Order rows by source file so papers stored in the same file are adjacent.
target_papers = target_papers.sort_values(by='file')
# Row positions (every 250th) at which the loading loop reports progress.
status_prints = range(0, len(target_papers), 250)
len(status_prints)

231

In [12]:
# Collect the passages of every target paper.
#
# Rows that point at the same file as the previous row reuse the JSON that was
# already parsed (`dat`), so each file is read once per run of consecutive rows.
data_collect = []
last_file = np.nan  # NaN never compares equal, so the first row always loads a file
files = target_papers.file.tolist()
paper_numbers = target_papers.paper_number.tolist()
for i, (file, paper_number) in enumerate(zip(files, paper_numbers)):
    if i in status_prints:
        print(i)
    if file != last_file:
        # Parse the new file and release the handle before extracting passages.
        with open(file) as infile:
            dat = json.load(infile)
        last_file = file
    # paper_number selects this paper's entry inside the loaded file.
    data_collect.extend(return_passages(dat[paper_number]))

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750
6000
6250
6500
6750
7000
7250
7500
7750
8000
8250
8500
8750
9000
9250
9500
9750
10000
10250
10500
10750
11000
11250
11500
11750
12000
12250
12500
12750
13000
13250
13500
13750
14000
14250
14500
14750
15000
15250
15500
15750
16000
16250
16500
16750
17000
17250
17500
17750
18000
18250
18500
18750
19000
19250
19500
19750
20000
20250
20500
20750
21000
21250
21500
21750
22000
22250
22500
22750
23000
23250
23500
23750
24000
24250
24500
24750
25000
25250
25500
25750
26000
26250
26500
26750
27000
27250
27500
27750
28000
28250
28500
28750
29000
29250
29500
29750
30000
30250
30500
30750
31000
31250
31500
31750
32000
32250
32500
32750
33000
33250
33500
33750
34000
34250
34500
34750
35000
35250
35500
35750
36000
36250
36500
36750
37000
37250
37500
37750
38000
38250
38500
38750
39000
39250
39500
39750
40000
40250
40500
40750
41000
41250
41500
41750
42000
42250
42500
42750
43000
43250
43

In [13]:
# One row per passage collected above; the five fields of each record are
# named positionally here.
df_pool = pd.DataFrame(data_collect)
df_pool.columns = ['context', 'paper_offset', 'pmcid', 'doi', 'section']
df_pool.head()

Unnamed: 0,context,paper_offset,pmcid,doi,section
0,Cognition and Mood-Related Behaviors in L3mbtl...,0,4388653,10.1371/journal.pone.0121252,TITLE
1,Alterations in histone lysine methylation and ...,65,4388653,10.1371/journal.pone.0121252,ABSTRACT
2,Introduction,1608,4388653,10.1371/journal.pone.0121252,INTRO
3,"Mood spectrum disorders, including depression ...",1621,4388653,10.1371/journal.pone.0121252,INTRO
4,Malignant Brain Tumor (MBT) domain chromatin ...,2704,4388653,10.1371/journal.pone.0121252,INTRO


In [16]:
# Load the pickled tokenizer and split every passage into pieces (presumably
# sentences, given the row-count note below), one row per piece.
# NOTE(review): pickle.load can execute arbitrary code; only load this file
# from a trusted location.
# The context manager closes the file even if unpickling fails.
with open('/data/riddleta/data_sharing_reuse/external/tokenizer.pk', 'rb') as tk_file:
    tokenizer = pickle.load(tk_file)
df_pool['context'] = df_pool['context'].apply(tokenizer.tokenize)
# explode() turns each list element into its own row, repeating the other columns.
df_pool = df_pool.explode('context')
df_pool.shape  # all sentences: 18406892

(18406892, 5)

In [17]:
# Discard rows whose section label is one of the excluded values.
excluded_sections = ['REF', 'TABLE', 'TITLE']
df_pool = df_pool[~df_pool['section'].isin(excluded_sections)]

In [18]:
# Attach journal metadata to every pool row and rebuild the text features the
# classifier was trained on.
df_pmcids = pd.read_csv('/data/riddleta/data_sharing_reuse/external/PMC-ids.csv')
# Drop the first three characters of PMCID (presumably the 'PMC' prefix, as in
# the labelled data) to obtain the bare id used as the merge key.
df_pmcids['pmcid'] = df_pmcids.PMCID.apply(lambda x: str(x)[3:])
# Left merge: pool rows without a metadata match are kept, with NaN metadata.
# NOTE(review): the merge runs before df_pool.pmcid is cast to str below, so it
# relies on that column already holding strings — confirm against return_passages.
df_pool = df_pool.merge(df_pmcids, how='left', on='pmcid')
df_pool['pmcid'] = df_pool['pmcid'].astype('str')
df_pool['offset'] = df_pool['paper_offset'].astype('str')
# Both parts are strings at this point, so a vectorized concatenation replaces
# the much slower row-wise apply(axis=1).
df_pool['pmcid-offset'] = df_pool['pmcid'] + '-' + df_pool['offset']
df_pool['context'] = df_pool['context'].astype('str')
df_pool['text'] = df_pool['context'].apply(sep_urls)
df_pool['syn_text'] = df_pool['text'].apply(syns)
# Fix: combine each pool row with its OWN synonym text. The previous code added
# `df.syn_text` (the labelled training frame), which aligned on index labels
# and therefore attached unrelated synonyms to some rows and NaN to the rest.
df_pool['all_text'] = df_pool['text'] + ' ' + df_pool['syn_text']

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


In [20]:
# Flag features for the pool rows, mirroring the labelled-data feature cell.
#
# Assign filled columns back instead of calling fillna(inplace=True) on an
# attribute accessor; the chained in-place form does not update the frame once
# pandas Copy-on-Write is enabled.
df_pool['text'] = df_pool['text'].fillna('')
# For the labelled data the flags are computed on the raw passage text, before
# sep_urls is applied. `text` here has already been through sep_urls, so use
# the raw `context` column to give the classifier features built the same way.
df_pool['has_url'] = df_pool['context'].apply(extract.has_urls)
df_pool['has_parenth'] = df_pool['context'].apply(check_paren)
df_pool['repo'] = df_pool['context'].apply(repo_label)
df_pool['all_text'] = df_pool['all_text'].fillna('')

In [24]:
# Score every pool row with the fitted vectorizer, encoder and classifier.
pool_categoricals = df_pool[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']]
x_pool = cv.transform(df_pool['all_text'])
one_hots_pool = enc.transform(pool_categoricals)

# Same column layout as at training time: token counts, then one-hot columns.
x_pool = hstack((x_pool, one_hots_pool))
y_pool_pred = clf.predict(x_pool)
# Number of rows predicted for each class.
pd.Series(y_pool_pred).value_counts()

In [30]:
# Store the predictions next to the rows and keep those predicted as class 1.
df_pool['data_sharing_pred'] = y_pool_pred
is_predicted_statement = df_pool['data_sharing_pred'] == 1
df_data_statements = df_pool[is_predicted_statement]

In [34]:
# Draw a fixed, seeded sample of 500 predicted statements and keep only the
# columns that go into the export.
export_cols = [
    'context', 'paper_offset', 'pmcid', 'doi', 'section',
    'Journal Title', 'text', 'has_url', 'has_parenth', 'repo',
]
statements_to_label = df_data_statements.sample(n=500, random_state=42)
out_file = statements_to_label[export_cols]

In [40]:
# Write the sampled rows to the interim folder; index=False leaves the
# DataFrame index out of the file.
out_file.to_csv('/data/riddleta/data_sharing_reuse/interim/high_recall_labelling.csv', index=False)