In [11]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction import stop_words
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, f1_score, precision_score, recall_score

from imblearn.ensemble import EasyEnsembleClassifier

import nltk
from nltk.corpus import wordnet

from scipy.sparse import hstack

from urlextract import URLExtract

import re

import sys

# Add the parent directory to the import path so the project-local `src` package can be imported.
# NOTE(review): presumes the notebook is run from a directory one level below the repository root — confirm.
sys.path.append('..')
from src.features.build_features import syns, sep_urls, check_paren, repo_label

import warnings
# Suppresses every warning for the rest of the session.
# NOTE(review): this also hides pandas/sklearn deprecation and chained-assignment warnings;
# consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

Here, we try to use two classifiers at once: one with relatively high recall and a second with relatively high precision. After training both on the same data, we apply both to the test set and keep only the items that both classifiers label as positive (i.e., as data statements).

Generally, this doesn't perform as we wanted. Even among the items that the high-recall classifier labels as positive, the high-precision classifier still fails to label some true data statements as positive. In other words, the recall of the high-precision classifier is middling on the full test set, and it remains middling on the subset of the test set that the high-recall classifier has already flagged as probable data statements.

In [2]:
# Single URLExtract instance; later cells call `extract.has_urls` on each text snippet.
extract = URLExtract()

In [3]:
# Location of the hand-labelled snippets (one row per text section).
# NOTE(review): absolute, machine-specific path — consider moving to a configurable data directory.
labels_csv = '/data/riddleta/data_sharing_reuse/external/combined_labels_incomplete.csv'

df = pd.read_csv(labels_csv)
df.head()

Unnamed: 0,data_statement,doi,pmcid,section,text,Journal Title,Year,Volume,Issue,Page,PMCID,PMID,cv_run0,cv_run1,cv_run2,cv_run3,cv_run4,cv_sum
0,0,10.1007/s11481-008-9113-7,2581635,DISCUSS,In the Golgi-impregnation experiment we found ...,J Neuroimmune Pharmacol,2008,3,4.0,241,PMC2581635,18594991,0,0,0,0,0,0
1,0,10.1016/j.biopsych.2008.07.009,2586327,METHODS,The individuals with at least 3 usable voxels ...,Biol Psychiatry,2008,64,10.0,856,PMC2586327,18707679,0,0,0,0,0,0
2,0,10.1016/j.brainres.2008.07.008,2612637,METHODS,Although our procedure was intended to target ...,Brain Res,2008,1230,,202,PMC2612637,18662678,0,0,0,0,0,0
3,0,10.1371/journal.pone.0004156,2612746,METHODS,"low status face pictures, and included genotyp...",PLoS One,2009,4,1.0,e4156,PMC2612746,19142220,0,0,0,0,0,0
4,0,10.1016/j.neuron.2008.07.022,2614916,METHODS,"For each word, participants made a recognition...",Neuron,2008,59,4.0,547,PMC2614916,18760691,0,0,0,0,0,0


In [4]:
# Replace missing text with the empty string. Assign the result back explicitly:
# calling fillna(..., inplace=True) on `df.text` operates on an intermediate object
# and is not guaranteed to update `df` (it does not under pandas Copy-on-Write).
df['text'] = df['text'].fillna('')

# Per-row indicator / label features derived from the raw text (before URL separation).
df['has_url'] = df['text'].apply(extract.has_urls)
df['has_parenth'] = df['text'].apply(check_paren)
df['repo'] = df['text'].apply(repo_label)

# Rewrite the text with `sep_urls`, then derive `syn_text` from the rewritten text.
df['text'] = df['text'].apply(sep_urls)
df['syn_text'] = df['text'].apply(syns)

# Text fed to the vectorizer: processed text followed by its `syns` output.
df['all_text'] = df['text'] + ' ' + df['syn_text']

In [5]:
# Distribution of repository labels produced by `repo_label`.
# NOTE(review): the recorded output lists ' ndar' and '(ndar)' as separate labels —
# possibly worth normalising upstream; confirm against `repo_label`.
df['repo'].value_counts()

no_repo_mentioned      2360
dbgap                    43
github                   30
fcon_1000.projects       18
nitrc                    16
 ndar                    11
loni.usc.edu             10
brain-map.org             9
osf.io                    9
humanconnectome.org       9
(ndar)                    8
zenodo                    4
figshare                  2
dryad                     2
openneuro                 1
fmridc                    1
Name: repo, dtype: int64

In [6]:
# Bag-of-words vectorizer using scikit-learn's built-in English stop-word list.
# Passing the string 'english' selects the same list as ENGLISH_STOP_WORDS without
# depending on the deprecated `sklearn.feature_extraction.stop_words` module.
cv = CountVectorizer(stop_words='english')

# One-hot encoder for the categorical columns; categories unseen during fit are
# encoded as all zeros at transform time instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')

In [82]:
# Hold out 25% of the rows for testing, stratified on the label so both splits
# keep the same proportion of data statements; fixed seed for a repeatable split.
x_tr, x_tst, y_tr, y_tst = train_test_split(
    df['all_text'],
    df['data_statement'],
    test_size=.25,
    random_state=42,
    stratify=df['data_statement'],
)

In [83]:
# Categorical columns that are one-hot encoded and appended to the bag-of-words matrix.
categorical_cols = ['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']

# Fit the vectorizer and the encoder on the training rows only, then apply them to the test rows.
# Rows are selected by index *label* (.loc) for both splits: `x_tr.index` / `x_tst.index` hold
# labels from `df`, so positional .iloc would pick the wrong rows if `df` ever lost its default index.
x_train = cv.fit_transform(x_tr)
one_hots_train = enc.fit_transform(df.loc[x_tr.index, categorical_cols])
y_train = y_tr
x_test = cv.transform(x_tst)
one_hots_test = enc.transform(df.loc[x_tst.index, categorical_cols])
y_test = y_tst

# Final design matrices: text counts followed by the one-hot columns.
x_train = hstack([x_train, one_hots_train])
x_test = hstack([x_test, one_hots_test])

# Seed both (stochastic) classifiers so that Restart & Run All reproduces the same tables.
clf_high_rec = EasyEnsembleClassifier(random_state=42)
clf_high_prec = RandomForestClassifier(random_state=42)

# NOTE: fit() returns the fitted estimator itself, so these two names are aliases of the
# classifiers rather than score arrays; the names are kept for backward compatibility.
y_score_high_rec = clf_high_rec.fit(x_train, y_train)
y_score_high_prec = clf_high_prec.fit(x_train, y_train)
y_pred_high_rec = clf_high_rec.predict(x_test)
y_pred_high_prec = clf_high_prec.predict(x_test)

# Confusion tables: high-recall classifier first, high-precision classifier second.
print(pd.crosstab(y_test, y_pred_high_rec, rownames=['True'], colnames=['Predicted']))
print(pd.crosstab(y_test, y_pred_high_prec, rownames=['True'], colnames=['Predicted']))

# Independent copy of the test rows, so later cells can add prediction columns
# without writing into (a possible view of) `df`.
df_test = df.loc[x_tst.index].copy()

Predicted    0   1
True              
0          551  30
1            2  51
Predicted    0   1
True              
0          579   2
1           16  37


In [84]:
# Attach both classifiers' test-set predictions. `assign` returns a new frame, so nothing
# is written into an object that pandas may treat as a slice of `df`
# (avoids SettingWithCopyWarning) and the cell is safe to re-run.
df_test = df_test.assign(
    high_rec_pred=y_pred_high_rec,
    high_prec_pred=y_pred_high_prec,
)

In [85]:
# Start with every test row predicted negative; a later cell sets selected rows to 1.
df_test['overall_pred'] = 0

In [86]:
# Combined rule: a row is predicted positive only when BOTH classifiers predict 1.
# Use a single .loc[row_mask, column] assignment; the chained form
# `df_test.overall_pred[mask] = 1` writes through an intermediate Series and is not
# guaranteed to modify `df_test` (it does not under pandas Copy-on-Write).
both_positive = (df_test['high_rec_pred'] == 1) & (df_test['high_prec_pred'] == 1)
df_test.loc[both_positive, 'overall_pred'] = 1

In [87]:
# How many test rows end up negative / positive under the combined rule.
df_test['overall_pred'].value_counts()

0    595
1     39
Name: overall_pred, dtype: int64

In [88]:
# Confusion table and per-class precision / recall / F1 for the combined prediction.
overall_confusion = pd.crosstab(
    y_test,
    df_test['overall_pred'],
    rownames=['True'],
    colnames=['Predicted'],
)
print(overall_confusion)

overall_report = classification_report(y_test, df_test['overall_pred'])
print(overall_report)

Predicted    0   1
True              
0          579   2
1           16  37
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       581
           1       0.95      0.70      0.80        53

    accuracy                           0.97       634
   macro avg       0.96      0.85      0.89       634
weighted avg       0.97      0.97      0.97       634

