In [38]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction import stop_words
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression

from imblearn.ensemble import EasyEnsembleClassifier

import re

from urlextract import URLExtract

from scipy.sparse import hstack

import numpy as np

import json

import pickle

import sys

import seaborn as sns

sys.path.append('..')
from src.features.build_features import syns, sep_urls, check_paren, repo_label
from src.data.make_dataset import return_passages, test_suitability

In this notebook, I'm doing the following:

1. Split data into train and test set.
2. Train high recall clf on training and apply to test set.
3. Filter the test set to just those predicted to be data statements by the high recall classifier.
4. Split that set into train and test.
5. Train a high precision classifier on the sub training set.
6. Predict on final test set.

In [2]:
def code_kw(text):
    passage_marked = 0
    
    reg_matches = re.compile(r"""(software)|(tool)|(code)|(package)|(\sR\s)|(python)|
                                 (matlab)|(SPM8)|(implement.)""", re.X|re.VERBOSE)
    
    m = re.search(reg_matches, text.lower())
    if m:
        return(1)
    else:
        return(0)

In [60]:
df_labeled = pd.read_csv('/data/riddleta/data_sharing_reuse/interim/high_recall_labelling - high_recall_labelling.csv')
df_labeled['data_statement'] = df_labeled.n2.replace({'c':0, 'n':0, '2':1, 'd':1, 'n2':0, 'nd':0})

In [61]:
extract = URLExtract()
df = pd.read_csv('/data/riddleta/data_sharing_reuse/external/combined_labels_incomplete.csv')
df = pd.concat([df[['text', 'section', 'doi', 'Journal Title', 
                   'pmcid', 'data_statement']],
              df_labeled[['text', 'section', 'doi', 'Journal Title', 
                          'pmcid', 'data_statement']]])
df.text.fillna('', inplace=True)
df.shape

(3033, 6)

In [62]:
df_nimh = pd.read_csv('/data/riddleta/data_sharing_reuse/external/nimh_papers.csv')
df_nimh['Year'] = df_nimh['journal_year']
df_nimh = df_nimh[['pmcid', 'Year']].drop_duplicates()
df = df.merge(df_nimh, how='left', on='pmcid')
df.head()

Unnamed: 0,text,section,doi,Journal Title,pmcid,data_statement,Year
0,In the Golgi-impregnation experiment we found ...,DISCUSS,10.1007/s11481-008-9113-7,J Neuroimmune Pharmacol,2581635,0,2008
1,The individuals with at least 3 usable voxels ...,METHODS,10.1016/j.biopsych.2008.07.009,Biol Psychiatry,2586327,0,2008
2,Although our procedure was intended to target ...,METHODS,10.1016/j.brainres.2008.07.008,Brain Res,2612637,0,2008
3,"low status face pictures, and included genotyp...",METHODS,10.1371/journal.pone.0004156,PLoS One,2612746,0,2009
4,"For each word, participants made a recognition...",METHODS,10.1016/j.neuron.2008.07.022,Neuron,2614916,0,2008


In [63]:
df['has_url'] = df.text.apply(lambda x: extract.has_urls(x))
df['has_parenth'] = df.text.apply(lambda x: check_paren(x))
df['repo'] = df.text.apply(lambda x: repo_label(x))
df['text'] = df.text.apply(lambda x: sep_urls(x))
df['syn_text'] = df.text.apply(lambda x: syns(x))
df['all_text'] = df.text + ' ' + df.syn_text

In [64]:
cv = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
enc = OneHotEncoder(handle_unknown='ignore')

x_tr, x_tst, y_tr, y_tst = train_test_split(df.all_text, df.data_statement, test_size=.33, random_state=42, stratify=df.data_statement)

In [65]:
x_train = cv.fit_transform(x_tr)
one_hots_train = enc.fit_transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].loc[x_tr.index])
y_train = df.data_statement[x_tr.index]
x_test = cv.transform(df.all_text[x_tst.index])
one_hots_test = enc.transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].iloc[x_tst.index])
y_test = df.data_statement[x_tst.index]

x_train = hstack([x_train, one_hots_train])
x_test = hstack([x_test, one_hots_test])
#x_res, y_res = ros.fit_resample(x_train, y_train)

clf = EasyEnsembleClassifier() #should have set random seed here
#y_score = clf.fit(x_res, y_res).decision_function(x_test)
y_score = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
y_pred_proba = clf.predict_proba(x_test)
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted']))
print(classification_report(y_test, y_pred))

Predicted    0    1
True               
0          804   75
1            3  119
              precision    recall  f1-score   support

           0       1.00      0.91      0.95       879
           1       0.61      0.98      0.75       122

    accuracy                           0.92      1001
   macro avg       0.80      0.95      0.85      1001
weighted avg       0.95      0.92      0.93      1001



# apply code detector to hold-out set

In [66]:
df_test = df.loc[x_tst.index]
df_test['kw_code'] = df_test.text.apply(lambda x: code_kw(x))

In [67]:
df_test['y_pred'] = y_pred
df_test['y_prob'] = y_pred_proba[:,1]

In [68]:
df_test_rnd2 = df_test[df_test.y_pred==1]
df_test_rnd2.reset_index(inplace=True)

In [70]:
x_tr, x_tst, y_tr, y_tst = train_test_split(df_test_rnd2.kw_code, df_test_rnd2.data_statement, test_size=.25, random_state=42, stratify=df_test_rnd2.data_statement)
x_tr = df_test_rnd2[['kw_code', 'y_prob']].loc[x_tr.index]
x_tst = df_test_rnd2[['kw_code', 'y_prob']].loc[x_tst.index]
#x_tr.shape
#x_tr = np.reshape(x_tr.tolist(), (x_tr.shape[0], 1))
#x_tst = np.reshape(x_tst.tolist(), (x_tst.shape[0], 1))

In [71]:
clf_log = LogisticRegression(random_state=42)
clf.fit(x_tr, y_tr)
y_pred = clf.predict(x_tst)

In [72]:
print(pd.crosstab(y_tst, y_pred, rownames=['True'], colnames=['Predicted']))
print(classification_report(y_tst, y_pred))

Predicted   0   1
True             
0          12   7
1          10  20
              precision    recall  f1-score   support

           0       0.55      0.63      0.59        19
           1       0.74      0.67      0.70        30

    accuracy                           0.65        49
   macro avg       0.64      0.65      0.64        49
weighted avg       0.67      0.65      0.66        49

