In [315]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction import stop_words
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, f1_score, precision_score, recall_score

from imblearn.ensemble import EasyEnsembleClassifier

import nltk
from nltk.corpus import wordnet

from scipy.sparse import hstack

from urlextract import URLExtract

import re

import sys

sys.path.append('..')
from src.features.build_features import syns, sep_urls, check_paren, repo_label

In [296]:
extract = URLExtract()

In [297]:
df = pd.read_csv('/data/riddleta/data_sharing_reuse/external/combined_labels_incomplete.csv')
df.head()

Unnamed: 0,data_statement,doi,pmcid,section,text,Journal Title,Year,Volume,Issue,Page,PMCID,PMID,cv_run0,cv_run1,cv_run2,cv_run3,cv_run4,cv_sum
0,0,10.1007/s11481-008-9113-7,2581635,DISCUSS,In the Golgi-impregnation experiment we found ...,J Neuroimmune Pharmacol,2008,3,4.0,241,PMC2581635,18594991,0,0,0,0,0,0
1,0,10.1016/j.biopsych.2008.07.009,2586327,METHODS,The individuals with at least 3 usable voxels ...,Biol Psychiatry,2008,64,10.0,856,PMC2586327,18707679,0,0,0,0,0,0
2,0,10.1016/j.brainres.2008.07.008,2612637,METHODS,Although our procedure was intended to target ...,Brain Res,2008,1230,,202,PMC2612637,18662678,0,0,0,0,0,0
3,0,10.1371/journal.pone.0004156,2612746,METHODS,"low status face pictures, and included genotyp...",PLoS One,2009,4,1.0,e4156,PMC2612746,19142220,0,0,0,0,0,0
4,0,10.1016/j.neuron.2008.07.022,2614916,METHODS,"For each word, participants made a recognition...",Neuron,2008,59,4.0,547,PMC2614916,18760691,0,0,0,0,0,0


In [298]:
df.text.fillna('', inplace=True)
df['has_url'] = df.text.apply(lambda x: extract.has_urls(x))
df['has_parenth'] = df.text.apply(lambda x: check_paren(x))
df['repo'] = df.text.apply(lambda x: repo_label(x))
df['text'] = df.text.apply(lambda x: sep_urls(x))
df['syn_text'] = df.text.apply(lambda x: syns(x))
df['all_text'] = df.text + ' ' + df.syn_text

In [299]:
df.repo.value_counts()

no_repo_mentioned      2367
dbgap                    43
github                   30
fcon_1000.projects       18
nitrc                    16
 ndar                    12
loni.usc.edu             10
osf.io                    9
brain-map.org             9
humanconnectome.org       9
zenodo                    4
figshare                  2
dryad                     2
fmridc                    1
openneuro                 1
Name: repo, dtype: int64

In [300]:
cv = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
enc = OneHotEncoder(handle_unknown='ignore')

In [301]:
x_tr, x_tst, y_tr, y_tst = train_test_split(df.all_text, df.data_statement, test_size=.25, random_state=42, stratify=df.data_statement)

In [302]:

x_train = cv.fit_transform(x_tr)
one_hots_train = enc.fit_transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].loc[x_tr.index])
y_train = df.data_statement[x_tr.index]
x_test = cv.transform(df.all_text[x_tst.index])
one_hots_test = enc.transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].iloc[x_tst.index])
y_test = df.data_statement[x_tst.index]

x_train = hstack([x_train, one_hots_train])
x_test = hstack([x_test, one_hots_test])
#x_res, y_res = ros.fit_resample(x_train, y_train)

clf = AdaBoostClassifier()
#y_score = clf.fit(x_res, y_res).decision_function(x_test)
y_score = clf.fit(x_train, y_train).decision_function(x_test)
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))



[[572   9]
 [ 11  42]]
              precision    recall  f1-score   support

           0       0.98      0.98      0.98       581
           1       0.82      0.79      0.81        53

    accuracy                           0.97       634
   macro avg       0.90      0.89      0.90       634
weighted avg       0.97      0.97      0.97       634



## Post-Francisco meeting (1/24)

Francisco suggested a few changes. First, look into adjustments to the classifier to deal with class imbalance. Second, in light of the reasonably good performance achieved by a single, small decision tree, it may be that a random forest classifier can yield mor eimprovements. Below implements those change

### Adaboost w/ class imbalance adjustments

I found an extension of scikit learn that specifically deals with imbalanced data. In their documentation, they describe the following classifier:

The EasyEnsembleClassifier allows to bag AdaBoost learners which are trained on balanced bootstrap samples LWZ2009.
 
with LWZ2009 referring to the following paper:

X. Y. Liu, J. Wu and Z. H. Zhou, “Exploratory Undersampling for Class-Imbalance Learning,” in IEEE Transactions on Systems, Man, and Cybernetics, Part B (Cybernetics), vol. 39, no. 2, pp. 539-550, April 2009.

The below implements this modification. We see that we're now capturing nearly every data statement, though the precision has dropped a bit. I think for future labeling, this is a worthwhile trade-off.

In [314]:

x_train = cv.fit_transform(x_tr)
one_hots_train = enc.fit_transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].loc[x_tr.index])
y_train = df.data_statement[x_tr.index]
x_test = cv.transform(df.all_text[x_tst.index])
one_hots_test = enc.transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].iloc[x_tst.index])
y_test = df.data_statement[x_tst.index]

x_train = hstack([x_train, one_hots_train])
x_test = hstack([x_test, one_hots_test])
#x_res, y_res = ros.fit_resample(x_train, y_train)

clf = EasyEnsembleClassifier()
#y_score = clf.fit(x_res, y_res).decision_function(x_test)
y_score = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted']))
print(classification_report(y_test, y_pred))

Predicted    0   1
True              
0          552  29
1            2  51
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       581
           1       0.64      0.96      0.77        53

    accuracy                           0.95       634
   macro avg       0.82      0.96      0.87       634
weighted avg       0.97      0.95      0.96       634



### Random Forest

Generally this doesn't appear to work as well as adaboost. Changing the class weights appears to have negligible impact.

In [318]:
x_train = cv.fit_transform(x_tr)
one_hots_train = enc.fit_transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].loc[x_tr.index])
y_train = df.data_statement[x_tr.index]
x_test = cv.transform(df.all_text[x_tst.index])
one_hots_test = enc.transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].iloc[x_tst.index])
y_test = df.data_statement[x_tst.index]

x_train = hstack([x_train, one_hots_train])
x_test = hstack([x_test, one_hots_test])
#x_res, y_res = ros.fit_resample(x_train, y_train)

clf = RandomForestClassifier()
#y_score = clf.fit(x_res, y_res).decision_function(x_test)
y_score = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted']))
print(classification_report(y_test, y_pred))

Predicted    0   1
True              
0          579   2
1           16  37
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       581
           1       0.95      0.70      0.80        53

    accuracy                           0.97       634
   macro avg       0.96      0.85      0.89       634
weighted avg       0.97      0.97      0.97       634



In [324]:
x_train = cv.fit_transform(x_tr)
one_hots_train = enc.fit_transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].loc[x_tr.index])
y_train = df.data_statement[x_tr.index]
x_test = cv.transform(df.all_text[x_tst.index])
one_hots_test = enc.transform(df[['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']].iloc[x_tst.index])
y_test = df.data_statement[x_tst.index]

x_train = hstack([x_train, one_hots_train])
x_test = hstack([x_test, one_hots_test])
#x_res, y_res = ros.fit_resample(x_train, y_train)

clf = RandomForestClassifier(class_weight
                            ) # adjusting this hasn't dramatically changed the predictions.
#y_score = clf.fit(x_res, y_res).decision_function(x_test)
y_score = clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)
print(pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted']))
print(classification_report(y_test, y_pred))

Predicted    0   1
True              
0          574   7
1           20  33
              precision    recall  f1-score   support

           0       0.97      0.99      0.98       581
           1       0.82      0.62      0.71        53

    accuracy                           0.96       634
   macro avg       0.90      0.81      0.84       634
weighted avg       0.95      0.96      0.95       634

