In [121]:
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.feature_extraction import stop_words
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc, f1_score, precision_score, recall_score

import nltk
from nltk.corpus import wordnet

from scipy.sparse import hstack

from urlextract import URLExtract

import re

import sys

# Add the parent directory to the import path so the project-local `src`
# package imported on the next line can be resolved
# (presumably the notebook sits one level below the repository root - confirm).
sys.path.append('..')
from src.features.build_features import syns, sep_urls, check_paren, repo_label

import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# which includes pandas chained-assignment warnings and library deprecation
# notices raised by the cells below.
warnings.filterwarnings('ignore')

In [122]:
# Shared URL extractor; its has_urls() method is applied to each text row
# further down to build the `has_url` column.
extract = URLExtract()

In [123]:
# Read the input CSV into `df` and preview the first rows.
labels_csv = '/data/riddleta/data_sharing_reuse/external/combined_labels_incomplete.csv'
df = pd.read_csv(labels_csv)
df.head()

Unnamed: 0,data_statement,doi,pmcid,section,text,Journal Title,Year,Volume,Issue,Page,PMCID,PMID,cv_run0,cv_run1,cv_run2,cv_run3,cv_run4,cv_sum
0,0,10.1007/s11481-008-9113-7,2581635,DISCUSS,In the Golgi-impregnation experiment we found ...,J Neuroimmune Pharmacol,2008,3,4.0,241,PMC2581635,18594991,0,0,0,0,0,0
1,0,10.1016/j.biopsych.2008.07.009,2586327,METHODS,The individuals with at least 3 usable voxels ...,Biol Psychiatry,2008,64,10.0,856,PMC2586327,18707679,0,0,0,0,0,0
2,0,10.1016/j.brainres.2008.07.008,2612637,METHODS,Although our procedure was intended to target ...,Brain Res,2008,1230,,202,PMC2612637,18662678,0,0,0,0,0,0
3,0,10.1371/journal.pone.0004156,2612746,METHODS,"low status face pictures, and included genotyp...",PLoS One,2009,4,1.0,e4156,PMC2612746,19142220,0,0,0,0,0,0
4,0,10.1016/j.neuron.2008.07.022,2614916,METHODS,"For each word, participants made a recognition...",Neuron,2008,59,4.0,547,PMC2614916,18760691,0,0,0,0,0,0


In [124]:
# Build the derived feature columns consumed by the cross-validation cell.

# Replace missing text with the empty string. Plain assignment is used instead
# of `df.text.fillna('', inplace=True)`: the in-place form mutates the Series
# returned by attribute access, which pandas does not guarantee to write back
# to `df` (it silently does nothing when copy-on-write is enabled).
df['text'] = df['text'].fillna('')

# Flags derived from the text as loaded, i.e. before sep_urls() rewrites it.
df['has_url'] = df['text'].apply(extract.has_urls)
df['has_parenth'] = df['text'].apply(check_paren)
df['repo'] = df['text'].apply(repo_label)

# Order matters here: syns() is applied to the output of sep_urls().
df['text'] = df['text'].apply(sep_urls)
df['syn_text'] = df['text'].apply(syns)

# Text fed to the vectorizer: processed text followed by the syns() output.
df['all_text'] = df['text'] + ' ' + df['syn_text']


In [125]:
# Repeated stratified 3-fold cross-validation.
#
# Five runs are performed, each with a different shuffle seed. Within a run,
# every row is predicted exactly once by an AdaBoost model trained on the other
# two folds; those out-of-fold predictions are stored in `out_dat` as columns
# cv_run0 .. cv_run4.
seed = 42

# Start from the feature frame without the prediction columns that were loaded
# from the input file. drop() returns a new frame, so `df` itself is untouched.
out_dat = df.drop(columns=['cv_run0', 'cv_run1', 'cv_run2', 'cv_run3', 'cv_run4', 'cv_sum'])

# Categorical / flag columns that are one-hot encoded alongside the bag of words.
cat_cols = ['section', 'Journal Title', 'Year', 'has_url', 'has_parenth', 'repo']

for i in range(0, 5):
    s = 'cv_run'+str(i)
    print(s)
    out_dat[s] = -99  # sentinel: any row still at -99 afterwards was never predicted
    s_pos = out_dat.columns.get_loc(s)

    kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=seed)
    cv = CountVectorizer(stop_words=stop_words.ENGLISH_STOP_WORDS)
    enc = OneHotEncoder(handle_unknown='ignore')

    for train_index, test_index in kfold.split(df.all_text, df.data_statement):
        # split() yields *positional* indices, so select rows with .iloc rather
        # than label-based lookup (which is only equivalent on a default index).
        train_rows = df.iloc[train_index]
        test_rows = df.iloc[test_index]

        # Vectorizer and encoder are fitted on the training fold only and then
        # applied to the held-out fold.
        x_train = hstack([cv.fit_transform(train_rows['all_text']),
                          enc.fit_transform(train_rows[cat_cols])])
        x_test = hstack([cv.transform(test_rows['all_text']),
                         enc.transform(test_rows[cat_cols])])
        y_train = train_rows['data_statement']

        # Seed the classifier as well so a run is reproducible end to end.
        clf = AdaBoostClassifier(random_state=seed)
        clf.fit(x_train, y_train)
        y_pred = clf.predict(x_test)

        # Single positional write into the frame. The previous chained form
        # `out_dat[s].iloc[test_index] = y_pred` assigns into a temporary
        # Series and is not guaranteed to reach `out_dat`.
        out_dat.iloc[test_index, s_pos] = y_pred

    seed = seed+1

cv_run0
cv_run1
cv_run2
cv_run3
cv_run4


In [126]:
# Per-row total of the five cross-validation prediction columns.
# skipna=False keeps missing values propagating the same way `+` would.
cv_run_cols = ['cv_run' + str(k) for k in range(5)]
out_dat['cv_sum'] = out_dat[cv_run_cols].sum(axis=1, skipna=False)

In [127]:
# Persist the frame with the cross-validation predictions (row index omitted).
preds_csv = '/data/riddleta/data_sharing_reuse/processed/cross_val_preds.csv'
out_dat.to_csv(preds_csv, index=False)