In [1]:
import random
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
import re
from datasets import Dataset
from tqdm import tqdm

### Load the positive, negative, and test datasets

Test set (curator-labeled articles)

In [2]:
def filter_ab(string):
    """Return ``string`` with every newline and tab replaced by a single space.

    These two characters are the record and field separators of the TSV files
    written later in this notebook, so they must not occur inside a text field.
    """
    for separator in ('\n', '\t'):
        string = re.sub(separator, ' ', string)
    return string

In [3]:
# Load the curator-labeled spreadsheet and drop the 'Label', 'PMID_Links' and 'Unnamed: 6' columns.
curator_sheet = pd.read_excel('curator_labeled_dataset.xlsx', engine='openpyxl')
curator_labeled = curator_sheet.drop(columns=['Label', 'PMID_Links', 'Unnamed: 6'])

# Keep the PMIDs as strings so they can be compared with the string IDs read from the CSV files.
curator_pmids = [str(pmid) for pmid in curator_labeled['PMID']]

In [4]:
# Preview the first rows of the curator-labeled frame to sanity-check the loaded columns.
curator_labeled.head()

Unnamed: 0,PMID,Title,Abs,Validation
0,9207112,Evolution of the Friedreich's ataxia trinucleo...,"Friedreich's ataxia, the most frequent inherit...",Yes
1,19368978,Cleft palate lateral synechia syndrome: an opp...,OBJECTIVES: To report two cases of cleft palat...,no
2,7593576,Craniofacial conodysplasia.,A family with dominant inheritance of a previo...,no
3,16839424,"Cardiomyopathy, familial dilated.",Dilated cardiomyopathy (DCM) is a heart muscle...,yes
4,3409540,Autosomal dominant antecubital pterygium: synd...,An autosomal dominant (AD) antecubital pterygi...,no


In [5]:
# Flatten titles and abstracts so that none of them contains a newline or a tab.
test_titles = list(map(filter_ab, curator_labeled['Title']))
test_abstracts = list(map(filter_ab, curator_labeled['Abs']))

# Encode the curator verdicts by their first letter (case-insensitive): y... -> 1, n... -> 0.
# A verdict starting with any other character is kept unchanged.
curator_labels = [
    1 if verdict[0].lower() == 'y' else 0 if verdict[0].lower() == 'n' else verdict
    for verdict in curator_labeled['Validation']
]

In [6]:
# Write the curator-labeled test set as a two-column TSV ("abstract", "label").
# Each record is "<title> <abstract>\t<label>"; records are separated by newlines and,
# as before, the file does not end with a trailing newline.
test_rows = (
    title + ' ' + abstract_text + '\t' + str(label)
    for title, abstract_text, label in zip(test_titles, test_abstracts, curator_labels)
)

# Explicit UTF-8 so non-ASCII characters in the text do not depend on the platform's
# default encoding. The `with` block closes the file, so no separate close() is needed.
with open('epi_classify_test.tsv', "w", encoding='utf-8') as f:
    f.write('abstract\tlabel\n')
    f.write('\n'.join(tqdm(test_rows, total=len(test_titles))))

100%|██████████| 98/98 [00:00<00:00, 105314.32it/s]


Positive/negative datasets (training and validation)

In [7]:
# An abstract is kept only if it is strictly longer than this many characters.
MIN_ABSTRACT_CHARS = 75

# Set copy of the curator PMIDs for constant-time membership tests while filtering.
curator_pmid_set = set(curator_pmids)


def _read_labeled_csv(path, label):
    """Read one dataset CSV and return the parallel lists ``(abstracts, labels, pmids)``.

    The first row is skipped (treated as a header). Column 0 is used as the PMID and
    column 1 as the abstract text. A row is kept only when its abstract is longer than
    ``MIN_ABSTRACT_CHARS`` characters and its PMID is not one of the curator-labeled
    articles, so the curator test set never appears in the training/validation data.
    Kept abstracts are passed through ``filter_ab``; every kept row receives ``label``.
    """
    kept_abstracts, kept_labels, kept_pmids = [], [], []
    with open(path, 'r') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        next(reader)  # skip the header row
        for row in reader:
            pmid, abstract = row[0], row[1]
            if len(abstract) > MIN_ABSTRACT_CHARS and pmid not in curator_pmid_set:
                kept_abstracts.append(filter_ab(abstract))
                kept_labels.append(label)
                kept_pmids.append(pmid)
    return kept_abstracts, kept_labels, kept_pmids


abstracts = []
labels = []
pmids = []

# Negative examples (label 0) are read first, then positive examples (label 1),
# which preserves the original ordering of the combined lists.
for dataset_path, dataset_label in (("negative_dataset.csv", 0), ("positive_dataset.csv", 1)):
    new_abstracts, new_labels, new_pmids = _read_labeled_csv(dataset_path, dataset_label)
    abstracts.extend(new_abstracts)
    labels.extend(new_labels)
    pmids.extend(new_pmids)

print(len(labels), len(abstracts), len(pmids))

26334 26334 26334


In [8]:
# Pair each abstract with its label, then split 70% / 30% into training and validation sets.
# The fixed random_state makes the split reproducible.
combined = [pair for pair in zip(abstracts, labels)]
train_set, val_set = train_test_split(combined, train_size=0.7, random_state=4)
print(len(train_set), len(val_set))

18433 7901


### Save

In [9]:
# Write the training split as a two-column TSV ("abstract", "label").
# Records are newline-separated and, as before, the file has no trailing newline.
# Explicit UTF-8 avoids depending on the platform's default encoding; the `with`
# block closes the file, so no separate close() is needed.
with open('epi_classify_train.tsv', "w", encoding='utf-8') as f:
    f.write('abstract\tlabel\n')
    f.write('\n'.join(
        str(text) + '\t' + str(label) for text, label in tqdm(train_set)
    ))

100%|██████████| 18433/18433 [00:00<00:00, 176894.31it/s]


In [10]:
# Write the validation split as a two-column TSV ("abstract", "label").
# Records are newline-separated and, as before, the file has no trailing newline.
# Explicit UTF-8 avoids depending on the platform's default encoding; the `with`
# block closes the file, so no separate close() is needed.
with open('epi_classify_val.tsv', "w", encoding='utf-8') as f:
    f.write('abstract\tlabel\n')
    f.write('\n'.join(
        str(text) + '\t' + str(label) for text, label in tqdm(val_set)
    ))

100%|██████████| 7901/7901 [00:00<00:00, 173065.99it/s]


### Unused Code

Compress

In [None]:
import gzip

In [None]:
# Gzip-compress the training TSV. The file is read and written as bytes, so its content
# is copied verbatim; no intermediate bytearray copy is needed.
with open('epi_classify_train.tsv','rb') as tr, gzip.open('epi_classify_train.tsv.gz', "wb") as f:
    f.write(tr.read())

In [None]:
# Gzip-compress the validation TSV. The file is read and written as bytes, so its content
# is copied verbatim; no intermediate bytearray copy is needed.
with open('epi_classify_val.tsv','rb') as v, gzip.open('epi_classify_val.tsv.gz', "wb") as f:
    f.write(v.read())

In [None]:
# Gzip-compress the test TSV. The file is read and written as bytes, so its content
# is copied verbatim; no intermediate bytearray copy is needed.
with open('epi_classify_test.tsv','rb') as tst, gzip.open('epi_classify_test.tsv.gz', "wb") as f:
    f.write(tst.read())

In [None]:
# One record per training example. enumerate() starts at the first element, so no
# example is skipped and 'index' is the example's 0-based position in train_set.
# (Iterating from 1 dropped train_set[0], which is a real example, not a header.)
training = [
    {'abstract': text, 'label': label, 'index': position}
    for position, (text, label) in enumerate(train_set)
]

In [None]:
# One record per validation example. enumerate() starts at the first element, so no
# example is skipped and 'index' is the example's 0-based position in val_set.
# (Iterating from 1 dropped val_set[0], which is a real example, not a header.)
validation = [
    {'abstract': text, 'label': label, 'index': position}
    for position, (text, label) in enumerate(val_set)
]

In [None]:
# Group the two record lists under their split names.
my_dict = dict(train=training, validation=validation)

In [None]:
# Needs the third-party `jsonlines` package; install it into the kernel's environment
# first if it is missing (e.g. `%pip install jsonlines`).
import jsonlines

with jsonlines.open('training_format3.jsonl', mode='w') as jsonl_out:
    # The whole mapping is written as ONE record holding both splits,
    # not as one record per example.
    jsonl_out.write(my_dict)

In [None]:
import json

# Persist both splits as a single JSON document.
with open('epiclassify.json', 'w') as out_file:
    json.dump(my_dict, fp=out_file)

In [None]:
# Load the JSON Lines file written above into a DatasetDict.
# NOTE(review): `DatasetDict` is not among the imports visible at the top of this notebook
# (only `Dataset` is imported from `datasets`) — confirm it is imported before running this cell.
ds_dict = DatasetDict.from_json('training_format3.jsonl')

In [None]:
# Display ds_dict as the cell output for inspection.
ds_dict

In [None]:
# Build a Dataset directly from my_dict.
# NOTE(review): my_dict maps split names to lists of per-example records; presumably
# Dataset.from_dict expects a column-name -> values mapping — verify this input is intended.
ds = Dataset.from_dict(my_dict)

In [None]:
# Attempt to build a DatasetDict directly from my_dict.
# NOTE(review): `DatasetDict` is not among the imports visible at the top of this notebook,
# and it should be confirmed that DatasetDict provides a `from_dict` constructor — TODO verify.
ds_dict = DatasetDict.from_dict(my_dict)

In [None]:
# Display ds_dict as the cell output for inspection.
ds_dict

In [None]:
import json

# Write one JSON object per line. json.dumps() produces valid JSON (double-quoted keys
# and strings); str(dict) would write a Python repr that JSON parsers reject.
# The `with` block closes the file, so no separate close() is needed.
with open('training.json', "w") as f:
    for i, record in enumerate(training):
        f.write(json.dumps(record) + '\n')
        if i%500==0:
            print('abstract num',i,'done')

In [None]:
import json

# Write the VALIDATION records, one JSON object per line. This cell previously iterated
# over `training`, so validation.json received a copy of the training records.
# json.dumps() produces valid JSON; str(dict) would write a Python repr instead.
# The `with` block closes the file, so no separate close() is needed.
with open('validation.json', "w") as f:
    for i, record in enumerate(validation):
        f.write(json.dumps(record) + '\n')
        if i%500==0:
            print('abstract num',i,'done')

data format 1

In [None]:
# One record per training example. enumerate() starts at the first element, so no
# example is skipped and 'index' is the example's 0-based position in train_set.
# (Iterating from 1 dropped train_set[0], which is a real example, not a header.)
training = [
    {'abstract': text, 'label': label, 'index': position}
    for position, (text, label) in enumerate(train_set)
]

In [None]:
# One record per validation example. enumerate() starts at the first element, so no
# example is skipped and 'index' is the example's 0-based position in val_set.
# (Iterating from 1 dropped val_set[0], which is a real example, not a header.)
validation = [
    {'abstract': text, 'label': label, 'index': position}
    for position, (text, label) in enumerate(val_set)
]

In [None]:
import json

# JSON Lines output: one JSON object per line. json.dumps() produces valid JSON
# (double-quoted keys and strings); str(dict) would write a Python repr that
# JSON Lines readers reject.
# The `with` block closes the file, so no separate close() is needed.
with open('training.jsonl', "w") as f:
    for i, record in enumerate(training):
        f.write(json.dumps(record) + '\n')
        if i%500==0:
            print('abstract num',i,'done')

In [None]:
# Each split is a list of per-example records, built with a list comprehension
# (a dict literal cannot be combined with a `for` clause the way it was written).
# enumerate() covers every example, and 'index' is the 0-based position within the split.
data_format1 = {
    'train': [
        {'abstract': text, 'label': label, 'index': position}
        for position, (text, label) in enumerate(train_set)
    ],
    'validation': [
        {'abstract': text, 'label': label, 'index': position}
        for position, (text, label) in enumerate(val_set)
    ],
}

In [None]:
# Display data_format1 as the cell output for inspection.
data_format1

In [None]:
import json

# Save data_format1 as a single JSON document.
with open('data_format1.json', 'w') as out_file:
    json.dump(data_format1, fp=out_file)

In [None]:
# Load the 'validation' field of data_format1.json with the 'json' loader.
# NOTE(review): `load_dataset` is not among the imports visible at the top of this notebook
# (only `Dataset` is imported from `datasets`) — confirm it is imported before running this cell.
dataset = load_dataset('json', data_files='data_format1.json', field='validation')

In [None]:
# Display dataset as the cell output for inspection.
dataset

In [None]:
# Write the test set as CSV with csv.writer, which quotes fields containing commas or
# double quotes; plain string concatenation would split such text into extra columns.
# newline='' hands line-ending control to the csv module (as its documentation requires)
# and lineterminator='\n' keeps the '\n' record separator used before.
# Note: every row, including the last, now ends with a newline.
with open('epi_classify_test.csv', "w", newline='', encoding='utf-8') as f:
    csv_out = csv.writer(f, lineterminator='\n')
    csv_out.writerow(['abstract', 'label'])
    for title, abstract_text, label in zip(test_titles, test_abstracts, curator_labels):
        csv_out.writerow([title + ' ' + abstract_text, label])

In [None]:
# Write the training split as CSV with csv.writer, which quotes fields containing commas
# or double quotes; plain string concatenation would split such text into extra columns.
# newline='' hands line-ending control to the csv module (as its documentation requires)
# and lineterminator='\n' keeps the '\n' record separator used before.
# Note: every row, including the last, now ends with a newline.
with open('epi_classify_train.csv', "w", newline='', encoding='utf-8') as f:
    csv_out = csv.writer(f, lineterminator='\n')
    csv_out.writerow(['abstract', 'label'])
    for i, (text, label) in enumerate(train_set):
        csv_out.writerow([text, label])
        if i%500==0:
            print('abstract num',i,'done')

In [None]:
# Write the validation split as CSV with csv.writer, which quotes fields containing commas
# or double quotes; plain string concatenation would split such text into extra columns.
# newline='' hands line-ending control to the csv module (as its documentation requires)
# and lineterminator='\n' keeps the '\n' record separator used before.
# Note: every row, including the last, now ends with a newline.
with open('epi_classify_val.csv', "w", newline='', encoding='utf-8') as f:
    csv_out = csv.writer(f, lineterminator='\n')
    csv_out.writerow(['abstract', 'label'])
    for i, (text, label) in enumerate(val_set):
        csv_out.writerow([text, label])
        if i%500==0:
            print('abstract num',i,'done')