In [1]:
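# Dependencies (uncomment to install into the kernel's environment):
# %pip install beautifulsoup4 lxml nltk pandas
# sent_tokenize also needs NLTK's Punkt tokenizer data (see nltk.download).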

In [2]:
import pandas as pd
import glob
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize, word_tokenize

In [3]:
# Paths of the training XML documents.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the row order of the resulting dataset reproducible.
train_files = sorted(glob.glob("XML/train/*.xml"))

In [4]:
# Paths of the test XML documents.
# glob.glob returns matches in arbitrary (filesystem-dependent) order, so the
# list is sorted to make the row order of the resulting dataset reproducible.
test_files = sorted(glob.glob("XML/test/*.xml"))

In [5]:
# Empty frames for the two splits; both use the same per-pair column schema.
PAIR_COLUMNS = ['text', 'sentence', 'snp', 'phenotype', 'ASSOCIATION', 'CONFIDENCE']
train_df = pd.DataFrame(columns=PAIR_COLUMNS)
test_df = pd.DataFrame(columns=list(PAIR_COLUMNS))

In [6]:
# Build test_df: one row per annotated <pair> found in the test XML files.
#
# Rows are collected in a plain list and converted to a DataFrame once, instead
# of enlarging test_df cell-by-cell with .at[len(test_df), ...]. That pattern
# reallocates on every new row and, because it appends to the existing frame,
# duplicated every row whenever this cell was re-run. Rebuilding the frame from
# scratch makes the cell idempotent.
test_records = []

for test_file in test_files:

    with open(test_file, 'r', encoding='utf-8') as f:
        xml_data = f.read()

    pars_data = BeautifulSoup(xml_data, "xml")
    items = pars_data.find_all('sentence')

    # Full abstract text, split into sentences so the sentence mentioning a
    # given pair can be located below.
    text = pars_data.find_all('abstract')[0]['TEXT']
    sents = sent_tokenize(text)

    for item in items:
        # Entity ID -> surface text for the entities annotated in this <sentence>.
        # NOTE(review): the attribute is 'TEXT' on <snp> but 'text' on
        # <phenotype>; presumably this mirrors the corpus XML - confirm.
        snps_dic = {st['ID']: st['TEXT'] for st in item.find_all('snp')}
        phenotypes_dic = {pt['ID']: pt['text'] for pt in item.find_all('phenotype')}

        for pair in item.find_all('pair'):
            snp = snps_dic[pair['SNPID']]
            phenotype = phenotypes_dic[pair['PHENOTYPEID']]

            # Keep the LAST tokenized sentence that contains both mentions
            # (the same choice the original full scan made); the value stays
            # NaN when no single sentence contains both strings.
            sentence = next(
                (se for se in reversed(sents) if phenotype in se and snp in se),
                float('nan'),
            )

            test_records.append({
                'text': text,
                'sentence': sentence,
                'snp': snp,
                'phenotype': phenotype,
                'ASSOCIATION': pair['ASSOCIATION'],
                'CONFIDENCE': pair['CONFIDENCE'],
            })

# Explicit column list keeps the schema (and column order) even with no records.
test_df = pd.DataFrame(
    test_records,
    columns=['text', 'sentence', 'snp', 'phenotype', 'ASSOCIATION', 'CONFIDENCE'],
)

In [7]:
# Display the assembled test frame (one row per SNP/phenotype pair).
test_df

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
0,OBJECTIVE: To examine the effects of genetic p...,Random-effects meta-analyses failed to detect ...,rs180040,POAG,negative,-
1,OBJECTIVE: To examine the effects of genetic p...,Random-effects meta-analyses failed to detect ...,rs1056836,POAG,negative,-
2,OBJECTIVE: To examine the effects of genetic p...,Random-effects meta-analyses failed to detect ...,rs10012,POAG,negative,-
3,OBJECTIVE: To examine the effects of genetic p...,Random-effects meta-analyses failed to detect ...,rs1056827,POAG,negative,-
4,OBJECTIVE: To examine the effects of genetic p...,Random-effects meta-analyses failed to detect ...,rs1056837,POAG,negative,-
...,...,...,...,...,...,...
360,Metabolic syndrome (MetS) is a common multifac...,Analysis of 4 SNPs revealed a significant diff...,rs10757274,MetS,negative,-
361,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs6152,endometrial cancer,positive,weak
362,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1204038,endometrial cancer,positive,weak
363,Genetic variation in the androgen receptor (AR...,Minor alleles in three correlated ht SNPs (rs6...,rs1337082,endometrial cancer,positive,weak


In [8]:
# Build train_df: one row per annotated <pair> found in the training XML files.
#
# Rows are collected in a plain list and converted to a DataFrame once, instead
# of enlarging train_df cell-by-cell with .at[len(train_df), ...]. That pattern
# reallocates on every new row and, because it appends to the existing frame,
# duplicated every row whenever this cell was re-run. Rebuilding the frame from
# scratch makes the cell idempotent.
train_records = []

for train_file in train_files:

    with open(train_file, 'r', encoding='utf-8') as f:
        xml_data = f.read()

    pars_data = BeautifulSoup(xml_data, "xml")
    items = pars_data.find_all('sentence')

    # Full abstract text, split into sentences so the sentence mentioning a
    # given pair can be located below.
    text = pars_data.find_all('abstract')[0]['TEXT']
    sents = sent_tokenize(text)

    for item in items:
        # Entity ID -> surface text for the entities annotated in this <sentence>.
        # NOTE(review): the attribute is 'TEXT' on <snp> but 'text' on
        # <phenotype>; presumably this mirrors the corpus XML - confirm.
        snps_dic = {st['ID']: st['TEXT'] for st in item.find_all('snp')}
        phenotypes_dic = {pt['ID']: pt['text'] for pt in item.find_all('phenotype')}

        for pair in item.find_all('pair'):
            snp = snps_dic[pair['SNPID']]
            phenotype = phenotypes_dic[pair['PHENOTYPEID']]

            # Keep the LAST tokenized sentence that contains both mentions
            # (the same choice the original full scan made); the value stays
            # NaN when no single sentence contains both strings.
            sentence = next(
                (se for se in reversed(sents) if phenotype in se and snp in se),
                float('nan'),
            )

            train_records.append({
                'text': text,
                'sentence': sentence,
                'snp': snp,
                'phenotype': phenotype,
                'ASSOCIATION': pair['ASSOCIATION'],
                'CONFIDENCE': pair['CONFIDENCE'],
            })

# Explicit column list keeps the schema (and column order) even with no records.
train_df = pd.DataFrame(
    train_records,
    columns=['text', 'sentence', 'snp', 'phenotype', 'ASSOCIATION', 'CONFIDENCE'],
)

In [9]:
# Show the last rows of the training frame as a quick sanity check.
train_df.tail()

Unnamed: 0,text,sentence,snp,phenotype,ASSOCIATION,CONFIDENCE
930,OBJECTIVE: GH deficiency (GHD) in adults is as...,"RESULTS: At baseline, the minor alleles of cho...",rs35136575,higher serum TC,neutral,zero
931,OBJECTIVE: GH deficiency (GHD) in adults is as...,"RESULTS: At baseline, the minor alleles of cho...",rs35136575,lower TC concentrations,neutral,zero
932,OBJECTIVE: GH deficiency (GHD) in adults is as...,"RESULTS: At baseline, the minor alleles of cho...",rs35136575,higher serum HDL-C,neutral,zero
933,OBJECTIVE: GH deficiency (GHD) in adults is as...,"RESULTS: At baseline, the minor alleles of cho...",rs35136575,lower LDL-C,neutral,zero
934,OBJECTIVE: GH deficiency (GHD) in adults is as...,"RESULTS: At baseline, the minor alleles of cho...",rs35136575,higher LDL-C,neutral,zero


In [10]:
# Count of training rows per CONFIDENCE value.
train_df['CONFIDENCE'].value_counts()

weak        375
strong      219
zero        142
moderate    108
-            91
Name: CONFIDENCE, dtype: int64

In [11]:
# Count of training rows per ASSOCIATION value.
train_df['ASSOCIATION'].value_counts()

positive    702
neutral     142
negative     91
Name: ASSOCIATION, dtype: int64

In [12]:
# Count of test rows per CONFIDENCE value.
test_df['CONFIDENCE'].value_counts()

zero        166
weak        140
-            29
moderate     16
strong       14
Name: CONFIDENCE, dtype: int64

In [13]:
# Count of test rows per ASSOCIATION value.
test_df['ASSOCIATION'].value_counts()

positive    170
neutral     166
negative     29
Name: ASSOCIATION, dtype: int64

In [14]:
# Write both splits to CSV in the working directory, without the index column.
for frame, csv_path in ((test_df, 'test.csv'), (train_df, 'train.csv')):
    frame.to_csv(csv_path, index=False)