# processing_neg_all

Preprocessing the negative data.

1. Remove records that have no publication year.
2. Keep only records whose year lies in the range [1985, 2020].

## Load data

In [1]:
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
# Positive samples. Every column is read as text (dtype=str) so identifiers such as
# the pmid keep their exact string form, and missing cells become empty strings.
# NOTE: the former `squeeze=True` argument was dropped. It was deprecated in
# pandas 1.4 and removed in 2.0 (where it raises TypeError), and it only ever
# affected single-column files, which this one is not.
pos_df = pd.read_csv('../csv_data/train.csv', encoding='utf-8', dtype=str)
pos_df = pos_df.fillna('')
# PMIDs of the positive set; used further down to exclude them from the negatives.
pos_pmid = pos_df['pmid'].to_list()
pos_df

Unnamed: 0,pmid,title,abstract,doi,year,authors
0,22115527,Erroneous attribution of relevant transcriptio...,Background: Cis-regulatory modules are bound b...,10.1186/1471-2164-12-578,2011,"Marc S Halfon, Qianqian Zhu, Elizabeth R Brenn..."
1,2478402,Dorsal expression of the Drosophila z600 gene ...,The Drosophila z600 gene is a member of an ove...,10.1016/0012-1606(89)90143-7,1989,"R A Schulz, J L Miksch"
2,9115734,The Drosophila islet gene governs axon pathfin...,We have isolated the Drosophila homolog of the...,10.1016/s0896-6273(00)81241-6,1997,"S Thor, J B Thomas"
3,7900988,Negative autoregulation by Ultrabithorax contr...,The Drosophila homeotic gene Ultrabithorax (Ub...,,1993,"K D Irvine, J Botas, S Jha, R S Mann, D S Hogness"
4,24186975,3D chromatin interactions organize Yan chromat...,Long-range integration of transcriptional inpu...,10.1101/gad.225789.113,2013,"Jemma L Webber, Jie Zhang, Aaron Mitchell-Dick..."
...,...,...,...,...,...,...
714,1363225,Ultrabithorax is a regulator of beta 3 tubulin...,beta 3 tubulin expression accompanies the spec...,,1992,"U Hinz, A Wolk, R Renkawitz-Pohl"
715,20668662,Robust target gene discovery through transcrip...,A comprehensive systems-level understanding of...,10.1371/journal.pbio.1000435,2010,"Stein Aerts, Xiao-Jiang Quan, Annelies Claeys,..."
716,11290304,Drosophila OVO regulates ovarian tumor transcr...,Evolutionarily conserved ovo loci encode devel...,,2001,"J Lü, B Oliver"
717,23597484,Regional modulation of a stochastically expres...,Stochastic mechanisms are sometimes utilized t...,10.1016/j.devcel.2013.02.016,2013,"Shivani U Thanawala, Jens Rister, Gregory W Go..."


In [3]:
# Negative samples. The file is read without a header row (header=None), so the
# column names are attached explicitly; all values are kept as text and missing
# cells are replaced by empty strings.
neg_df = (
    pd.read_csv(
        '../csv_data/neg_all-combine.csv',
        encoding='utf-8',
        dtype=str,
        header=None,
    )
    .set_axis(['title', 'pmid', 'doi', 'abstract', 'year', 'authors'], axis=1)
    .fillna('')
)
neg_df

Unnamed: 0,title,pmid,doi,abstract,year,authors
0,The microbiota of Drosophila suzukii influence...,31763075,10.7717/peerj.8097,Microorganisms play a central role in the biol...,2019,"Hiruni Dodangoda, Rita Ntim-Gyakari, Peter D N..."
1,Evolution of a central neural circuit underlie...,29995860,10.1038/s41586-018-0322-9,Courtship rituals serve to reinforce reproduct...,2018,"David L Stern, Vanessa Ruta, Laura F Seeholzer..."
2,Immunity in Drosophila melanogaster--from micr...,25421701,10.1038/nri3763,Since the discovery of antimicrobial peptide r...,2014,"Sara Cherry, Nicolas Buchon, Neal Silverman"
3,Carnivory in the larvae of Drosophila melanoga...,30341324,10.1038/s41598-018-33906-w,Drosophila melanogaster is widely used as a mo...,2018,Daxiang Yang
4,Can Drosophila melanogaster tell who's who?,30356241,10.1371/journal.pone.0205043,Drosophila melanogaster are known to live in a...,2018,"Jonathan Schneider, Nihal Murali, Graham W Tay..."
...,...,...,...,...,...,...
93113,The ins and outs of EGFR asymmetry,20723751,10.1016/j.cell.2010.08.003,The epidermal growth factor receptor (EGFR) re...,2010,Daniel J Leahy
93114,Cloning of CDP-diacylglycerol synthase from a ...,8863531,10.1046/j.1471-4159.1996.67052200.x,A critical step in the supply of substrate for...,1996,"M D Uhler, A M Heacock, B W Agranoff"
93115,Acclimation and selection for increased resist...,8852846,,Direct selection for increased resistance to a...,1996,"R A Krebs, V Loeschcke"
93116,Synergistic effects on dopamine cell death in ...,25160001,10.1016/j.neuro.2014.08.005,The neurodegenerative effects of Parkinson's d...,2014,"Vanessa Nunez, David E Krantz, Khadij Assani, ..."


## Processing year

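1. Check that the abstracts contain no redundant whitespace (offending rows are only reported, not modified).
2. Keep only records whose year lies in the range [1985, 2020].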

In [4]:
# Positions (0-based row numbers) of the rows that must be dropped because their
# year is missing, malformed, or outside [1985, 2020].
remove_idx = []

# Extract the two columns once. Indexing neg_df.iloc[idx] inside the loop would
# build a full row Series on every access.
abstracts = neg_df['abstract'].tolist()
years = neg_df['year'].tolist()

for idx, (abstract, year) in enumerate(zip(abstracts, years)):
    # Sanity check only: report abstracts that still contain redundant whitespace
    # (leading/trailing blanks, runs of blanks, tabs, newlines). Nothing is changed.
    if abstract != ' '.join(abstract.split()):
        print('error:', idx)

    # str.isdecimal() is True only for strings made of decimal digits, which int()
    # is guaranteed to parse. str.isnumeric() also accepts characters such as
    # '²' or '½' that make int() raise ValueError. The empty string fails the
    # test as well, so rows without a year are flagged here.
    if not year.isdecimal():
        remove_idx.append(idx)
    elif not (1985 <= int(year) <= 2020):  # year outside the accepted range
        remove_idx.append(idx)

In [5]:
# Report how many rows were flagged before and after de-duplication; the two
# numbers are equal when no row was flagged more than once.
count_before = len(remove_idx)
remove_idx = list(set(remove_idx))
count_after = len(remove_idx)
print('len of remove_idx:', count_before)
print('len of remove_idx:', count_after)

len of remove_idx: 10065
len of remove_idx: 10065


In [6]:
# Drop the flagged rows, then renumber so the index is a clean 0..n-1 range again.
# NOTE: remove_idx was computed against the unfiltered frame, so this cell must be
# run exactly once after the year check. Re-running it would drop other rows.
kept_rows = neg_df.drop(index=remove_idx)
neg_df = kept_rows.reset_index(drop=True)
neg_df

Unnamed: 0,title,pmid,doi,abstract,year,authors
0,The microbiota of Drosophila suzukii influence...,31763075,10.7717/peerj.8097,Microorganisms play a central role in the biol...,2019,"Hiruni Dodangoda, Rita Ntim-Gyakari, Peter D N..."
1,Evolution of a central neural circuit underlie...,29995860,10.1038/s41586-018-0322-9,Courtship rituals serve to reinforce reproduct...,2018,"David L Stern, Vanessa Ruta, Laura F Seeholzer..."
2,Immunity in Drosophila melanogaster--from micr...,25421701,10.1038/nri3763,Since the discovery of antimicrobial peptide r...,2014,"Sara Cherry, Nicolas Buchon, Neal Silverman"
3,Carnivory in the larvae of Drosophila melanoga...,30341324,10.1038/s41598-018-33906-w,Drosophila melanogaster is widely used as a mo...,2018,Daxiang Yang
4,Can Drosophila melanogaster tell who's who?,30356241,10.1371/journal.pone.0205043,Drosophila melanogaster are known to live in a...,2018,"Jonathan Schneider, Nihal Murali, Graham W Tay..."
...,...,...,...,...,...,...
83048,The ins and outs of EGFR asymmetry,20723751,10.1016/j.cell.2010.08.003,The epidermal growth factor receptor (EGFR) re...,2010,Daniel J Leahy
83049,Cloning of CDP-diacylglycerol synthase from a ...,8863531,10.1046/j.1471-4159.1996.67052200.x,A critical step in the supply of substrate for...,1996,"M D Uhler, A M Heacock, B W Agranoff"
83050,Acclimation and selection for increased resist...,8852846,,Direct selection for increased resistance to a...,1996,"R A Krebs, V Loeschcke"
83051,Synergistic effects on dopamine cell death in ...,25160001,10.1016/j.neuro.2014.08.005,The neurodegenerative effects of Parkinson's d...,2014,"Vanessa Nunez, David E Krantz, Khadij Assani, ..."


## Processing abstract

remove:

1. records whose PMID also appears in the positive data
2. records with an empty abstract (`abstract == ''`)
3. records whose abstract is the placeholder `'No abstract available'`

In [7]:
# Counters for the two rejection reasons, and the positions of the rows to drop.
pmid_err = 0
abst_err = 0
remove_list = []

# Set membership is O(1); scanning the pos_pmid list for every negative row is not.
pos_pmid_set = set(pos_pmid)

# Iterate over the two needed columns directly instead of building a row Series
# with neg_df.iloc[i] on every iteration.
rows = zip(neg_df['pmid'].tolist(), neg_df['abstract'].tolist())
for i, (pmid, abst) in enumerate(tqdm(rows, total=len(neg_df))):
    # The sample is actually a positive one.
    if pmid in pos_pmid_set:
        pmid_err += 1
        remove_list.append(i)
    # The abstract is missing or is the literal placeholder text.
    if (abst == 'No abstract available') or (abst == ''):
        abst_err += 1
        remove_list.append(i)

# A row can fail both checks, so de-duplicate before dropping and before counting.
remove_list = list(set(remove_list))
# Report the number of distinct rows removed. pmid_err + abst_err would count a
# row that fails both checks twice.
print('remove samples:', len(remove_list))
print('pmid error:', pmid_err)
print('abst error:', abst_err)

# Positions equal index labels here only while neg_df carries a fresh 0..n-1 index.
neg_df = neg_df.drop(remove_list).reset_index(drop=True)
neg_df.to_csv('../csv_data/neg_all-final.csv', index = False, encoding = 'utf-8')
neg_df

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=83053.0), HTML(value='')))


remove samples: 2790
pmid error: 699
abst error: 2091


Unnamed: 0,title,pmid,doi,abstract,year,authors
0,The microbiota of Drosophila suzukii influence...,31763075,10.7717/peerj.8097,Microorganisms play a central role in the biol...,2019,"Hiruni Dodangoda, Rita Ntim-Gyakari, Peter D N..."
1,Evolution of a central neural circuit underlie...,29995860,10.1038/s41586-018-0322-9,Courtship rituals serve to reinforce reproduct...,2018,"David L Stern, Vanessa Ruta, Laura F Seeholzer..."
2,Immunity in Drosophila melanogaster--from micr...,25421701,10.1038/nri3763,Since the discovery of antimicrobial peptide r...,2014,"Sara Cherry, Nicolas Buchon, Neal Silverman"
3,Carnivory in the larvae of Drosophila melanoga...,30341324,10.1038/s41598-018-33906-w,Drosophila melanogaster is widely used as a mo...,2018,Daxiang Yang
4,Can Drosophila melanogaster tell who's who?,30356241,10.1371/journal.pone.0205043,Drosophila melanogaster are known to live in a...,2018,"Jonathan Schneider, Nihal Murali, Graham W Tay..."
...,...,...,...,...,...,...
80258,The ins and outs of EGFR asymmetry,20723751,10.1016/j.cell.2010.08.003,The epidermal growth factor receptor (EGFR) re...,2010,Daniel J Leahy
80259,Cloning of CDP-diacylglycerol synthase from a ...,8863531,10.1046/j.1471-4159.1996.67052200.x,A critical step in the supply of substrate for...,1996,"M D Uhler, A M Heacock, B W Agranoff"
80260,Acclimation and selection for increased resist...,8852846,,Direct selection for increased resistance to a...,1996,"R A Krebs, V Loeschcke"
80261,Synergistic effects on dopamine cell death in ...,25160001,10.1016/j.neuro.2014.08.005,The neurodegenerative effects of Parkinson's d...,2014,"Vanessa Nunez, David E Krantz, Khadij Assani, ..."
