# split_pos

Split the positive data into training, validation, and test sets.

The split ratio is 8:1:1 (training : validation : test).

## Load data

In [67]:
import numpy as np
import random 
import pandas as pd

# Seed Python's `random` module and NumPy's global generator with the same
# value.
seed_val = 0
for seed_fn in (random.seed, np.random.seed):
    seed_fn(seed_val)

In [68]:
# header=None makes pandas treat the first line of the file as data, so the
# column names are assigned explicitly afterwards. dtype=str reads every
# column as a string, and missing values are replaced by empty strings.
csv_path = '../csv_data/output-pos-remove_no_abst.csv'
column_names = ['pmid','title','abstract','doi','year','authors']

df = pd.read_csv(csv_path, encoding='utf-8', header=None, dtype=str)
df = df.fillna('')
df.columns = column_names
df

Unnamed: 0,pmid,title,abstract,doi,year,authors
0,10028969,Frizzled regulation of Notch signalling polari...,"The Drosophila eye, a paradigm for epithelial ...",10.1038/17395,1999,"M T Cooper, S J Bray"
1,10101114,Cis-regulatory elements of the mitotic regulat...,Mitosis in most Drosophila cells is triggered ...,,1999,"D A Lehman, B Patterson, L A Johnston, T Balze..."
2,10198247,Identification of a novel cis-acting positive ...,The Drosophila NK-1 homeobox gene belongs to t...,10.1006/bbrc.1999.0501,1999,"S J Kim, T Park, K K Lee"
3,10198632,"twin of eyeless, a second Pax-6 gene of Drosop...",The Drosophila Pax-6 gene eyeless (ey) plays a...,10.1016/s1097-2765(00)80457-8,1999,"T Czerny, G Halder, U Kloter, A Souabni, W J G..."
4,10207149,Direct regulatory interaction of the eyeless p...,The Pax-6 gene encodes a transcription factor ...,,1999,"T Niimi, M Seimiya, U Kloter, S Flister, W J G..."
...,...,...,...,...,...,...
894,9847251,Regulation of Pax6 expression is conserved bet...,Pax6 plays a key role in visual system develop...,,1999,"P X Xu, X Zhang, S Heaney, A Yoon, A M Michels..."
895,9858707,Transcriptional control of Drosophila bicoid b...,Concentration of maternal BICOID (BCD) establi...,10.1016/s0925-4773(98)00159-2,1998,"C Ruez, F Payre, A Vincent"
896,9882489,Drosophila mef2 expression during mesoderm dev...,"The function of the Drosophila mef2 gene, a me...",10.1006/dbio.1998.9081,1998,"H T Nguyen, X Xu"
897,9892357,A LIM-homeodomain combinatorial code for motor...,Different classes of vertebrate motor neuron t...,10.1038/16275,1999,"S Thor, S G Andersson, A Tomlinson, J B Thomas"


## Shuffle

Shuffle the DataFrame with the `sample()` method (`frac=1` returns every row, in random order).

In [69]:
# Shuffle every row (frac=1) reproducibly. The seed comes from `seed_val`
# instead of a second hard-coded 0, so the notebook has a single seed to change.
sh_df = df.sample(frac=1, random_state=seed_val)
sh_df

Unnamed: 0,pmid,title,abstract,doi,year,authors
492,22115527,Erroneous attribution of relevant transcriptio...,Background:\n\n\n\n\n Cis-regulatory modu...,10.1186/1471-2164-12-578,2011,"Marc S Halfon, Qianqian Zhu, Elizabeth R Brenn..."
141,1327756,Regulation of even-skipped stripe 2 in the Dro...,In an effort to determine how crude gradients ...,,1992,"S Small, A Blair, M Levine"
409,20213139,The winged-helix transcription factor JUMU reg...,The PEV-modifying winged-helix/forkhead domain...,10.1007/s10577-010-9118-y,2010,"Annemarie Hofmann, Madeleine Brünner, Alexande..."
31,10660673,Cardiac enhancer activity of the homeobox gene...,The Drosophila homeobox gene tinman plays a cr...,,2000,"T V Venkatesh, M Park, K Ocorr, J Nemaceck, K ..."
570,2478402,Dorsal expression of the Drosophila z600 gene ...,The Drosophila z600 gene is a member of an ove...,10.1016/0012-1606(89)90143-7,1989,"R A Schulz, J L Miksch"
...,...,...,...,...,...,...
835,8660878,Disperse versus compact elements for the regul...,The segmented body pattern of the Drosophila e...,10.1006/dbio.1996.0146,1996,"M Klingler, J Soong, B Butler, J P Gergen"
192,15479229,Expression of the Drosophila melanogaster ATP ...,Mitochondrial biogenesis is a complex and high...,10.1111/j.1432-1033.2004.04336.x,2004,"Ana Talamillo, Miguel Angel Fernández-Moreno, ..."
629,26204530,Genome-Wide Mapping of Collier In Vivo Binding...,"Collier, the single Drosophila COE (Collier/EB...",10.1371/journal.pone.0133387,2015,"Mathilde de Taffin, Yannick Carrier, Laurence ..."
559,24496624,Machine learning classification of cell-specif...,The Drosophila heart is composed of two distin...,10.1242/dev.101709,2014,"Shaad M Ahmad, Brian W Busser, Di Huang, Eliza..."


## Output 

In [70]:
# Validation and test each take 10% of the rows (rounded); training takes
# whatever is left, so the three sizes always add up to the full length.
n_total = len(sh_df)
num_valid = round(n_total * 0.1)
num_test = round(n_total * 0.1)
num_train = n_total - (num_valid + num_test)

for label, size in (
    ('num_train:', num_train),
    ('num_valid:', num_valid),
    ('num_test:', num_test),
):
    print(label, size)
print()
print('check sum:', num_train + num_valid + num_test)

num_train: 719
num_valid: 90
num_test: 90

check sum: 899


In [71]:
def rmv_blank(df:pd.DataFrame) -> None:
    """
    Collapse redundant whitespace in the 'abstract' column, in place.

    Each abstract is split on runs of whitespace (spaces, tabs, newlines)
    and re-joined with single spaces, which also strips leading and
    trailing whitespace.

    The column is rewritten with a single whole-column assignment. A
    row-by-row ``df.iloc[idx]['abstract'] = ...`` is chained indexing:
    pandas may return a temporary copy of the row, in which case the write
    is silently lost.

    Args:
        df(DataFrame): frame with an 'abstract' column of strings;
            modified in place.

    Returns:
        None
    """
    # Nothing to clean in a frame without rows (or without columns).
    if df.empty:
        return
    df['abstract'] = df['abstract'].map(lambda text: ' '.join(text.split()))

In [72]:
# Training split: the first `num_train` rows of the shuffled frame.
# .copy() gives the split its own data, so rmv_blank() modifies train_df
# only, instead of working on a slice of sh_df.
train_df = sh_df.iloc[:num_train].copy()
rmv_blank(train_df)
# NOTE(review): the input is read from '../csv_data' but this writes to
# './csv_data' — confirm the output directory is the intended one.
train_df.to_csv('./csv_data/train.csv', index=False, encoding='utf-8')
train_df

Unnamed: 0,pmid,title,abstract,doi,year,authors
492,22115527,Erroneous attribution of relevant transcriptio...,Background: Cis-regulatory modules are bound b...,10.1186/1471-2164-12-578,2011,"Marc S Halfon, Qianqian Zhu, Elizabeth R Brenn..."
141,1327756,Regulation of even-skipped stripe 2 in the Dro...,In an effort to determine how crude gradients ...,,1992,"S Small, A Blair, M Levine"
409,20213139,The winged-helix transcription factor JUMU reg...,The PEV-modifying winged-helix/forkhead domain...,10.1007/s10577-010-9118-y,2010,"Annemarie Hofmann, Madeleine Brünner, Alexande..."
31,10660673,Cardiac enhancer activity of the homeobox gene...,The Drosophila homeobox gene tinman plays a cr...,,2000,"T V Venkatesh, M Park, K Ocorr, J Nemaceck, K ..."
570,2478402,Dorsal expression of the Drosophila z600 gene ...,The Drosophila z600 gene is a member of an ove...,10.1016/0012-1606(89)90143-7,1989,"R A Schulz, J L Miksch"
...,...,...,...,...,...,...
117,12537575,Assessing the impact of comparative genomic se...,Background: It is widely accepted that compara...,10.1186/gb-2002-3-12-research0086,2002,"Casey M Bergman, Barret D Pfeiffer, Diego E Ri..."
464,21430782,A cis-regulatory map of the Drosophila genome,Systematic annotation of gene regulatory eleme...,10.1038/nature09990,2011,"Nicolas Nègre, Christopher D Brown, Lijia Ma, ..."
25,10619432,A pdf neuropeptide gene mutation and ablation ...,The mechanisms by which circadian pacemaker sy...,10.1016/s0092-8674(00)81676-1,1999,"S C Renn, J H Park, M Rosbash, J C Hall, P H T..."
110,12421707,The Drosophila Pox neuro gene: control of male...,We have dissected the entire cis-regulatory re...,10.1242/dev.00157,2002,"Werner Boll, Markus Noll"


In [73]:
# Validation split: the `num_valid` rows that follow the training rows.
# .copy() gives the split its own data, so rmv_blank() modifies valid_df
# only, instead of working on a slice of sh_df.
valid_start = num_train
valid_df = sh_df.iloc[valid_start:valid_start + num_valid].copy()
rmv_blank(valid_df)
valid_df.to_csv('./csv_data/valid.csv', index=False, encoding='utf-8')
valid_df

Unnamed: 0,pmid,title,abstract,doi,year,authors
152,14507783,Tgfbeta signaling acts on a Hox response eleme...,Hox proteins play fundamental roles in generat...,10.1242/dev.00760,2003,"Aurélie Grienenberger, Samir Merabet, John Man..."
528,2328832,Cooperative enhancement at the Drosophila Sgs-...,The Drosophila glue gene Sgs-3 is specifically...,10.1016/0012-1606(90)90283-o,1990,"M Roark, K V Raghavan, T Todo, C A Mayeda, E M..."
696,2975615,Analysis of the promoter of the Rh2 opsin gene...,We have analyzed the cis-acting regulatory seq...,,1988,"D Mismer, W M Michael, T R Laverty, G M Rubin"
621,25835988,The evolutionary origination and diversificati...,The origination and diversification of morphol...,10.1371/journal.pgen.1005136,2015,"Eric M Camino, John C Butts, Alison Ordway, Jo..."
461,21383317,High resolution mapping of Twist to DNA in Dro...,Cis-regulatory modules (CRMs) function by bind...,10.1101/gr.104018.109,2011,"Anil Ozdemir, Katherine I Fisher-Aylor, Shirle..."
...,...,...,...,...,...,...
143,1356761,Sharp anterior boundary of homeotic gene expre...,Parasegmental boundaries in the Drosophila emb...,,1992,"J Müller, M Bienz"
180,15314643,Cellular immune response to parasitization in ...,Drosophila immune response involves three type...,10.1371/journal.pbio.0020196,2004,"Michèle Crozatier, Jean-Michel Ubeda, Alain Vi..."
131,12758126,Transcription control of a gene for Drosophila...,A DNA replication-related element (DRE)-bindin...,10.1016/s0378-1119(03)00493-1,2003,"Eunjeong Kwon, Hirokazu Seto, Fumiko Hirose, N..."
861,9250684,Mechanism and Bicoid-dependent control of hair...,Pair-rule gene hairy (h) expression in seven e...,10.1093/emboj/16.14.4403,1997,"A La Rosée, T Häder, H Taubert, R Rivera-Pomar..."


In [74]:
# Test split: the `num_test` rows that follow the validation rows.
# The upper bound uses num_test (not a second num_valid), so the slice stays
# correct if the validation and test sizes ever differ.
# .copy() gives the split its own data, so rmv_blank() modifies test_df
# only, instead of working on a slice of sh_df.
test_start = num_train + num_valid
test_df = sh_df.iloc[test_start:test_start + num_test].copy()
rmv_blank(test_df)
test_df.to_csv('./csv_data/test.csv', index=False, encoding='utf-8')
test_df

Unnamed: 0,pmid,title,abstract,doi,year,authors
774,7821226,Regulation of a decapentaplegic midgut enhance...,The clustered homeotic genes encode transcript...,,1994,"J R Manak, L D Mathies, M P Scott"
324,18234213,dFOXO regulates transcription of a Drosophila ...,Insulin resistance is a major feature of patho...,10.1016/j.jmb.2007.12.042,2008,"Terhi Vihervaara, Oscar Puig"
203,15737936,Genetic programs activated by proneural protei...,Neurogenesis depends on a family of proneural ...,10.1016/j.devcel.2005.01.020,2005,"Nick Reeves, James W Posakony"
84,11752402,Regulatory DNA required for vnd/NK-2 homeobox ...,Vnd/NK-2 protein was detected in 11 neuroblast...,10.1073/pnas.012584599,2002,"Xiaoping Shao, Keita Koizumi, Neil Nosworthy, ..."
633,26369287,The fatty acid elongase Bond is essential for ...,Insects use a spectacular variety of chemical ...,10.1038/ncomms9263,2015,"Wan Chin Ng, Jacqueline S R Chin, Kah Junn Tan..."
...,...,...,...,...,...,...
835,8660878,Disperse versus compact elements for the regul...,The segmented body pattern of the Drosophila e...,10.1006/dbio.1996.0146,1996,"M Klingler, J Soong, B Butler, J P Gergen"
192,15479229,Expression of the Drosophila melanogaster ATP ...,Mitochondrial biogenesis is a complex and high...,10.1111/j.1432-1033.2004.04336.x,2004,"Ana Talamillo, Miguel Angel Fernández-Moreno, ..."
629,26204530,Genome-Wide Mapping of Collier In Vivo Binding...,"Collier, the single Drosophila COE (Collier/EB...",10.1371/journal.pone.0133387,2015,"Mathilde de Taffin, Yannick Carrier, Laurence ..."
559,24496624,Machine learning classification of cell-specif...,The Drosophila heart is composed of two distin...,10.1242/dev.101709,2014,"Shaad M Ahmad, Brian W Busser, Di Huang, Eliza..."
