In [1]:
import pandas as pd
import numpy as np

In [7]:
def get_F1(df):
    """Compute F1, precision and recall for the 'B-protein' tag.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``label`` column (gold tags) and a ``preds`` column
        (predicted tags). Only rows whose tags are 'O' or 'B-protein'
        contribute to the counts.

    Returns
    -------
    tuple
        ``(F1, precision, recall)`` as floats. A score whose denominator is
        zero (e.g. nothing was predicted as 'B-protein') is reported as 0.0
        instead of NaN.
    """
    gold_protein = df.label == 'B-protein'
    gold_outside = df.label == 'O'
    pred_protein = df.preds == 'B-protein'
    pred_outside = df.preds == 'O'

    # Plain Python ints so that a zero denominator can be tested explicitly
    # rather than producing NaN with a numpy runtime warning.
    TP = int((gold_protein & pred_protein).sum())
    FP = int((gold_outside & pred_protein).sum())
    FN = int((gold_protein & pred_outside).sum())

    precision = TP / (TP + FP) if (TP + FP) else 0.0
    recall = TP / (TP + FN) if (TP + FN) else 0.0

    if precision + recall:
        F1 = 2 * precision * recall / (precision + recall)
    else:
        F1 = 0.0
    return (F1, precision, recall)

    

In [3]:
# run for validation
# Load the annotated corpus, lower-case the tokens and hold out the last 30 %
# of the rows as validation data.
df = pd.read_csv('uebung5_training.iob', sep = '\t', names= ['token', 'label'])
df['token'] = df.token.str.lower()
val_idx = int(len(df) * 0.3)

# Split by absolute position: with val_idx == 0 the slices df[:-0] / df[-0:]
# would swap roles (empty train, full val).
split_at = len(df) - val_idx

# .copy() gives train/val their own data, so adding columns to `val` later
# (e.g. val['preds']) does not raise SettingWithCopyWarning.
train = df.iloc[:split_at].copy()
val = df.iloc[split_at:].copy()

In [2]:
# run to build final dict
# Whole corpus with lower-cased tokens; `train` and `val` are both just
# other names for this one frame.
df = pd.read_csv(
    'uebung5_training.iob',
    sep='\t',
    names=['token', 'label'],
).assign(token=lambda frame: frame['token'].str.lower())
train = val = df


In [3]:
# Read the stop-word list and the gene-name dictionary, each into a single
# named column.
stopwords = pd.read_csv('english_stop_words.txt', sep='\t', names=['stopword'])
genenames = pd.read_csv('human-genenames.txt', sep='\t', names=['GenName'])

# Reduce both tables to sets of distinct lower-cased entries for membership tests.
genes = {*genenames['GenName'].str.lower().unique()}
stopwords = {*stopwords['stopword'].str.lower().unique()}


In [4]:
# Extend the dictionary with every training token labelled 'B-protein'.
protein_rows = train.label == 'B-protein'
genes = genes | set(train.loc[protein_rows, 'token'])

In [5]:
# Dictionary lookup: a token is tagged 'B-protein' only when it is in the
# gene dictionary and is not a stop word; everything else is 'O'.
preds = [
    'B-protein' if (tok not in stopwords and tok in genes) else 'O'
    for tok in val.token
]
val['preds'] = preds

In [8]:
# Score the predictions; the bare tuple on the last line is the cell's output.
F1, precision, recall = get_F1(val)
(F1, precision, recall)

(0.7062795408507765, 0.5476439790575917, 0.9942965779467681)

In [10]:
# Number of rows in `val` that share this row's token.
val['count'] = val.groupby('token')['token'].transform('count')

# 1 where the prediction equals the gold label, 0 otherwise.
val['result'] = (val.label == val.preds).astype('int')

# Per token: how many of its rows were predicted correctly ...
val['true_count'] = val.groupby('token')['result'].transform('sum')
# ... and how many were not.
val['false_count'] = val['count'] - val['true_count']


In [11]:
# Remove from the dictionary the tokens that are wrong more often than right.
# False positives: gold label 'O' but predicted 'B-protein'.
val_FP = val.loc[(val.label == 'O') & (val.preds == 'B-protein')]

# Among those rows, keep the tokens whose false_count exceeds their true_count.
more_wrong_than_right = val_FP.true_count < val_FP.false_count
FP_tokens = set(val_FP.loc[more_wrong_than_right, 'token'])

genes = genes.difference(FP_tokens)

In [12]:
# Re-run the dictionary lookup with the current `genes` set: 'B-protein' for
# tokens in the dictionary that are not stop words, 'O' otherwise.
preds = [
    'B-protein' if (tok not in stopwords and tok in genes) else 'O'
    for tok in val.token
]
val['preds'] = preds

In [13]:
# Score the refreshed predictions; the bare tuple is the cell's output.
F1, precision, recall = get_F1(val)
(F1, precision, recall)

(0.9429373246024321, 0.9281767955801105, 0.9581749049429658)

In [21]:
# Show the dictionary entries as a one-column table.
pd.DataFrame([*genes])

Unnamed: 0,0
0,
1,rnanc
2,c10orf134
3,dri42
4,sp7
5,rps17b
6,otthump00000059199
7,dj336k20b.2
8,znf427
9,mgc21297


In [23]:
# One-column frame holding every entry of the `genes` set.
genenames_extended = pd.DataFrame([*genes])

In [25]:
# Persist the dictionary without a header row and without the index column.
genenames_extended.to_csv('genenames_extended.txt', header=False, index=False)

## Train with full dataset

In [38]:
# File locations for the prediction run: the blind input to tag, the file the
# predictions are written to, the stop-word list and the extended gene dictionary.
INPUT_FILE = 'uebung5_test_sample_blind.iob'
OUTPUT_FILE = 'predictions.iob'
STOP_WORDS_FILE = 'english_stop_words.txt'
DICT_FILE = 'genenames_extended.txt'



In [39]:
# Read the stop-word list and the gene dictionary from the configured paths,
# each into a single named column.
stopwords = pd.read_csv(STOP_WORDS_FILE, sep='\t', names=['stopword'])
genenames = pd.read_csv(DICT_FILE, sep='\t', names=['GenName'])

# Reduce both tables to sets of distinct lower-cased entries for membership tests.
genes = {*genenames['GenName'].str.lower().unique()}
stopwords = {*stopwords['stopword'].str.lower().unique()}

In [36]:
# Read the file to tag as plain text columns.
# quoting=3 is csv.QUOTE_NONE: a token that is a double quote must not open a
# quoted field (which would swallow the following lines).
# keep_default_na=False keeps tokens such as "null" or "NA" as strings instead
# of turning them into NaN; dtype=str stops numeric-looking tokens being converted.
test = pd.read_csv(
    INPUT_FILE,
    sep = '\t',
    header=None,
    names= ['token', 'label'],
    dtype=str,
    quoting=3,
    keep_default_na=False,
)
# Lower-cased copy of the token used for lookup; `token` keeps the original casing.
test['lower'] = test.token.str.lower()

In [46]:
# Peek at the first five rows of the loaded file.
test.head(n=5)

Unnamed: 0,token,label,lower
0,Two,O,two
1,new,O,new
2,glucosidase,O,glucosidase
3,inhibitors,O,inhibitors
4,(,O,(


In [45]:
# Dictionary lookup on the lower-cased tokens: 'B-protein' for tokens in the
# gene dictionary that are not stop words, 'O' for everything else.
preds = [
    'B-protein' if (tok not in stopwords and tok in genes) else 'O'
    for tok in test['lower']
]

In [56]:
# Pair each original (not lower-cased) token with its predicted tag.
pred_df = pd.DataFrame({'token': test['token'].to_numpy(), 'preds': preds})

In [59]:
# Write token<TAB>tag rows, without a header row or index column.
pred_df.to_csv(OUTPUT_FILE, sep='\t', header=False, index=False)

## Standalone `.py` script

In [None]:
import numpy as np
import pandas as pd

def main():
    """Tag every token of an IOB file by dictionary lookup and write the result.

    Command line: ``<script> INPUT_FILE OUTPUT_FILE``.

    INPUT_FILE is read as tab-separated ``token<TAB>label`` rows. A token is
    tagged 'B-protein' when its lower-cased form is in the gene dictionary
    ('genenames_extended.txt') and not in the stop-word list
    ('english_stop_words.txt'); every other token is tagged 'O'.
    OUTPUT_FILE receives one ``token<TAB>tag`` row per input row, with the
    token in its original casing.

    Exits with a usage message when fewer than two arguments are given.
    """
    # Imported here because the surrounding cell only imports numpy and pandas;
    # without this, sys.argv raises NameError.
    import csv
    import sys

    if len(sys.argv) < 3:
        sys.exit('usage: %s INPUT_FILE OUTPUT_FILE' % sys.argv[0])

    INPUT_FILE = sys.argv[1]
    OUTPUT_FILE = sys.argv[2]
    STOP_WORDS_FILE = 'english_stop_words.txt'
    DICT_FILE = 'genenames_extended.txt'

    #load gene dict, and stopwords
    stopword_table = pd.read_csv(STOP_WORDS_FILE, sep = '\t', names= ['stopword'])
    genenames = pd.read_csv(DICT_FILE, sep = '\t', names= ['GenName'])

    #create set of lower case entries
    genes = set(genenames.GenName.str.lower().unique())
    stopwords = set(stopword_table.stopword.str.lower().unique())

    # load classification data
    # QUOTE_NONE: a token that is a double quote must not open a quoted field.
    # keep_default_na=False / dtype=str: tokens such as "null", "NA" or "1"
    # stay literal strings instead of becoming NaN or numbers.
    test = pd.read_csv(
        INPUT_FILE,
        sep = '\t',
        header=None,
        names= ['token', 'label'],
        dtype=str,
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
    )
    test['lower'] = test.token.str.lower()

    # make predictions: stop words are never proteins, dictionary hits are
    preds = [
        'B-protein' if (tok not in stopwords and tok in genes) else 'O'
        for tok in test['lower']
    ]

    # Write the rows directly so tokens are emitted exactly as read
    # (to_csv would wrap a token containing a double quote in quotes).
    with open(OUTPUT_FILE, 'w', encoding='utf-8') as out:
        for token, tag in zip(test.token, preds):
            out.write(token + '\t' + tag + '\n')


# Run only when executed as a script (or as a notebook cell), not on import.
if __name__ == '__main__':
    main()
    