In [1]:
import pandas as pd
import numpy as np

In [2]:
def get_F1(df):
    #df = dataframe with label and preds = prediction column
    #returns F1, precision, recall
    TN = (df.loc[df.label == 'O'].preds.values == 'O').sum()
    FP = (df.loc[df.label == 'O'].preds.values == 'B-protein').sum()

    TP = (df.loc[df.label == 'B-protein'].preds.values == 'B-protein').sum()
    FN = (df.loc[df.label == 'B-protein'].preds.values == 'O').sum()


    precision = TP / (TP + FP)
    recall = TP / (TP + FN)

    F1 = 2 * precision * recall / (precision + recall)
    return(F1,precision,recall)

    

In [3]:
# run for validation
df = pd.read_csv('uebung5_training.iob', sep = '\t', names= ['token', 'label'],skip_blank_lines=False)
df['token'] = df.token.str.lower()
val_idx = int(len(df) * 0.3)
train = df[:-val_idx]
val = df[-val_idx:]

In [4]:
# run to build final dict
df = pd.read_csv('uebung5_training.iob', sep = '\t', names= ['token', 'label'])
df['token'] = df.token.str.lower()
train = df
val = df


In [5]:
#load gene dict, and stopwords
stopwords = pd.read_csv('english_stop_words.txt', sep = '\t', names= ['stopword'])
genenames = pd.read_csv('human-genenames.txt', sep = '\t', names= ['GenName'])

#create set of lower case entries
genes = set(genenames.GenName.str.lower().unique())
stopwords = set(stopwords.stopword.str.lower().unique())


In [6]:
#add training set labels to dict
genes = genes.union(set(train.loc[train.label == 'B-protein'].token))

In [7]:
preds = []

for i in range(len(val)):
    if val.token.iloc[i] in stopwords:
        preds.append('O')
    elif val.token.iloc[i] in genes:
        preds.append('B-protein')
    else:
        preds.append('O')
val['preds'] = preds        

In [9]:
F1,precision,recall, = get_F1(val)
F1,precision,recall

(0.7062795408507765, 0.5476439790575917, 0.9942965779467681)

In [10]:
# add word count column
val['count'] = val.groupby('token')['token'].transform('count')
# add column with true prediction counts
val['result'] = val.label == val.preds
val.result = val.result.astype('int')
val['true_count'] = val.groupby('token')['result'].transform('sum')
# add column with false prediction counts
val['false_count'] = val['count'] - val['true_count']


In [11]:
# remove tokens whith more FP than TP from dict

# subset where true value is 'O'
val_FP = val.loc[val.label == 'O']
# subset where prediction is 'B-protein'
val_FP = val_FP.loc[val.preds == 'B-protein']

FP_tokens = set(val_FP.token.loc[val_FP.true_count < val_FP.false_count])

genes = genes - FP_tokens

In [12]:
# new evaluation
preds = []

for i in range(len(val)):
    if val.token.iloc[i] in stopwords:
        preds.append('O')
    elif val.token.iloc[i] in genes:
        preds.append('B-protein')
    else:
        preds.append('O')
val['preds'] = preds
F1,precision,recall, = get_F1(val)
F1,precision,recall

In [14]:
# save new dict to file
genenames_extended = pd.DataFrame(list(genes))

genenames_extended.to_csv('genenames_extended.txt' ,header=None, index=None)

## Train with full dataset

In [34]:
# Add tokens from full training set to genes
INPUT_FILE = 'uebung5_test_sample_blind.iob'
INPUT_FILE = 'uebung5_training.iob'
OUTPUT_FILE = 'predictions.iob'
STOP_WORDS_FILE = 'english_stop_words.txt'
DICT_FILE = 'genenames_extended.txt'



In [35]:
#load gene dict, and stopwords
stopwords = pd.read_csv(STOP_WORDS_FILE, sep = '\t', names= ['stopword'])
genenames = pd.read_csv(DICT_FILE, sep = '\t', names= ['GenName'])

#create set of lower case entries
genes = set(genenames.GenName.str.lower().unique())
stopwords = set(stopwords.stopword.str.lower().unique())

In [19]:
#old
'''
test = pd.read_csv(INPUT_FILE, sep = '\t',header=None, names= ['token', 'label'] ,skip_blank_lines=False)
test['lower'] = test.token.str.lower()


preds = []

for i in range(len(test)):
    if test.lower[i] != test.lower[i]:
        preds.append(np.nan)
    elif test.lower.iloc[i] in stopwords:
        preds.append('O')
    elif test.lower.iloc[i] in genes:
        preds.append('B-protein')
    else:
        preds.append('O')
        
pred_df = pd.DataFrame({'token':test.token.values, 'preds':preds})

#pred_df.to_csv(OUTPUT_FILE, sep = '\t', header = None, index=None)

#write lines from df
f = open("predictions.iob", "a")
for i in range(len(pred_df)):
    if(test.lower[i] != test.lower[i]):
        f.write('\n')
    else:
        f.write(pred_df.token[i] + '\t' + pred_df.preds[i] + '\n')
f.close()

'''


In [36]:
with open(INPUT_FILE, 'r') as f:
    lines = f.readlines()
    
lines = [line.strip() for line in lines]
token = [str.split(line, '\t')[0] for line in lines]

In [37]:
f = open(OUTPUT_FILE, "w")
for tok in token:
    if(tok == ''):
        f.write('\n')
    elif(tok.lower() in stopwords):
        f.write(tok + '\t' + 'O' + '\n')
    elif(tok.lower() in genes):
        f.write(tok + '\t' + 'B-protein' +'\n')
    else:
        f.write(tok + '\t' + 'O' + '\n')
f.close()

## .py Code

In [None]:
import numpy as np
import pandas as pd
import sys

def main():
    INPUT_FILE = sys.argv[1]
    OUTPUT_FILE = sys.argv[2]
    STOP_WORDS_FILE = 'english_stop_words.txt'
    DICT_FILE = 'genenames_extended.txt'
    
    #load gene dict, and stopwords
    stopwords = pd.read_csv(STOP_WORDS_FILE, sep = '\t', names= ['stopword'])
    genenames = pd.read_csv(DICT_FILE, sep = '\t', names= ['GenName'])

    #create set of lower case entries
    genes = set(genenames.GenName.str.lower().unique())
    stopwords = set(stopwords.stopword.str.lower().unique())
    
    with open(INPUT_FILE, 'r') as f:
        lines = f.readlines()

    lines = [line.strip() for line in lines]
    token = [str.split(line, '\t')[0] for line in lines]
    
    
    f = open(OUTPUT_FILE, "w")
    for tok in token:
        if(tok == ''):
            f.write('\n')
        elif(tok.lower() in stopwords):
            f.write(tok + '\t' + 'O' + '\n')
        elif(tok.lower() in genes):
            f.write(tok + '\t' + 'B-protein' +'\n')
        else:
            f.write(tok + '\t' + 'O' + '\n')
    f.close()
    
    '''
        # load classification data
    test = pd.read_csv(INPUT_FILE, sep = '\t',header=None, names= ['token', 'label'], skip_blank_lines=False)
    test['lower'] = test.token.str.lower()
    
    # make predictions
    preds = []

    for i in range(len(test)):
        if test.lower[i] != test.lower[i]:
            preds.append(np.nan)
        elif test.lower.iloc[i] in stopwords:
            preds.append('O')
        elif test.lower.iloc[i] in genes:
            preds.append('B-protein')
        else:
            preds.append('O')
    
    pred_df = pd.DataFrame({'token':test.token.values, 'preds':preds})
    pred_df = pred_df.fillna('')
    pred_df.to_csv(OUTPUT_FILE, sep = '\t', header = None, index=None)
    '''

    
main()
    

## Evaluate

In [45]:
GOLD_STANDARD = 'uebung5_training.iob'
PREDICTION_FILE = 'predictions.iob'


with open(GOLD_STANDARD, 'r') as f:
    lines = f.readlines()
    
lines = [line.strip() for line in lines]
labels = [str.split(line, '\t')[1] if len(line) is not 0 else str.split(line, '\t')[0] for line in lines]
token = [str.split(line, '\t')[0] for line in lines]

with open(PREDICTION_FILE, 'r') as f:
    lines = f.readlines()

lines = [line.strip() for line in lines]
preds = [str.split(line, '\t')[1] if len(line) is not 0 else str.split(line, '\t')[0] for line in lines]

In [86]:
TN = {}
FP = {}

TP = {}
FN = {}

for i in range(len(labels)):
    
    if labels[i] == '':
        if preds[i] == '':
            next
        else:
            print("non matching empty line at row " + str(i))
            break
    elif labels[i] == 'O':
        if labels[i] == preds[i]:
            if token[i] in TN:
                TN.get(token[i]).append(i)
            else:
                TN[token[i]] = [i]
        else:
            if token[i] in FP:
                FP.get(token[i]).append(i)
            else:
                FP[token[i]] = [i]
                
    elif labels[i] == 'B-protein':
        if labels[i] == preds[i]:
            if token[i] in TP:
                TP.get(token[i]).append(i)
            else:
                TP[token[i]] = [i]
        else:
            if token[i] in FN:
                FN.get(token[i]).append(i)
            else:
                FN[token[i]] = [i]

In [100]:
TP_count = sum([len(value) for key, value in TP.items()])
FP_count = sum([len(value) for key, value in FP.items()])

TN_count = sum([len(value) for key, value in TN.items()])
FN_count = sum([len(value) for key, value in FN.items()])

TP_count,FP_count,TN_count,FN_count

(1512, 110, 173088, 66)

In [101]:
precision = TP_count / (TP_count + FP_count)
recall = TP_count / (TP_count + FN_count)

F1 = 2 * precision * recall / (precision + recall)

F1, precision, recall

(0.9450000000000001, 0.9321824907521579, 0.9581749049429658)

In [117]:
for key, value in FN.items():
    line = [key]
    ids = [str(i) for i in value]
    line.extend(ids)
    print("FP   " + ', '.join(line))
    
for key, value in FN.items():
    line = [key]
    ids = [str(i) for i in value]
    line.extend(ids)
    print("FP   " + ', '.join(line))
    
print('\n')
print('True Positives     ' + str(TP_count))
print('False Positives     ' + str(FP_count))
print('True Negatives     ' + str(TN_count))
print('Precision     ' + str(precision))
print('Recall     ' + str(recall))
print('F1 Score     ' + str(F1))

FP   R, 22853, 141060
FP   AF, 30414, 30446
FP   porphyrin, 37792, 37796
FP   p50, 40994
FP   b6, 41022
FP   Shc, 49200
FP   rev, 52473
FP   W, 52487
FP   Y, 52489
FP   P1, 52948
FP   P2, 52950
FP   PR, 55545, 146765, 146789
FP   D1, 55986
FP   Rev, 60007
FP   Cas, 68112
FP   Q, 75218
FP   CAT, 78031, 127232
FP   L1, 82553, 138422
FP   HA, 84069
FP   PTH, 84146, 87257, 103011
FP   IF, 86551
FP   AP, 87270
FP   AFP, 87789
FP   F1, 93465
FP   Ro, 94804
FP   La, 94806
FP   glucagon, 97063
FP   CAP, 97453
FP   TR, 98261
FP   T, 99155
FP   SAP, 103360
FP   PRC, 115744
FP   CBF, 118223
FP   PML, 118629
FP   RT, 118682
FP   R1, 124995
FP   R2, 124997
FP   E2, 137415, 181282
FP   HD, 137460
FP   Max, 137583
FP   Z, 141058
FP   endonuclease, 155262
FP   A, 157570
FP   GAP, 157919
FP   cytokine, 161773
FP   LT, 162892
FP   P3, 163657
FP   tip, 163856
FP   gpI, 171204, 171218
FP   en, 174070
FP   E1, 176076, 176084, 180418, 181280
FP   carboxyhemoglobin, 179584
FP   R, 22853, 141060
FP   AF, 3041

### uebung5-eval.py

In [None]:
import sys

def main():
    
    GOLD_STANDARD = sys.argv[1]
    PREDICTION_FILE = sys.argv[2]

    # read true labels, tokens and predictions

    with open(GOLD_STANDARD, 'r') as f:
        lines = f.readlines()

    lines = [line.strip() for line in lines]
    labels = [str.split(line, '\t')[1] if len(line) is not 0 else str.split(line, '\t')[0] for line in lines]
    token = [str.split(line, '\t')[0] for line in lines]

    with open(PREDICTION_FILE, 'r') as f:
        lines = f.readlines()

    lines = [line.strip() for line in lines]
    preds = [str.split(line, '\t')[1] if len(line) is not 0 else str.split(line, '\t')[0] for line in lines]

    # create dictionary of TP,FP, TN,TP with tokens and ids

    TN = {}
    FP = {}

    TP = {}
    FN = {}

    for i in range(len(labels)):

        if labels[i] == '':
            if preds[i] == '':
                next
            else:
                print("non matching empty line at row " + str(i))
                break
        elif labels[i] == 'O':
            if labels[i] == preds[i]:
                if token[i] in TN:
                    TN.get(token[i]).append(i)
                else:
                    TN[token[i]] = [i]
            else:
                if token[i] in FP:
                    FP.get(token[i]).append(i)
                else:
                    FP[token[i]] = [i]

        elif labels[i] == 'B-protein':
            if labels[i] == preds[i]:
                if token[i] in TP:
                    TP.get(token[i]).append(i)
                else:
                    TP[token[i]] = [i]
            else:
                if token[i] in FN:
                    FN.get(token[i]).append(i)
                else:
                    FN[token[i]] = [i]


    # count class lengths
    TP_count = sum([len(value) for key, value in TP.items()])
    FP_count = sum([len(value) for key, value in FP.items()])

    TN_count = sum([len(value) for key, value in TN.items()])
    FN_count = sum([len(value) for key, value in FN.items()])

    # calculate results

    precision = TP_count / (TP_count + FP_count)
    recall = TP_count / (TP_count + FN_count)

    F1 = 2 * precision * recall / (precision + recall)

    # print results

    for key, value in FN.items():
        line = [key]
        ids = [str(i) for i in value]
        line.extend(ids)
        print("FP   " + ', '.join(line))
    
    print('\n')

    for key, value in FN.items():
        line = [key]
        ids = [str(i) for i in value]
        line.extend(ids)
        print("FP   " + ', '.join(line))

    print('\n')
    print('True Positives     ' + str(TP_count))
    print('False Positives     ' + str(FP_count))
    print('False Negatives     ' + str(FN_count))
    print('Precision     ' + str(precision))
    print('Recall     ' + str(recall))
    print('F1 Score     ' + str(F1))

main()