In [19]:
# ====================================================
# Directory settings
# ====================================================
import os
from pathlib import Path

# Raw inputs and generated artifacts live in sibling folders of the working
# directory; both folders are created up front when they do not exist yet.
INPUT_DIR = Path("../input")
OUTPUT_DIR = Path('../output')
for directory in (INPUT_DIR, OUTPUT_DIR):
    directory.mkdir(parents=True, exist_ok=True)

In [20]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

# os.system('pip uninstall -y transformers')
# os.system('python -m pip install --no-index --find-links=../input/nbme-pip-wheels transformers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

tokenizers.__version__: 0.12.1
transformers.__version__: 4.18.0


In [21]:
# ====================================================
# Data Loading
# ====================================================
# Read the train, test and feature tables from INPUT_DIR.
train, test, features = (
    pd.read_csv(INPUT_DIR / file_name)
    for file_name in ('train.csv', 'test.csv', 'features.csv')
)
# The `annotation` and `location` columns are read as strings; parse each
# value with ast.literal_eval so the cells hold Python objects instead.
for literal_column in ('annotation', 'location'):
    train[literal_column] = train[literal_column].apply(ast.literal_eval)
def preprocess_features(features):
    """Overwrite the feature text stored at row label 27 and return the frame.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature table with a ``feature_text`` column.

    Returns
    -------
    pandas.DataFrame
        The ``features`` object that was passed in.
    """
    row_label, column_name = 27, 'feature_text'
    corrected_text = "Last-Pap-smear-1-year-ago"
    features.loc[row_label, column_name] = corrected_text
    return features
# Apply the feature-text correction, then read the patient notes table.
features = preprocess_features(features)
patient_notes = pd.read_csv(INPUT_DIR / 'patient_notes.csv')

# Report the (rows, columns) shape of each loaded table, one per line.
print(
    f"train.shape: {train.shape}",
    f"features.shape: {features.shape}",
    f"patient_notes.shape: {patient_notes.shape}",
    sep="\n",
)

train.shape: (14300, 6)
features.shape: (143, 3)
patient_notes.shape: (42146, 3)


In [22]:
# Attach the feature text and the patient note to every train/test row.
# Both joins are left joins, so every original row is kept.
feature_keys = ['feature_num', 'case_num']
note_keys = ['pn_num', 'case_num']
train = (
    train
    .merge(features, on=feature_keys, how='left')
    .merge(patient_notes, on=note_keys, how='left')
)
test = (
    test
    .merge(features, on=feature_keys, how='left')
    .merge(patient_notes, on=note_keys, how='left')
)
display(train.head())

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,feature_text,pn_history
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],Family-history-of-MI-OR-Family-history-of-myoc...,HPI: 17yo M presents with palpitations. Patien...
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693],Family-history-of-thyroid-disorder,HPI: 17yo M presents with palpitations. Patien...
2,00016_002,0,16,2,[chest pressure],[203 217],Chest-pressure,HPI: 17yo M presents with palpitations. Patien...
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]",Intermittent-symptoms,HPI: 17yo M presents with palpitations. Patien...
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258],Lightheaded,HPI: 17yo M presents with palpitations. Patien...


In [23]:
# incorrect annotation
# Manual corrections for rows whose annotation/location need to be replaced.
# Each entry is (row label in `train`, annotation literal, location literal);
# both literals are parsed with ast.literal_eval and written back through
# `train.loc`, annotation first and location second, in the order listed.
annotation_fixes = [
    (338, '[["father heart attack"]]', '[["764 783"]]'),
    (621, '[["for the last 2-3 months"]]', '[["77 100"]]'),
    (655, '[["no heat intolerance"], ["no cold intolerance"]]', '[["285 292;301 312"], ["285 287;296 312"]]'),
    (1262, '[["mother thyroid problem"]]', '[["551 557;565 580"]]'),
    (1265, '[[\'felt like he was going to "pass out"\']]', '[["131 135;181 212"]]'),
    (1396, '[["stool , with no blood"]]', '[["259 280"]]'),
    (1591, '[["diarrhoe non blooody"]]', '[["176 184;201 212"]]'),
    (1615, '[["diarrhea for last 2-3 days"]]', '[["249 257;271 288"]]'),
    (1664, '[["no vaginal discharge"]]', '[["822 824;907 924"]]'),
    (1714, '[["started about 8-10 hours ago"]]', '[["101 129"]]'),
    (1929, '[["no blood in the stool"]]', '[["531 539;549 561"]]'),
    (2134, '[["last sexually active 9 months ago"]]', '[["540 560;581 593"]]'),
    (2191, '[["right lower quadrant pain"]]', '[["32 57"]]'),
    (2553, '[["diarrhoea no blood"]]', '[["308 317;376 384"]]'),
    (3124, '[["sweating"]]', '[["549 557"]]'),
    (3858, '[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]', '[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]'),
    (4373, '[["for 2 months"]]', '[["33 45"]]'),
    (4763, '[["35 year old"]]', '[["5 16"]]'),
    (4782, '[["darker brown stools"]]', '[["175 194"]]'),
    (4908, '[["uncle with peptic ulcer"]]', '[["700 723"]]'),
    (6016, '[["difficulty falling asleep"]]', '[["225 250"]]'),
    (6192, '[["helps to take care of aging mother and in-laws"]]', '[["197 218;236 260"]]'),
    (6380, '[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]', '[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]'),
    (6562, '[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]', '[["290 320;327 337"], ["290 320;342 358"]]'),
    (6862, '[["stressor taking care of many sick family members"]]', '[["288 296;324 363"]]'),
    (7022, '[["heart started racing and felt numbness for the 1st time in her finger tips"]]', '[["108 182"]]'),
    (7422, '[["first started 5 yrs"]]', '[["102 121"]]'),
    (8876, '[["No shortness of breath"]]', '[["481 483;533 552"]]'),
    (9027, '[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]', '[["92 102"], ["123 164"]]'),
    (9938, '[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]', '[["89 117"], ["122 138"], ["368 402"]]'),
    (9973, '[["gaining 10-15 lbs"]]', '[["344 361"]]'),
    (10513, '[["weight gain"], ["gain of 10-16lbs"]]', '[["600 611"], ["607 623"]]'),
    (11551, '[["seeing her son knows are not real"]]', '[["386 400;443 461"]]'),
    (11677, '[["saw him once in the kitchen after he died"]]', '[["160 201"]]'),
    (12124, '[["tried Ambien but it didnt work"]]', '[["325 337;349 366"]]'),
    (12279, '[["heard what she described as a party later than evening these things did not actually happen"]]', '[["405 459;488 524"]]'),
    (12289, '[["experienced seeing her son at the kitchen table these things did not actually happen"]]', '[["353 400;488 524"]]'),
    (13238, '[["SCRACHY THROAT"], ["RUNNY NOSE"]]', '[["293 307"], ["321 331"]]'),
    (13297, '[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]', '[["182 221"], ["182 213;225 234"]]'),
    (13299, '[["yesterday"], ["yesterday"]]', '[["79 88"], ["409 418"]]'),
    (13845, '[["headache global"], ["headache throughout her head"]]', '[["86 94;230 236"], ["86 94;237 256"]]'),
    (14083, '[["headache generalized in her head"]]', '[["56 64;156 179"]]'),
]

for row_label, annotation_literal, location_literal in annotation_fixes:
    train.loc[row_label, 'annotation'] = ast.literal_eval(annotation_literal)
    train.loc[row_label, 'location'] = ast.literal_eval(location_literal)

In [24]:
train["case_num"].value_counts()

5    1800
8    1800
2    1700
9    1700
3    1600
0    1300
1    1300
6    1200
4    1000
7     900
Name: case_num, dtype: int64

In [25]:
train["feature_num"].value_counts()

0      100
611    100
605    100
606    100
607    100
608    100
609    100
610    100
700    100
603    100
701    100
702    100
703    100
704    100
705    100
706    100
604    100
602    100
708    100
511    100
505    100
506    100
507    100
508    100
509    100
510    100
512    100
601    100
513    100
514    100
515    100
516    100
517    100
600    100
707    100
800    100
503    100
908    100
902    100
903    100
904    100
905    100
906    100
907    100
909    100
900    100
910    100
911    100
912    100
913    100
914    100
915    100
901    100
817    100
801    100
808    100
802    100
803    100
804    100
805    100
806    100
807    100
809    100
816    100
810    100
811    100
812    100
813    100
814    100
815    100
504    100
502    100
1      100
200    100
107    100
108    100
109    100
110    100
111    100
112    100
201    100
105    100
202    100
203    100
204    100
205    100
206    100
207    100
106    100
104    100
209    100

In [26]:
train["pn_num"].value_counts()

50072    18
56535    18
56697    18
56841    18
57026    18
         ..
71449     9
71432     9
71344     9
71235     9
74087     9
Name: pn_num, Length: 1000, dtype: int64

In [27]:
train["feature_text"].value_counts()

Female                                                                  700
Male                                                                    300
20-year                                                                 200
Nausea                                                                  200
35-year                                                                 200
17-year                                                                 200
Family-history-of-MI-OR-Family-history-of-myocardial-infarction         100
Worse-with-deep-breath-OR-pleuritic                                     100
Chest-pain                                                              100
Duration-x-1-day                                                        100
No-shortness-of-breath                                                  100
Recent-heavy-lifting-at-work-OR-recent-rock-climbing                    100
No-relief-with-asthma-inhaler                                           100
Sharp-OR-sta

In [28]:
print(train.head())
print(len(train))

          id  case_num  pn_num  feature_num                              annotation          location                                       feature_text                                         pn_history
0  00016_000         0      16            0          [dad with recent heart attcak]         [696 724]  Family-history-of-MI-OR-Family-history-of-myoc...  HPI: 17yo M presents with palpitations. Patien...
1  00016_001         0      16            1             [mom with "thyroid disease]         [668 693]                 Family-history-of-thyroid-disorder  HPI: 17yo M presents with palpitations. Patien...
2  00016_002         0      16            2                        [chest pressure]         [203 217]                                     Chest-pressure  HPI: 17yo M presents with palpitations. Patien...
3  00016_003         0      16            3        [intermittent episodes, episode]  [70 91, 176 183]                              Intermittent-symptoms  HPI: 17yo M presents with palp

In [29]:
# Keep only the training rows for the heart-pounding / heart-racing feature.
is_heart_pounding = train["feature_text"].eq("heart-pounding-OR-heart-racing")
heart_pounding = train[is_heart_pounding]

In [30]:
heart_pounding["annotation"]

9                  [palpitations, heart beating/pounding]
22                                       [HEART POUNDING]
35             [palpitations, palpitations, palpitations]
48         [heart pounding, heart racing, heart pounding]
61                       [pounding heart, pounding heart]
74                                          [palpitaions]
87         [heart pounding, heart pounding, palpitations]
100     [palpitations, Palpitations, palpitations, hea...
113                                      [heart pounding]
126                      [heart pounding, heart is pound]
139                                      [heart pounding]
152       [palpitations, palpitations, heart is pounding]
165                   [pounding heart beat, palpitations]
178                                        [palpitations]
191                                      [heart pounding]
204                                      [HEART POUNDING]
217                                  [heart palpitations]
230           

In [69]:
# Count the heart-pounding rows whose patient note contains "Heart pounding".
# The match is case-sensitive, so lower-case or upper-case spellings of the
# phrase in a note are not counted here.
sum(heart_pounding["pn_history"].str.contains("Heart pounding"))

1

In [68]:
# Count the heart-pounding rows whose annotation spans, joined with single
# spaces, contain the substring "heart pounding". The test is case-sensitive,
# so spans such as "HEART POUNDING" are not counted.
sum(heart_pounding["annotation"].apply(lambda x: "heart pounding" in " ".join(x)))

45

## Extract the uni-grams and bi-grams of feature_text, and apply hard labeling to those with a high annotation rate (sum(annotation) / sum(pn_history))

In [76]:
# Split every feature text into its "-OR-" alternatives, and every alternative
# into its "-"-separated tokens (giving a list of token lists per row).
train["feature_text_list"] = train["feature_text"].apply(
    lambda text: [alternative.split("-") for alternative in text.split("-OR-")]
)

In [100]:
def ngram(feature_text_lists, n: int):
    """Return every run of ``n`` consecutive tokens, joined with a space.

    Parameters
    ----------
    feature_text_lists : list of list of str
        Token lists; n-grams never span two different token lists.
    n : int
        Number of consecutive tokens per n-gram.

    Returns
    -------
    list of str
        The n-grams of the first token list, then the second, and so on, each
        in left-to-right order. A token list shorter than ``n`` contributes
        nothing.
    """
    grams = []
    for tokens in feature_text_lists:
        window_count = len(tokens) - n + 1
        for start in range(window_count):
            grams.append(" ".join(tokens[start:start + n]))
    return grams

In [105]:
# Add one column of unigrams and one column of bigrams per training row,
# both derived from the tokenised feature text.
for column_name, gram_size in (("feature_text_unigram", 1), ("feature_text_bigram", 2)):
    train[column_name] = train["feature_text_list"].apply(ngram, n=gram_size)

In [107]:
train["feature_text_bigram"]

0        [Family history, history of, of MI, Family his...
1        [Family history, history of, of thyroid, thyro...
2                                         [Chest pressure]
3                                  [Intermittent symptoms]
4                                                       []
                               ...                        
14295           [Family history, history of, of migraines]
14296                                                   []
14297                                                   []
14298          [No known, known illness, illness contacts]
14299                                   [Subjective fever]
Name: feature_text_bigram, Length: 14300, dtype: object

In [109]:
heart_pounding["annotation"]

9                  [palpitations, heart beating/pounding]
22                                       [HEART POUNDING]
35             [palpitations, palpitations, palpitations]
48         [heart pounding, heart racing, heart pounding]
61                       [pounding heart, pounding heart]
74                                          [palpitaions]
87         [heart pounding, heart pounding, palpitations]
100     [palpitations, Palpitations, palpitations, hea...
113                                      [heart pounding]
126                      [heart pounding, heart is pound]
139                                      [heart pounding]
152       [palpitations, palpitations, heart is pounding]
165                   [pounding heart beat, palpitations]
178                                        [palpitations]
191                                      [heart pounding]
204                                      [HEART POUNDING]
217                                  [heart palpitations]
230           

In [112]:
feature_texts = train["feature_text"].unique()

In [130]:
# For every distinct feature text, build one list of lower-cased unigrams and
# one list of lower-cased bigrams per "-OR-" alternative.
feature_text_list_unigram = {}
feature_text_list_bigram = {}
for feature_text in feature_texts:
    alternatives = [
        alternative.lower().split("-") for alternative in feature_text.split("-OR-")
    ]
    feature_text_list_unigram[feature_text] = [ngram([tokens], 1) for tokens in alternatives]
    feature_text_list_bigram[feature_text] = [ngram([tokens], 2) for tokens in alternatives]

In [131]:
feature_text_list_unigram

{'Family-history-of-MI-OR-Family-history-of-myocardial-infarction': [['family',
   'history',
   'of',
   'mi'],
  ['family', 'history', 'of', 'myocardial', 'infarction']],
 'Family-history-of-thyroid-disorder': [['family',
   'history',
   'of',
   'thyroid',
   'disorder']],
 'Chest-pressure': [['chest', 'pressure']],
 'Intermittent-symptoms': [['intermittent', 'symptoms']],
 'Lightheaded': [['lightheaded']],
 'No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance': [['no',
   'hair',
   'changes'],
  ['no', 'nail', 'changes'],
  ['no', 'temperature', 'intolerance']],
 'Adderall-use': [['adderall', 'use']],
 'Shortness-of-breath': [['shortness', 'of', 'breath']],
 'Caffeine-use': [['caffeine', 'use']],
 'heart-pounding-OR-heart-racing': [['heart', 'pounding'],
  ['heart', 'racing']],
 'Few-months-duration': [['few', 'months', 'duration']],
 '17-year': [['17', 'year']],
 'Male': [['male']],
 'No-vaginal-discharge': [['no', 'vaginal', 'discharge']],
 'Weight-loss': [['weight

In [150]:
# NOTE(review): this import is not used in this cell, and it rebinds the
# `tqdm` name that the imports cell took from `tqdm.auto`.
from tqdm import tqdm

# For every unigram of every feature, compare the number of rows whose
# annotation mentions the word with the number of rows whose patient note
# mentions it, and keep the words whose ratio exceeds `threshold`.
annotation_rate_dic = {}
threshold = 0.8
for feature_text, word_lists in feature_text_list_unigram.items():
    # The row subset and its lower-cased texts depend only on the feature,
    # so they are computed once per feature rather than once per word.
    feature_rows = train[train["feature_text"] == feature_text]
    annotation_texts = [" ".join(spans).lower() for spans in feature_rows["annotation"]]
    note_texts = [note.lower() for note in feature_rows["pn_history"]]

    annotation_rate_dic[feature_text] = []
    for word_list in word_lists:
        for word in word_list:
            # Plain substring tests: "pain" also matches inside "painful".
            annotation_count = sum(word in text for text in annotation_texts)
            pn_history_count = sum(word in text for text in note_texts)
            if pn_history_count != 0:
                annotation_rate = annotation_count / pn_history_count
                # Use the configured threshold instead of a second literal 0.8.
                if annotation_rate > threshold:
                    annotation_rate_dic[feature_text].append(
                        {
                            "word": word,
                            "annotation_rate": annotation_rate,
                            "pn_history_count": pn_history_count,
                        }
                    )
            else:
                # The word never occurs in any note of this feature.
                print(f"{feature_text}, {word}: pn_history_count=0")

Right-sided-LQ-abdominal-pain-OR-Right-lower-quadrant-abdominal-pain, sided: pn_history_count=0
Diminished-appetite, diminished: pn_history_count=0
Recent-nausea-vomiting-OR-Recent-flulike-symptoms, flulike: pn_history_count=0
Sleep-disturbance-OR-Early-awakenings, awakenings: pn_history_count=0
NSAID-use-OR-Nonsteroidal-anti-inflammatory-drug-use, nonsteroidal: pn_history_count=0
NSAID-use-OR-Nonsteroidal-anti-inflammatory-drug-use, anti: pn_history_count=0
NSAID-use-OR-Nonsteroidal-anti-inflammatory-drug-use, inflammatory: pn_history_count=0
Post-prandial-bloating-OR-fullness-with-meals, prandial: pn_history_count=0
Heavy-caffeine-use, heavy: pn_history_count=0
Duration-x-1-day, duration: pn_history_count=0
Unsuccessful-napping, unsuccessful: pn_history_count=0
Unsuccessful-napping, napping: pn_history_count=0
Sleeping-medication-ineffective, ineffective: pn_history_count=0
viral-symptoms-OR-rhinorrhea-OR-scratchy-throat, viral: pn_history_count=0
Shares-an-apartment, shares: pn_hist

In [158]:
def match_annotation_rate(feature_text_list, threshold=0.8):
    annotation_rate_dic = {}
    for feature_text, word_lists in feature_text_list.items():
        annotation_rate_dic[feature_text] = []
        for word_list in word_lists:
            for word in word_list:
                annotation_count = sum(
                    train[train["feature_text"] == feature_text]["annotation"].apply(
                        lambda x: word in " ".join(x).lower()
                    )
                )
                pn_history_count = sum(
                    train[train["feature_text"] == feature_text]["pn_history"].apply(
                        lambda x: word in x.lower()
                    )
                )
                if pn_history_count != 0:
                    annotation_rate = annotation_count / pn_history_count
                    if annotation_rate > threshold:
                        annotation_rate_dic[feature_text].append(
                            {
                                "word": word,
                                "annotation_rate": annotation_rate,
                                "pn_history_count": pn_history_count,
                            }
                        )
#                 else:
#                     print(f"{feature_text}, {word}: pn_history_count=0")
    return annotation_rate_dic

In [162]:
unigram_annotation_rate_word_list = match_annotation_rate(feature_text_list_unigram, threshold=0.8)
bigram_annotation_rate_word_list = match_annotation_rate(feature_text_list_bigram, threshold=0.8)


In [163]:
bigram_annotation_rate_word_list

{'Family-history-of-MI-OR-Family-history-of-myocardial-infarction': [{'word': 'of mi',
   'annotation_rate': 1.0,
   'pn_history_count': 1},
  {'word': 'myocardial infarction',
   'annotation_rate': 1.0,
   'pn_history_count': 1}],
 'Family-history-of-thyroid-disorder': [{'word': 'thyroid disorder',
   'annotation_rate': 1.0,
   'pn_history_count': 3}],
 'Chest-pressure': [{'word': 'chest pressure',
   'annotation_rate': 1.0,
   'pn_history_count': 43}],
 'Intermittent-symptoms': [],
 'Lightheaded': [],
 'No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance': [{'word': 'nail changes',
   'annotation_rate': 1.0,
   'pn_history_count': 1},
  {'word': 'no temperature', 'annotation_rate': 1.5, 'pn_history_count': 2},
  {'word': 'temperature intolerance',
   'annotation_rate': 1.0,
   'pn_history_count': 4}],
 'Adderall-use': [],
 'Shortness-of-breath': [{'word': 'shortness of',
   'annotation_rate': 0.8260869565217391,
   'pn_history_count': 23},
  {'word': 'of breath',
   'ann

In [164]:
unigram_annotation_rate_word_list

{'Family-history-of-MI-OR-Family-history-of-myocardial-infarction': [{'word': 'myocardial',
   'annotation_rate': 1.0,
   'pn_history_count': 1},
  {'word': 'infarction', 'annotation_rate': 1.0, 'pn_history_count': 1}],
 'Family-history-of-thyroid-disorder': [{'word': 'thyroid',
   'annotation_rate': 1.0,
   'pn_history_count': 87}],
 'Chest-pressure': [{'word': 'pressure',
   'annotation_rate': 0.9838709677419355,
   'pn_history_count': 62}],
 'Intermittent-symptoms': [{'word': 'intermittent',
   'annotation_rate': 0.9444444444444444,
   'pn_history_count': 18}],
 'Lightheaded': [{'word': 'lightheaded',
   'annotation_rate': 0.96,
   'pn_history_count': 25}],
 'No-hair-changes-OR-no-nail-changes-OR-no-temperature-intolerance': [{'word': 'nail',
   'annotation_rate': 1.0,
   'pn_history_count': 1},
  {'word': 'temperature', 'annotation_rate': 1.0, 'pn_history_count': 6},
  {'word': 'intolerance', 'annotation_rate': 0.96, 'pn_history_count': 25}],
 'Adderall-use': [{'word': 'adderall',


In [166]:
train[train["feature_text"]=="8-to-10-hours-of-acute-pain"]["annotation"]

1311                                 [last 10 hours]
1324                              [onset 8-10 hours]
1337                    [pain for the past 10 hours]
1350                       [began about 8 hours ago]
1363                        [started 8-10 hours ago]
1376                    [8-10 HOURS SUDDEN IN ONSET]
1389                           [pain for 8-10 hours]
1402                                    [for 10 ago]
1415                                [8-10 hours ago]
1428                                              []
1441                          [started 8-10 hr back]
1454                                              []
1467                       [For the last 8-10 hours]
1480                                   [pain 10 hrs]
1493                             [8-10 hour history]
1506                        [started 8-10 hours ago]
1519                      [Pain began ~8-10 hrs ago]
1532                             [pain for 8-10 hrs]
1545                           [started 8-10hr

In [169]:
# Number of distinct feature texts in the test set. The previous call,
# `.uniqueque()`, is not a pandas method and raises AttributeError.
test["feature_text"].nunique()

5