In [1]:
import pandas as pd

data_path = 'C:/Users/pietr/Documents/GitHub/Thesis-CaianiRenContardi/df_model.csv'

df = pd.read_csv(data_path)

In [2]:
df.head()

Unnamed: 0,FOI_TEXT,PROBLEM_CODE,IMDRF
0,IT WAS REPORTED THAT THE LEAD WAS IMPLANTED IN...,2923.0,A051201
1,"IT WAS REPORTED THAT DURING FOLLOW-UP VISIT, D...",1440.0,A071205
2,IT WAS REPORTED THAT THE LEAD CAPTURE THRESHOL...,1559.0,A070908
3,IT WAS REPORTED THAT THE LEAD CAPTURE THRESHOL...,3266.0,A070102
4,NEW INFORMATION RECEIVED NOTES THAT LOSS OF SE...,1559.0,A070908


In [3]:
# count the number of text above 512 tokens
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

def count_tokens(text):
    output = tokenizer.tokenize(text)
    return len(output)

num_tokens = df['FOI_TEXT'].apply(count_tokens)
num_tokens[num_tokens > 512].count()

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [3]:
len(df)

584279

In [4]:
unlabelled = df[(df['PROBLEM_CODE'].isna()) | (df['IMDRF'].isna())]
df.drop(unlabelled.index, inplace=True)
df.drop_duplicates(inplace=True)
df.drop(columns=['PROBLEM_CODE'], inplace=True)

In [5]:
from util.preprocessor import TextPreprocessor

preprocessor = TextPreprocessor()

In [7]:
df['FOI_TEXT'] = df['FOI_TEXT'].str.lower()
df['Cleaned'] = df['FOI_TEXT'].apply(preprocessor.clean_text)
df['NoPunctuation'] = df['Cleaned'].apply(preprocessor.remove_punct)

In [8]:
uni10 = pd.read_csv('ngrams/uni10.csv')
uni20 = pd.read_csv('ngrams/uni20.csv')
uni30 = pd.read_csv('ngrams/uni30.csv')

list10 = uni10['word'].tolist()
list20 = uni20['word'].tolist()
list30 = uni30['word'].tolist()

In [9]:
df['NoStop10'] = df['Cleaned'].apply(lambda text: preprocessor.remove_stopwords(text=text, stoplist=list10))
df['NoStop20'] = df['Cleaned'].apply(lambda text: preprocessor.remove_stopwords(text=text, stoplist=list20))
df['NoStop30'] = df['Cleaned'].apply(lambda text: preprocessor.remove_stopwords(text=text, stoplist=list30))

In [10]:
df['IMDRF'] = df['IMDRF'].apply(preprocessor.remredclass)
df['CatOneIMDRF'], df['CatTwoIMDRF'] = zip(*df['IMDRF'].apply(preprocessor.extract_category))

In [11]:
sub_df = df.sample(n=100, random_state=42)
sub_df

Unnamed: 0,FOI_TEXT,IMDRF,Cleaned,NoPunctuation,NoStop10,NoStop20,NoStop30,CatOneIMDRF,CatTwoIMDRF
432108,it was reported that during implant the�right ...,A0701,it was reported that during implant the right ...,it was reported that during implant the right ...,it was that during implant the exhibited poor ...,it was that during implant the poor thresholds...,it was that during implant the poor thresholds...,A07,A0701
145991,it was reported that the patient presented to ...,A090801,it was reported that the patient presented to ...,it was reported that the patient presented to ...,it was that the presented to the hospital for ...,it was that the presented to the hospital for ...,it was that the presented to the hospital for ...,A09,A0908
259118,it was reported that this pacemaker system tri...,A072201,it was reported that this pacemaker system tri...,it was reported that this pacemaker system tri...,it was that this pacemaker system triggered a ...,it was that this pacemaker system triggered a ...,it was that this system triggered a safety swi...,A07,A0722
502603,related manufacturer report number: 2017865-20...,A070909,related manufacturer report number: --. it was...,related manufacturer report number it was rep...,related manufacturer report number: --. it was...,related manufacturer report number: --. it was...,related manufacturer report number: --. it was...,A07,A0709
173353,it was reported that a patient presented in a ...,A070101,it was reported that a patient presented in a ...,it was reported that a patient presented in a ...,it was that a presented in a post operation ch...,it was that a presented in a post operation ch...,it was that a presented in a post operation ch...,A07,A0701
...,...,...,...,...,...,...,...,...,...
83969,related manufacturer reference number: 2938836...,A070909,related manufacturer reference number: --. it ...,related manufacturer reference number it was ...,related manufacturer reference number: --. it ...,related manufacturer reference number: --. it ...,related manufacturer reference number: --. it ...,A07,A0709
323755,it was further reported that the wound culture...,A24,it was further reported that the wound culture...,it was further reported that the wound culture...,it was further that the wound culture from out...,it was further that the wound culture from out...,it was further that the wound culture from out...,A24,
47378,it was reported that this implantable cardiove...,A090201,it was reported that this implantable cardiove...,it was reported that this implantable cardiove...,it was that this implantable cardioverter defi...,it was that this implantable cardioverter defi...,it was that this implantable cardioverter defi...,A09,A0902
483388,related mfr report number: 2017865-2023-21184....,A24,related mfr report number: --. related mfr rep...,related mfr report number related mfr report ...,related mfr report number: --. related mfr rep...,related mfr report number: --. related mfr rep...,related mfr report number: --. related mfr rep...,A24,


In [12]:
sub_df['f2sent'] = sub_df['FOI_TEXT'].apply(lambda x: preprocessor.first_two_sentences(x))
sub_df['trimsent'] = sub_df['FOI_TEXT'].apply(lambda x: preprocessor.trimlastcompBERT(x))

In [13]:
sub_df['cleanf2'] = sub_df['f2sent'].apply(preprocessor.clean_text)
sub_df['cleantrim'] = sub_df['trimsent'].apply(preprocessor.clean_text)
sub_df['nopunf2'] = sub_df['cleanf2'].apply(preprocessor.remove_punct)
sub_df['nopuntrim'] = sub_df['cleantrim'].apply(preprocessor.remove_punct)

In [None]:
sub_df

Unnamed: 0,FOI_TEXT,IMDRF,Cleaned,NoPunctuation,NoStop10,NoStop20,NoStop30,CatOneIMDRF,CatTwoIMDRF,f2sent,trimsent,cleanf2,cleantrim,nopunf2,nopuntrim
432108,it was reported that during implant the�right ...,A0701,it was reported that during implant the right ...,it was reported that during implant the right ...,it was that during implant the exhibited poor ...,it was that during implant the poor thresholds...,it was that during implant the poor thresholds...,A07,A0701,it was reported that during implant the�right ...,it was reported that during implant the�right ...,it was reported that during implant the right ...,it was reported that during implant the right ...,it was reported that during implant the right ...,it was reported that during implant the right ...
145991,it was reported that the patient presented to ...,A090801,it was reported that the patient presented to ...,it was reported that the patient presented to ...,it was that the presented to the hospital for ...,it was that the presented to the hospital for ...,it was that the presented to the hospital for ...,A09,A0908,it was reported that the patient presented to ...,it was reported that the patient presented to ...,it was reported that the patient presented to ...,it was reported that the patient presented to ...,it was reported that the patient presented to ...,it was reported that the patient presented to ...
259118,it was reported that this pacemaker system tri...,A072201,it was reported that this pacemaker system tri...,it was reported that this pacemaker system tri...,it was that this pacemaker system triggered a ...,it was that this pacemaker system triggered a ...,it was that this system triggered a safety swi...,A07,A0722,it was reported that this pacemaker system tri...,it was reported that this pacemaker system tri...,it was reported that this pacemaker system tri...,it was reported that this pacemaker system tri...,it was reported that this pacemaker system tri...,it was reported that this pacemaker system tri...
502603,related manufacturer report number: 2017865-20...,A070909,related manufacturer report number: --. it was...,related manufacturer report number it was rep...,related manufacturer report number: --. it was...,related manufacturer report number: --. it was...,related manufacturer report number: --. it was...,A07,A0709,related manufacturer report number: 2017865-20...,related manufacturer report number: 2017865-20...,related manufacturer report number: --. it was...,related manufacturer report number: --. it was...,related manufacturer report number it was rep...,related manufacturer report number it was rep...
173353,it was reported that a patient presented in a ...,A070101,it was reported that a patient presented in a ...,it was reported that a patient presented in a ...,it was that a presented in a post operation ch...,it was that a presented in a post operation ch...,it was that a presented in a post operation ch...,A07,A0701,it was reported that a patient presented in a ...,it was reported that a patient presented in a ...,it was reported that a patient presented in a ...,it was reported that a patient presented in a ...,it was reported that a patient presented in a ...,it was reported that a patient presented in a ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83969,related manufacturer reference number: 2938836...,A070909,related manufacturer reference number: --. it ...,related manufacturer reference number it was ...,related manufacturer reference number: --. it ...,related manufacturer reference number: --. it ...,related manufacturer reference number: --. it ...,A07,A0709,related manufacturer reference number: 2938836...,related manufacturer reference number: 2938836...,related manufacturer reference number: --. it ...,related manufacturer reference number: --. it ...,related manufacturer reference number it was ...,related manufacturer reference number it was ...
323755,it was further reported that the wound culture...,A24,it was further reported that the wound culture...,it was further reported that the wound culture...,it was further that the wound culture from out...,it was further that the wound culture from out...,it was further that the wound culture from out...,A24,,it was further reported that the wound culture...,it was further reported that the wound culture...,it was further reported that the wound culture...,it was further reported that the wound culture...,it was further reported that the wound culture...,it was further reported that the wound culture...
47378,it was reported that this implantable cardiove...,A090201,it was reported that this implantable cardiove...,it was reported that this implantable cardiove...,it was that this implantable cardioverter defi...,it was that this implantable cardioverter defi...,it was that this implantable cardioverter defi...,A09,A0902,it was reported that this implantable cardiove...,it was reported that this implantable cardiove...,it was reported that this implantable cardiove...,it was reported that this implantable cardiove...,it was reported that this implantable cardiove...,it was reported that this implantable cardiove...
483388,related mfr report number: 2017865-2023-21184....,A24,related mfr report number: --. related mfr rep...,related mfr report number related mfr report ...,related mfr report number: --. related mfr rep...,related mfr report number: --. related mfr rep...,related mfr report number: --. related mfr rep...,A24,,related mfr report number: 2017865-2023-21184....,related mfr report number: 2017865-2023-21184....,related mfr report number: --. related mfr rep...,related mfr report number: --. related mfr rep...,related mfr report number related mfr report ...,related mfr report number related mfr report ...


In [14]:
df_imdrf = pd.read_csv('df_imdrf_refined.csv')

In [15]:
df_imdrf.head()

Unnamed: 0,Level 1 Term,Level 2 Term,Level 3 Term,FDA Code,NCIt Code,IMDRF Code,Definition,Non-IMDRF Code/Term,Status,Status Description,CodeHierarchy
0,Patient Device Interaction Problem,,,4001,C133496,A01,Problem related to the interaction between the...,,,,A01
1,,Patient-Device Incompatibility,,2682,C62919,A0101,Problem associated with the interaction betwee...,,,,A01|A0101
2,,,Biocompatibility,2886,C63294,A010101,Problem associated with undesirable local or s...,,,,A01|A0101|A010101
3,,,Device Appears to Trigger Rejection,1524,C62853,A010102,The device appears to elicit undesired respons...,,,,A01|A0101|A010102
4,,,Inadequacy of Device Shape and/or Size,1583,C62947,A010103,The physical size and/or shape of the device w...,,,,A01|A0101|A010103


In [16]:
# creating the single term and multiple terms
def assignterm(df):
    if len(df['IMDRF Code']) == 3: 
        df['single_term'] = df['Level 1 Term']
        df['multiple_terms'] = df['Level 1 Term']
    elif len(df['IMDRF Code']) == 5:
        df['single_term'] = df['Level 2 Term']
        df['multiple_terms'] = df['Level 1 Term'] + '; ' + df['Level 2 Term']
    else:
        df['single_term'] = df['Level 3 Term']
        df['multiple_terms'] = df['Level 1 Term'] + '; ' + df['Level 2 Term'] + '; ' + df['Level 3 Term']

    df['single_term'] = df['single_term'].lower()
    df['multiple_terms'] = df['multiple_terms'].lower()
    return df

import numpy as np

df_imdrf['Level 1 Term'].fillna(method='ffill', inplace=True)
df_imdrf['Level 2 Term'].fillna(method='ffill', inplace=True)
df_imdrf['Level 3 Term'].fillna(method='ffill', inplace=True)

df_imdrf.loc[df_imdrf['IMDRF Code'].str.len() == 3, ['Level 2 Term', 'Level 3 Term']] = np.nan
df_imdrf.loc[df_imdrf['IMDRF Code'].str.len() == 5, 'Level 3 Term'] = np.nan

df_imdrf = df_imdrf.apply(assignterm, axis=1)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imdrf['Level 1 Term'].fillna(method='ffill', inplace=True)
  df_imdrf['Level 1 Term'].fillna(method='ffill', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_imdrf['Level 2 Term'].fillna(method='ffill', inplace=True)
  df_imdrf['Level 2 Term'].fillna(method='ff

In [17]:
df_imdrf1st = df_imdrf[df_imdrf['IMDRF Code'].str.len() == 3]

## Standard approach

### Sentence Transformers

#### SBERT

In [None]:
from sentence_transformers import SentenceTransformer, SimilarityFunction

model = SentenceTransformer("all-MiniLM-L6-v2", similarity_fn_name=SimilarityFunction.COSINE)
# model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')
# model = SentenceTransformer("neuml/pubmedbert-base-embeddings")
# model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD')
#model = SentenceTransformer('pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [None]:
from util.prepemb import SbertComparer
comparer = SbertComparer(model=model, df=sub_df, df_imdrf=df_imdrf1st)

In [None]:
normaltext = model.encode(sub_df['FOI_TEXT'].to_list())
cleanpun = model.encode(sub_df['Cleaned'].to_list())
nopun = model.encode(sub_df['NoPunctuation'].to_list())

In [None]:
f2sent = model.encode(sub_df['f2sent'].to_list())
trimsent = model.encode(sub_df['trimsent'].to_list())
cleanf2 = model.encode(sub_df['cleanf2'].to_list())
cleantrim = model.encode(sub_df['cleantrim'].to_list())
nopunf2 = model.encode(sub_df['nopunf2'].to_list())
nopuntrim = model.encode(sub_df['nopuntrim'].to_list())

In [None]:
singleterm = model.encode(df_imdrf1st['single_term'].to_list())
multiterm = model.encode(df_imdrf1st['multiple_terms'].to_list())
defimdrf = model.encode(df_imdrf1st['Definition'].to_list())

In [66]:
nopunsingle = comparer.extractbestmatch(nopun, singleterm, 'CatOneIMDRF')
punsingle = comparer.extractbestmatch(cleanpun, singleterm, 'CatOneIMDRF')
normalsingle = comparer.extractbestmatch(normaltext, singleterm, 'CatOneIMDRF')

Correct Prediction
1    57
0    43
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64
Correct Prediction
1    60
0    40
Name: count, dtype: int64


In [None]:
nopunmulti = comparer.extractbestmatch(nopun, multiterm, 'CatOneIMDRF')
punmulti = comparer.extractbestmatch(cleanpun, multiterm, 'CatOneIMDRF')
normalmulti = comparer.extractbestmatch(normaltext, multiterm, 'CatOneIMDRF')

Correct Prediction
0    59
1    41
Name: count, dtype: int64
Correct Prediction
0    57
1    43
Name: count, dtype: int64
Correct Prediction
0    55
1    45
Name: count, dtype: int64


In [68]:
nopundef = comparer.extractbestmatch(nopun, defimdrf, 'CatOneIMDRF')
punundef = comparer.extractbestmatch(cleanpun, defimdrf, 'CatOneIMDRF')
normaldef = comparer.extractbestmatch(normaltext, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    59
0    41
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64


In [69]:
singlef2 = comparer.extractbestmatch(f2sent, singleterm, 'CatOneIMDRF')
singletrim = comparer.extractbestmatch(trimsent, singleterm, 'CatOneIMDRF')

Correct Prediction
1    57
0    43
Name: count, dtype: int64
Correct Prediction
1    60
0    40
Name: count, dtype: int64


In [70]:
multif2 = comparer.extractbestmatch(f2sent, multiterm, 'CatOneIMDRF')
multitrim = comparer.extractbestmatch(trimsent, multiterm, 'CatOneIMDRF')

Correct Prediction
0    52
1    48
Name: count, dtype: int64
Correct Prediction
0    55
1    45
Name: count, dtype: int64


In [71]:
deff2 = comparer.extractbestmatch(f2sent, defimdrf, 'CatOneIMDRF')
deftrim = comparer.extractbestmatch(trimsent, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    59
0    41
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64


In [72]:
f2punsingle = comparer.extractbestmatch(cleanf2, singleterm, 'CatOneIMDRF')
f2punmulti = comparer.extractbestmatch(cleanf2, multiterm, 'CatOneIMDRF')
f2pundef = comparer.extractbestmatch(cleanf2, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    56
0    44
Name: count, dtype: int64
Correct Prediction
0    55
1    45
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64


In [73]:
trimpunsingle = comparer.extractbestmatch(cleantrim, singleterm, 'CatOneIMDRF')
trimpunmulti = comparer.extractbestmatch(cleantrim, multiterm, 'CatOneIMDRF')
trimpundef = comparer.extractbestmatch(cleantrim, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    59
0    41
Name: count, dtype: int64
Correct Prediction
0    57
1    43
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64


In [74]:
f2nopunsingle = comparer.extractbestmatch(nopunf2, singleterm, 'CatOneIMDRF')
f2nopunmulti = comparer.extractbestmatch(nopunf2, multiterm, 'CatOneIMDRF')
f2nopundef = comparer.extractbestmatch(nopunf2, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    55
0    45
Name: count, dtype: int64
Correct Prediction
0    57
1    43
Name: count, dtype: int64
Correct Prediction
1    60
0    40
Name: count, dtype: int64


In [75]:
trimnopunsingle = comparer.extractbestmatch(nopuntrim, singleterm, 'CatOneIMDRF')
trimnopunmulti = comparer.extractbestmatch(nopuntrim, multiterm, 'CatOneIMDRF')
trimnopundef = comparer.extractbestmatch(nopuntrim, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    57
0    43
Name: count, dtype: int64
Correct Prediction
0    59
1    41
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64


### Transformers Models

#### Choosing the model

In [96]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [113]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [144]:
model = AutoModel.from_pretrained("ncbi/MedCPT-Query-Encoder")
tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Query-Encoder")

In [176]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


KeyboardInterrupt: 

In [145]:
from util.prepemb import BCBComparer
bcbcomparer = BCBComparer(df=sub_df, df_imdrf=df_imdrf, tokenizer=tokenizer, model=model)

#### Choosing the token

comparing the differences among the different leves of cleaning of text

In [161]:
#token = 'cls'
token = 'mean'

In [162]:
normalbcb = bcbcomparer.compute_bioclinical_embeddings(sub_df['FOI_TEXT'], token)
cleanbcb = bcbcomparer.compute_bioclinical_embeddings(sub_df['Cleaned'], token)
nopunbcb = bcbcomparer.compute_bioclinical_embeddings(sub_df['NoPunctuation'], token)

In [163]:
f2sent = bcbcomparer.compute_bioclinical_embeddings(sub_df['f2sent'], token)
trimsent = bcbcomparer.compute_bioclinical_embeddings(sub_df['trimsent'], token)
cleanf2 = bcbcomparer.compute_bioclinical_embeddings(sub_df['cleanf2'], token)
cleantrim = bcbcomparer.compute_bioclinical_embeddings(sub_df['cleantrim'], token)
nopunf2 = bcbcomparer.compute_bioclinical_embeddings(sub_df['nopunf2'], token)
nopuntrim = bcbcomparer.compute_bioclinical_embeddings(sub_df['nopuntrim'], token)

In [164]:
singleterm = bcbcomparer.compute_bioclinical_embeddings(df_imdrf['single_term'], token)
multiterm = bcbcomparer.compute_bioclinical_embeddings(df_imdrf['multiple_terms'], token)
defimdrf = bcbcomparer.compute_bioclinical_embeddings(df_imdrf['Definition'], token)

In [165]:
bcbsinglenormal = bcbcomparer.extractbestfit(normalbcb, singleterm, 'CatOneIMDRF')
bcbsingleclean = bcbcomparer.extractbestfit(cleanbcb, singleterm, 'CatOneIMDRF')
bcbsinglenopun = bcbcomparer.extractbestfit(nopunbcb, singleterm, 'CatOneIMDRF')

Correct Prediction
0    81
1    19
Name: count, dtype: int64
Correct Prediction
0    83
1    17
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64


In [166]:
bcbmultinormal = bcbcomparer.extractbestfit(normalbcb, multiterm, 'CatOneIMDRF')
bcbmulticlean = bcbcomparer.extractbestfit(cleanbcb, multiterm, 'CatOneIMDRF')
bcbmultinopun = bcbcomparer.extractbestfit(nopunbcb, multiterm, 'CatOneIMDRF')

Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64


In [167]:
bcbdefnormal = bcbcomparer.extractbestfit(normalbcb, defimdrf, 'CatOneIMDRF')
bcbdefclean = bcbcomparer.extractbestfit(cleanbcb, defimdrf, 'CatOneIMDRF')
bcbdefnopun = bcbcomparer.extractbestfit(nopunbcb, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    54
1    46
Name: count, dtype: int64
Correct Prediction
0    52
1    48
Name: count, dtype: int64
Correct Prediction
0    65
1    35
Name: count, dtype: int64


In [168]:
bcbf2single = bcbcomparer.extractbestfit(f2sent, singleterm, 'CatOneIMDRF')
bcbf2multi = bcbcomparer.extractbestfit(f2sent, multiterm, 'CatOneIMDRF')
bcbf2def = bcbcomparer.extractbestfit(f2sent, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    63
1    37
Name: count, dtype: int64
Correct Prediction
0    72
1    28
Name: count, dtype: int64
Correct Prediction
0    51
1    49
Name: count, dtype: int64


In [169]:
bcbtrimsingle = bcbcomparer.extractbestfit(trimsent, singleterm, 'CatOneIMDRF')
bcbtrimmulti = bcbcomparer.extractbestfit(trimsent, multiterm, 'CatOneIMDRF')
bcbtrimdef = bcbcomparer.extractbestfit(trimsent, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    81
1    19
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    54
1    46
Name: count, dtype: int64


In [170]:
bcbsinglef2clean = bcbcomparer.extractbestfit(cleanf2, singleterm, 'CatOneIMDRF')
bcbsingletrimclean = bcbcomparer.extractbestfit(cleantrim, singleterm, 'CatOneIMDRF')

Correct Prediction
0    60
1    40
Name: count, dtype: int64
Correct Prediction
0    83
1    17
Name: count, dtype: int64


In [171]:
bcbsinglef2nopun = bcbcomparer.extractbestfit(nopunf2, singleterm, 'CatOneIMDRF')
bcbsingletrimnopun = bcbcomparer.extractbestfit(nopuntrim, singleterm, 'CatOneIMDRF')

Correct Prediction
0    67
1    33
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64


In [172]:
bcbmultif2clean = bcbcomparer.extractbestfit(cleanf2, multiterm, 'CatOneIMDRF')
bcbmultitrimclean = bcbcomparer.extractbestfit(cleantrim, multiterm, 'CatOneIMDRF')

Correct Prediction
0    69
1    31
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64


In [173]:
bcbmultif2nopun = bcbcomparer.extractbestfit(nopunf2, multiterm, 'CatOneIMDRF')
bcbmultitrimnopun = bcbcomparer.extractbestfit(nopuntrim, multiterm, 'CatOneIMDRF')

Correct Prediction
0    78
1    22
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64


In [174]:
bcbdeff2clean = bcbcomparer.extractbestfit(cleanf2, defimdrf, 'CatOneIMDRF')
bcbdeftrimclean = bcbcomparer.extractbestfit(cleantrim, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    54
0    46
Name: count, dtype: int64
Correct Prediction
0    52
1    48
Name: count, dtype: int64


In [175]:
bcbdeff2nopun = bcbcomparer.extractbestfit(nopunf2, defimdrf, 'CatOneIMDRF')
bcbdeftrimnopun = bcbcomparer.extractbestfit(nopuntrim, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    54
1    46
Name: count, dtype: int64
Correct Prediction
0    65
1    35
Name: count, dtype: int64


## EmbedRank approach

Which is the best number of keywords to predict? Let's take a comparison further on

In [15]:
from sentence_transformers import SentenceTransformer
from util.embedrank_sbert import EmbedRank
from util.prepemb import SbertComparer

embed_rank = EmbedRank()

In [19]:
# test for the changes
model = SentenceTransformer("all-MiniLM-L6-v2")
comparer = SbertComparer(model=model, df=sub_df, df_imdrf=df_imdrf)

weight3 = sub_df['FOI_TEXT'].apply(lambda x: embed_rank.getKWembedding(document=x, model=model, weight_flag=True))
weight3 = weight3.tolist()

In [17]:
for i in range(1, 10):
    sub_df[f'ListKW{i}'] = sub_df['Cleaned'].apply(lambda x: embed_rank._mmr(x, N=i)[0])
    sub_df[f'simKW{i}'] = sub_df['Cleaned'].apply(lambda x: embed_rank._mmr(x, N=i)[1])

In [21]:
# Compute weighted average of the embeddings using the similarity score as weights
import numpy as np

def weighted_avg(keylist, simlist):
    emb = []
    for key in keylist:
        emb.append(model.encode(key))

    emb = np.array(emb)
    simlist = np.array(simlist)  # Ensure simlist is a numpy array
    # computing the weighted average
    avg = np.average(emb, axis=0, weights=simlist)
    return avg

def normal_avg(keylist):
    emb = []
    for key in keylist:
        emb.append(model.encode(key))

    emb = np.array(emb)
    avg = np.mean(emb, axis=0)
    return avg

In [22]:
for i in range(1, 10):
    sub_df[f'EmbKW{i}'] = sub_df.apply(lambda x: weighted_avg(x[f'ListKW{i}'], x[f'simKW{i}']), axis=1)

In [29]:
singleterm = model.encode(df_imdrf['single_term'].to_list())
multiterm = model.encode(df_imdrf['multiple_terms'].to_list())
defimdrf = model.encode(df_imdrf['Definition'].to_list())

In [30]:
test0 = pd.DataFrame()

for i in range(1, 10):
    print(i)
    test0 = pd.concat([test0, comparer.extractbestmatch(sub_df[f'EmbKW{i}'], singleterm, 'CatOneIMDRF').add_prefix(f'KW{i}single_')], axis=1)
    test0 = pd.concat([test0, comparer.extractbestmatch(sub_df[f'EmbKW{i}'], multiterm, 'CatOneIMDRF').add_prefix(f'KW{i}multi_')], axis=1)
    test0 = pd.concat([test0, comparer.extractbestmatch(sub_df[f'EmbKW{i}'], defimdrf, 'CatOneIMDRF').add_prefix(f'KW{i}def_')], axis=1)
    print("\n")

1
Correct Prediction
0    52
1    48
Name: count, dtype: int64
Correct Prediction
0    62
1    38
Name: count, dtype: int64
Correct Prediction
0    58
1    42
Name: count, dtype: int64


2
Correct Prediction
0    55
1    45
Name: count, dtype: int64
Correct Prediction
0    60
1    40
Name: count, dtype: int64
Correct Prediction
1    52
0    48
Name: count, dtype: int64


3
Correct Prediction
0    53
1    47
Name: count, dtype: int64
Correct Prediction
0    63
1    37
Name: count, dtype: int64
Correct Prediction
0    55
1    45
Name: count, dtype: int64


4
Correct Prediction
0    56
1    44
Name: count, dtype: int64
Correct Prediction
0    65
1    35
Name: count, dtype: int64
Correct Prediction
0    58
1    42
Name: count, dtype: int64


5
Correct Prediction
0    59
1    41
Name: count, dtype: int64
Correct Prediction
0    65
1    35
Name: count, dtype: int64
Correct Prediction
0    54
1    46
Name: count, dtype: int64


6
Correct Prediction
0    60
1    40
Name: count, dtype: int64
Co

In [31]:
for i in range(1, 10):
    sub_df[f'EmbKWavg{i}'] = sub_df[f'ListKW{i}'].apply(normal_avg)

In [32]:
for i in range(1,10):
    print(i)
    test0 = pd.concat([test0, comparer.extractbestmatch(sub_df[f'EmbKWavg{i}'], singleterm, 'CatOneIMDRF').add_prefix(f'KWavg{i}single_')], axis=1)
    test0 = pd.concat([test0, comparer.extractbestmatch(sub_df[f'EmbKWavg{i}'], multiterm, 'CatOneIMDRF').add_prefix(f'KWavg{i}multi_')], axis=1)
    test0 = pd.concat([test0, comparer.extractbestmatch(sub_df[f'EmbKWavg{i}'], defimdrf, 'CatOneIMDRF').add_prefix(f'KWavg{i}def_')], axis=1)

1
Correct Prediction
0    52
1    48
Name: count, dtype: int64
Correct Prediction
0    62
1    38
Name: count, dtype: int64
Correct Prediction
0    58
1    42
Name: count, dtype: int64
2
Correct Prediction
0    60
1    40
Name: count, dtype: int64
Correct Prediction
0    68
1    32
Name: count, dtype: int64
Correct Prediction
0    60
1    40
Name: count, dtype: int64
3
Correct Prediction
0    57
1    43
Name: count, dtype: int64
Correct Prediction
0    66
1    34
Name: count, dtype: int64
Correct Prediction
0    62
1    38
Name: count, dtype: int64
4
Correct Prediction
0    66
1    34
Name: count, dtype: int64
Correct Prediction
0    76
1    24
Name: count, dtype: int64
Correct Prediction
0    65
1    35
Name: count, dtype: int64
5
Correct Prediction
0    69
1    31
Name: count, dtype: int64
Correct Prediction
0    82
1    18
Name: count, dtype: int64
Correct Prediction
0    69
1    31
Name: count, dtype: int64
6
Correct Prediction
0    74
1    26
Name: count, dtype: int64
Correct Pred

### Sentence Transformers

In [14]:
from sentence_transformers import SentenceTransformer

# model = SentenceTransformer("all-MiniLM-L6-v2", similarity_fn_name=SimilarityFunction.COSINE)
model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')
# model = SentenceTransformer("neuml/pubmedbert-base-embeddings")
# model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD')

In [15]:
from util.embedrank_sbert import SBERT_EmbedRank

embed_rank = SBERT_EmbedRank(model=model)

In [18]:
from util.prepemb import SbertComparer
comparer = SbertComparer(model=model, df=sub_df, df_imdrf=df_imdrf)

2 keywords seems to be the best option 

In [19]:
sub_df[f'ListKW'] = sub_df['Cleaned'].apply(lambda x: embed_rank._mmr(x, N=2)[0])
sub_df[f'simKW'] = sub_df['Cleaned'].apply(lambda x: embed_rank._mmr(x, N=2)[1])

sub_df['EmbKW'] = sub_df.apply(lambda x: weighted_avg(x['ListKW'], x['simKW']), axis=1)
sub_df['EmbKWavg'] = sub_df['ListKW'].apply(normal_avg)

In [20]:
df_imdrf['DefKW'] = df_imdrf['Definition'].apply(lambda x: embed_rank._mmr(x, N=2)[0])
df_imdrf['simDefKW'] = df_imdrf['Definition'].apply(lambda x: embed_rank._mmr(x, N=2)[1])

In [21]:
defkw = df_imdrf.apply(lambda x: weighted_avg(x['DefKW'], x['simDefKW']), axis=1)
defkwavg = df_imdrf['DefKW'].apply(normal_avg)

In [22]:
# compute embeddings
keytext = sub_df['EmbKW'].to_list()
keytextavg = sub_df['EmbKWavg'].to_list()

In [24]:
singleterm = model.encode(df_imdrf['single_term'].to_list())
multiterm = model.encode(df_imdrf['multiple_terms'].to_list())
defimdrf = model.encode(df_imdrf['Definition'].to_list())

In [25]:
sbert_single_kw = comparer.extractbestmatch(keytext, singleterm, 'CatOneIMDRF')
sbert_single_kwavg = comparer.extractbestmatch(keytextavg, singleterm, 'CatOneIMDRF')

Correct Prediction
0    57
1    43
Name: count, dtype: int64
Correct Prediction
0    58
1    42
Name: count, dtype: int64


In [26]:
sbert_multi_ftext_first = comparer.extractbestmatch(keytext, multiterm, 'CatOneIMDRF')
sbert_multi_ftext_avg = comparer.extractbestmatch(keytextavg, multiterm, 'CatOneIMDRF')

Correct Prediction
0    66
1    34
Name: count, dtype: int64
Correct Prediction
0    65
1    35
Name: count, dtype: int64


In [27]:
sbert_def_kw = comparer.extractbestmatch(keytext, defimdrf, 'CatOneIMDRF')
sbert_def_kwavg = comparer.extractbestmatch(keytextavg, defimdrf, 'CatOneIMDRF')

Correct Prediction
1    55
0    45
Name: count, dtype: int64
Correct Prediction
1    55
0    45
Name: count, dtype: int64


In [28]:
defkw = df_imdrf.apply(lambda x: weighted_avg(x['DefKW'], x['simDefKW']), axis=1)
defkwavg = df_imdrf['DefKW'].apply(normal_avg)

In [29]:
prova = comparer.extractbestmatch(keytext, defkw, 'CatOneIMDRF')
prova1 = comparer.extractbestmatch(keytextavg, defkwavg, 'CatOneIMDRF')
prova2 = comparer.extractbestmatch(keytext, defkwavg, 'CatOneIMDRF')
prova3 = comparer.extractbestmatch(keytextavg, defkw, 'CatOneIMDRF')

  a = torch.tensor(a)


Correct Prediction
0    54
1    46
Name: count, dtype: int64
Correct Prediction
0    54
1    46
Name: count, dtype: int64
Correct Prediction
0    54
1    46
Name: count, dtype: int64
Correct Prediction
0    54
1    46
Name: count, dtype: int64


### BioClinicalBERT

In [154]:
bcb_multikey = extractbestfit(keyemb, multiterm, 'CatOneIMDRF')

Correct Prediction
0    80
1    20
Name: count, dtype: int64


In [155]:
bcb_defkey = extractbestfit(keyemb, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    100
Name: count, dtype: int64


In [159]:
keydef

array([[[ 0.23155749,  0.42346317, -0.28617242, ...,  0.07774629,
          0.3985537 , -0.09858841]],

       [[ 0.24650027,  0.5146463 , -0.15345092, ..., -0.01318007,
          0.53123015, -0.01704025]],

       [[ 0.35450855,  0.48198155, -0.15754001, ..., -0.23693207,
          0.3843452 , -0.16703849]],

       ...,

       [[ 0.4327824 ,  0.4073781 , -0.19794036, ..., -0.15663508,
          0.1246662 , -0.08677831]],

       [[ 0.45247906,  0.3986438 , -0.35425413, ..., -0.13332123,
          0.15436204, -0.19034617]],

       [[ 0.44606107,  0.2969163 , -0.16692372, ...,  0.05608018,
          0.0501839 , -0.32752177]]], dtype=float32)

normal stuff

In [136]:
normal = bcbcomparer.compute_bioclinical_embeddings(sub_df['FOI_TEXT'], 'cls')
pun = bcbcomparer.compute_bioclinical_embeddings(sub_df['Cleaned'], 'cls')
nopun = bcbcomparer.compute_bioclinical_embeddings(sub_df['TEXT_CLEAN'], 'cls')

In [137]:
normal_single = extractbestfit(normal, singleterm, 'CatOneIMDRF')
normal_multi = extractbestfit(normal, multiterm, 'CatOneIMDRF')
normal_def = extractbestfit(normal, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    84
1    16
Name: count, dtype: int64
Correct Prediction
0    84
1    16
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64


In [138]:
punsingle = extractbestfit(pun, singleterm, 'CatOneIMDRF')
punmulti = extractbestfit(pun, multiterm, 'CatOneIMDRF')
pundef = extractbestfit(pun, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    80
1    20
Name: count, dtype: int64
Correct Prediction
0    86
1    14
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64


In [139]:
nopunsingle = extractbestfit(nopun, singleterm, 'CatOneIMDRF')
nopunmulti = extractbestfit(nopun, multiterm, 'CatOneIMDRF')
nopundef = extractbestfit(nopun, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    80
1    20
Name: count, dtype: int64
Correct Prediction
0    81
1    19
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64


In [140]:
bcbf2 = sub_df['FOI_TEXT'].apply(lambda x: preprocessor.first_two_sentences(x))
bcbf2 = bcbcomparer.compute_bioclinical_embeddings(bcbf2, 'cls')

In [141]:
bcbsingle = extractbestfit(bcbf2, singleterm, 'CatOneIMDRF')
bcbmulti = extractbestfit(bcbf2, multiterm, 'CatOneIMDRF')
bcbdef = extractbestfit(bcbf2, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    85
1    15
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    83
1    17
Name: count, dtype: int64


In [142]:
trims = sub_df['FOI_TEXT'].apply(lambda x: preprocessor.trimtextlen(x))
trims = bcbcomparer.compute_bioclinical_embeddings(trims, 'cls')

In [143]:
trimsingle = extractbestfit(trims, singleterm, 'CatOneIMDRF')
trimmulti = extractbestfit(trims, multiterm, 'CatOneIMDRF')
trimdef = extractbestfit(trims, defimdrf, 'CatOneIMDRF')

Correct Prediction
0    80
1    20
Name: count, dtype: int64
Correct Prediction
0    86
1    14
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64


## Final Approach

In [18]:
from util.embedrank_sbert import EmbedRank

embed_rank = EmbedRank()

### Sentence Transformers

In [52]:
from sentence_transformers import SentenceTransformer

#model = SentenceTransformer("all-MiniLM-L6-v2")
#model = SentenceTransformer('pritamdeka/S-PubMedBert-MS-MARCO')
#model = SentenceTransformer("neuml/pubmedbert-base-embeddings")
#model = SentenceTransformer('TimKond/S-PubMedBert-MedQuAD')
model = SentenceTransformer('pritamdeka/BioBERT-mnli-snli-scinli-scitail-mednli-stsb')

Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))


In [53]:
from util.prepemb import SbertComparer

comparer = SbertComparer(model=model, df=sub_df, df_imdrf=df_imdrf1st)

In [54]:
singleterm = model.encode(df_imdrf1st['single_term'].to_list())
multiterm = model.encode(df_imdrf1st['multiple_terms'].to_list())
defimdrf = model.encode(df_imdrf1st['Definition'].to_list())

In [55]:
defkw_wt = df_imdrf1st['Definition'].apply(
    lambda x: embed_rank.getKWembedding(document=x, model=model, weight_flag=True)
).to_list()

defkw_wt_array = np.array(defkw_wt)
defkw_wt_array = np.squeeze(defkw_wt_array, axis=1)

defkw_avg = df_imdrf1st['Definition'].apply(
    lambda x: embed_rank.getKWembedding(document=x, model=model, weight_flag=False)
).to_list()

defkw_avg_array = np.array(defkw_avg)
defkw_avg_array = np.squeeze(defkw_avg_array, axis=1)

In [56]:
normaltext = model.encode(sub_df['FOI_TEXT'].to_list())
cleanpun = model.encode(sub_df['Cleaned'].to_list())
nopun = model.encode(sub_df['NoPunctuation'].to_list())

cleanstop10 = model.encode(sub_df['NoStop10'].to_list())
cleanstop20 = model.encode(sub_df['NoStop20'].to_list())
cleanstop30 = model.encode(sub_df['NoStop30'].to_list())

f2sent = model.encode(sub_df['f2sent'].to_list())
trimsent = model.encode(sub_df['trimsent'].to_list())
cleanf2 = model.encode(sub_df['cleanf2'].to_list())
cleantrim = model.encode(sub_df['cleantrim'].to_list())
nopunf2 = model.encode(sub_df['nopunf2'].to_list())
nopuntrim = model.encode(sub_df['nopuntrim'].to_list())

kw_wt = sub_df['FOI_TEXT'].apply(lambda x: embed_rank.getKWembedding(document=x, model=model, weight_flag=True)).to_list()
kw_avg = sub_df['FOI_TEXT'].apply(lambda x: embed_rank.getKWembedding(document=x, model=model, weight_flag=False)).to_list()

kw_wt_array = np.array(kw_wt)
kw_avg_array = np.array(kw_avg)
kw_wt_array = np.squeeze(kw_wt_array, axis=1)
kw_avg_array = np.squeeze(kw_avg_array, axis=1)

In [60]:
normalsingle = comparer.extractbestmatch(normaltext, singleterm, 'CatOneIMDRF')
normalmulti = comparer.extractbestmatch(normaltext, multiterm, 'CatOneIMDRF')
normaldef = comparer.extractbestmatch(normaltext, defimdrf, 'CatOneIMDRF')
normaldefwt = comparer.extractbestmatch(normaltext, defkw_wt_array, 'CatOneIMDRF')
normaldefavg = comparer.extractbestmatch(normaltext, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
1    60
0    40
Name: count, dtype: int64
Correct Prediction
0    55
1    45
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64
Correct Prediction
0    53
1    47
Name: count, dtype: int64
Correct Prediction
0    58
1    42
Name: count, dtype: int64


In [32]:
cleansingle = comparer.extractbestmatch(cleanpun, singleterm, 'CatOneIMDRF')
cleanmulti = comparer.extractbestmatch(cleanpun, multiterm, 'CatOneIMDRF')
cleandef = comparer.extractbestmatch(cleanpun, defimdrf, 'CatOneIMDRF')
cleandefwt = comparer.extractbestmatch(cleanpun, defkw_wt_array, 'CatOneIMDRF')
cleandefavg = comparer.extractbestmatch(cleanpun, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    59
1    41
Name: count, dtype: int64
Correct Prediction
0    57
1    43
Name: count, dtype: int64
Correct Prediction
0    62
1    38
Name: count, dtype: int64
Correct Prediction
1    59
0    41
Name: count, dtype: int64
Correct Prediction
1    61
0    39
Name: count, dtype: int64


In [27]:
nopunsingle = comparer.extractbestmatch(nopun, singleterm, 'CatOneIMDRF')
nopunmulti = comparer.extractbestmatch(nopun, multiterm, 'CatOneIMDRF')
nopundef = comparer.extractbestmatch(nopun, defimdrf, 'CatOneIMDRF')
nopundefwt = comparer.extractbestmatch(nopun, defkw_wt_array, 'CatOneIMDRF')
nopundefavg = comparer.extractbestmatch(nopun, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    97
1     3
Name: count, dtype: int64
Correct Prediction
0    97
1     3
Name: count, dtype: int64
Correct Prediction
0    90
1    10
Name: count, dtype: int64
Correct Prediction
0    86
1    14
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64


In [57]:
nostop10single = comparer.extractbestmatch(cleanstop10, singleterm, 'CatOneIMDRF')
nostop10multi = comparer.extractbestmatch(cleanstop10, multiterm, 'CatOneIMDRF')
nostop10def = comparer.extractbestmatch(cleanstop10, defimdrf, 'CatOneIMDRF')
nostop10defwt = comparer.extractbestmatch(cleanstop10, defkw_wt_array, 'CatOneIMDRF')
nostop10defavg = comparer.extractbestmatch(cleanstop10, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    74
1    26
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64


In [58]:
nostop20single = comparer.extractbestmatch(cleanstop20, singleterm, 'CatOneIMDRF')
nostop20multi = comparer.extractbestmatch(cleanstop20, multiterm, 'CatOneIMDRF')
nostop20def = comparer.extractbestmatch(cleanstop20, defimdrf, 'CatOneIMDRF')
nostop20defwt = comparer.extractbestmatch(cleanstop20, defkw_wt_array, 'CatOneIMDRF')
nostop20defavg = comparer.extractbestmatch(cleanstop20, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    89
1    11
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64
Correct Prediction
0    82
1    18
Name: count, dtype: int64
Correct Prediction
0    90
1    10
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64


In [59]:
nostop30single = comparer.extractbestmatch(cleanstop30, singleterm, 'CatOneIMDRF')
nostop30multi = comparer.extractbestmatch(cleanstop30, multiterm, 'CatOneIMDRF')
nostop30def = comparer.extractbestmatch(cleanstop30, defimdrf, 'CatOneIMDRF')
nostop30defwt = comparer.extractbestmatch(cleanstop30, defkw_wt_array, 'CatOneIMDRF')
nostop30defavg = comparer.extractbestmatch(cleanstop30, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    77
1    23
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64


In [148]:
f2single = comparer.extractbestmatch(f2sent, singleterm, 'CatOneIMDRF')
f2multi = comparer.extractbestmatch(f2sent, multiterm, 'CatOneIMDRF')
f2def = comparer.extractbestmatch(f2sent, defimdrf, 'CatOneIMDRF')
f2defwt = comparer.extractbestmatch(f2sent, defkw_wt_array, 'CatOneIMDRF')
f2defavg = comparer.extractbestmatch(f2sent, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    96
1     4
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64
Correct Prediction
0    66
1    34
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64


In [149]:
trimsingle = comparer.extractbestmatch(trimsent, singleterm, 'CatOneIMDRF')
trimmulti = comparer.extractbestmatch(trimsent, multiterm, 'CatOneIMDRF')
trimdef = comparer.extractbestmatch(trimsent, defimdrf, 'CatOneIMDRF')
trimdefwt = comparer.extractbestmatch(trimsent, defkw_wt_array, 'CatOneIMDRF')
trimdefavg = comparer.extractbestmatch(trimsent, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    72
1    28
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64


In [150]:
cleanf2single = comparer.extractbestmatch(cleanf2, singleterm, 'CatOneIMDRF')
cleanf2multi = comparer.extractbestmatch(cleanf2, multiterm, 'CatOneIMDRF')
cleanf2def = comparer.extractbestmatch(cleanf2, defimdrf, 'CatOneIMDRF')
cleanf2defwt = comparer.extractbestmatch(cleanf2, defkw_wt_array, 'CatOneIMDRF')
cleanf2defavg = comparer.extractbestmatch(cleanf2, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    61
1    39
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64


In [151]:
cleantrimsingle = comparer.extractbestmatch(cleantrim, singleterm, 'CatOneIMDRF')
cleantrimmulti = comparer.extractbestmatch(cleantrim, multiterm, 'CatOneIMDRF')
cleantrimdef = comparer.extractbestmatch(cleantrim, defimdrf, 'CatOneIMDRF')
cleantrimdefwt = comparer.extractbestmatch(cleantrim, defkw_wt_array, 'CatOneIMDRF')
cleantrimdefavg = comparer.extractbestmatch(cleantrim, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    89
1    11
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64
Correct Prediction
0    70
1    30
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64


In [152]:
nopunf2single = comparer.extractbestmatch(nopunf2, singleterm, 'CatOneIMDRF')
nopunf2multi = comparer.extractbestmatch(nopunf2, multiterm, 'CatOneIMDRF')
nopunf2def = comparer.extractbestmatch(nopunf2, defimdrf, 'CatOneIMDRF')
nopunf2defwt = comparer.extractbestmatch(nopunf2, defkw_wt_array, 'CatOneIMDRF')
nopunf2defavg = comparer.extractbestmatch(nopunf2, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    96
1     4
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64
Correct Prediction
0    63
1    37
Name: count, dtype: int64
Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64


In [153]:
nopuntrimsingle = comparer.extractbestmatch(nopuntrim, singleterm, 'CatOneIMDRF')
nopuntrimmulti = comparer.extractbestmatch(nopuntrim, multiterm, 'CatOneIMDRF')
nopuntrimdef = comparer.extractbestmatch(nopuntrim, defimdrf, 'CatOneIMDRF')
nopuntrimdefwt = comparer.extractbestmatch(nopuntrim, defkw_wt_array, 'CatOneIMDRF')
nopuntrimdefavg = comparer.extractbestmatch(nopuntrim, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    69
1    31
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64


In [154]:
kwwtsingle = comparer.extractbestmatch(kw_wt_array, singleterm, 'CatOneIMDRF')
kwwtmulti = comparer.extractbestmatch(kw_wt_array, multiterm, 'CatOneIMDRF')
kwwtdef = comparer.extractbestmatch(kw_wt_array, defimdrf, 'CatOneIMDRF')
kwwtdefwt = comparer.extractbestmatch(kw_wt_array, defkw_wt_array, 'CatOneIMDRF')
kwwtdefavg = comparer.extractbestmatch(kw_wt_array, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    90
1    10
Name: count, dtype: int64
Correct Prediction
0    90
1    10
Name: count, dtype: int64
Correct Prediction
0    60
1    40
Name: count, dtype: int64
Correct Prediction
0    72
1    28
Name: count, dtype: int64
Correct Prediction
0    69
1    31
Name: count, dtype: int64


In [155]:
kwavgsingle = comparer.extractbestmatch(kw_avg_array, singleterm, 'CatOneIMDRF')
kwavgmulti = comparer.extractbestmatch(kw_avg_array, multiterm, 'CatOneIMDRF')
kwavgdef = comparer.extractbestmatch(kw_avg_array, defimdrf, 'CatOneIMDRF')
kwavgdefwt = comparer.extractbestmatch(kw_avg_array, defkw_wt_array, 'CatOneIMDRF')
kwavgdefavg = comparer.extractbestmatch(kw_avg_array, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    66
1    34
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64
Correct Prediction
0    78
1    22
Name: count, dtype: int64


### Transformers

In [60]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")

In [94]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

In [140]:
from transformers import AutoTokenizer, AutoModel
model = AutoModel.from_pretrained("ncbi/MedCPT-Query-Encoder")
tokenizer = AutoTokenizer.from_pretrained("ncbi/MedCPT-Query-Encoder")

In [61]:
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

Some weights of the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [141]:
from util.prepemb import BCBComparer
bcbcomparer = BCBComparer(df=sub_df, df_imdrf=df_imdrf1st, tokenizer=tokenizer, model=model)

In [164]:
#token = 'cls'
token = 'mean'

In [165]:
normalbcb = bcbcomparer.compute_bioclinical_embeddings(sub_df['FOI_TEXT'], token)
cleanbcb = bcbcomparer.compute_bioclinical_embeddings(sub_df['Cleaned'], token)
nopunbcb = bcbcomparer.compute_bioclinical_embeddings(sub_df['NoPunctuation'], token)

In [166]:
nostop10 = bcbcomparer.compute_bioclinical_embeddings(sub_df['NoStop10'], token)
nostop20 = bcbcomparer.compute_bioclinical_embeddings(sub_df['NoStop20'], token)
nostop30 = bcbcomparer.compute_bioclinical_embeddings(sub_df['NoStop30'], token)

In [167]:
f2sent = bcbcomparer.compute_bioclinical_embeddings(sub_df['f2sent'], token)
trimsent = bcbcomparer.compute_bioclinical_embeddings(sub_df['trimsent'], token)
cleanf2 = bcbcomparer.compute_bioclinical_embeddings(sub_df['cleanf2'], token)
cleantrim = bcbcomparer.compute_bioclinical_embeddings(sub_df['cleantrim'], token)
nopunf2 = bcbcomparer.compute_bioclinical_embeddings(sub_df['nopunf2'], token)
nopuntrim = bcbcomparer.compute_bioclinical_embeddings(sub_df['nopuntrim'], token)

In [168]:
defkw_wt = df_imdrf1st['Definition'].apply(
    lambda x: embed_rank.getKWembedding(document=x, model=model, tokenizer=tokenizer, strategy=token, weight_flag=True)
).to_list()

defkw_wt_array = np.array(defkw_wt)
defkw_wt_array = np.squeeze(defkw_wt_array, axis=1)

defkw_avg = df_imdrf1st['Definition'].apply(
    lambda x: embed_rank.getKWembedding(document=x, model=model, tokenizer=tokenizer, strategy=token, weight_flag=False)
).to_list()

defkw_avg_array = np.array(defkw_avg)
defkw_avg_array = np.squeeze(defkw_avg_array, axis=1)

In [169]:
singleterm = bcbcomparer.compute_bioclinical_embeddings(df_imdrf1st['single_term'], token)
multiterm = bcbcomparer.compute_bioclinical_embeddings(df_imdrf1st['multiple_terms'], token)
defimdrf = bcbcomparer.compute_bioclinical_embeddings(df_imdrf1st['Definition'], token)

In [170]:
kw_wt = sub_df['FOI_TEXT'].apply(lambda x: embed_rank.getKWembedding(document=x, model=model, tokenizer=tokenizer, strategy=token, weight_flag=True)).to_list()
kw_avg = sub_df['FOI_TEXT'].apply(lambda x: embed_rank.getKWembedding(document=x, model=model, tokenizer=tokenizer, strategy=token, weight_flag=False)).to_list()

kw_wt_array = np.array(kw_wt)
kw_avg_array = np.array(kw_avg)
kw_wt_array = np.squeeze(kw_wt_array, axis=1)
kw_avg_array = np.squeeze(kw_avg_array, axis=1)

In [171]:
normalsingle = bcbcomparer.extractbestfit(normalbcb, singleterm, 'CatOneIMDRF')
normalmulti = bcbcomparer.extractbestfit(normalbcb, multiterm, 'CatOneIMDRF')
normaldef = bcbcomparer.extractbestfit(normalbcb, defimdrf, 'CatOneIMDRF')
normaldefwt = bcbcomparer.extractbestfit(normalbcb, defkw_wt_array, 'CatOneIMDRF')
normaldefavg = bcbcomparer.extractbestfit(normalbcb, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64


In [172]:
cleansingle = bcbcomparer.extractbestfit(cleanbcb, singleterm, 'CatOneIMDRF')
cleanmulti = bcbcomparer.extractbestfit(cleanbcb, multiterm, 'CatOneIMDRF')
cleandef = bcbcomparer.extractbestfit(cleanbcb, defimdrf, 'CatOneIMDRF')
cleandefwt = bcbcomparer.extractbestfit(cleanbcb, defkw_wt_array, 'CatOneIMDRF')
cleandefavg = bcbcomparer.extractbestfit(cleanbcb, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64


In [173]:
nopunsingle = bcbcomparer.extractbestfit(nopunbcb, singleterm, 'CatOneIMDRF')
nopunmulti = bcbcomparer.extractbestfit(nopunbcb, multiterm, 'CatOneIMDRF')
nopundef = bcbcomparer.extractbestfit(nopunbcb, defimdrf, 'CatOneIMDRF')
nopundefwt = bcbcomparer.extractbestfit(nopunbcb, defkw_wt_array, 'CatOneIMDRF')
nopundefavg = bcbcomparer.extractbestfit(nopunbcb, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64


In [174]:
nostop10single = bcbcomparer.extractbestfit(nostop10, singleterm, 'CatOneIMDRF')
nostop10multi = bcbcomparer.extractbestfit(nostop10, multiterm, 'CatOneIMDRF')
nostop10def = bcbcomparer.extractbestfit(nostop10, defimdrf, 'CatOneIMDRF')
nostop10defwt = bcbcomparer.extractbestfit(nostop10, defkw_wt_array, 'CatOneIMDRF')
nostop10defavg = bcbcomparer.extractbestfit(nostop10, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64


In [175]:
nostop20single = bcbcomparer.extractbestfit(nostop20, singleterm, 'CatOneIMDRF')
nostop20multi = bcbcomparer.extractbestfit(nostop20, multiterm, 'CatOneIMDRF')
nostop20def = bcbcomparer.extractbestfit(nostop20, defimdrf, 'CatOneIMDRF')
nostop20defwt = bcbcomparer.extractbestfit(nostop20, defkw_wt_array, 'CatOneIMDRF')
nostop20defavg = bcbcomparer.extractbestfit(nostop20, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    98
1     2
Name: count, dtype: int64


In [176]:
nostop30single = bcbcomparer.extractbestfit(nostop30, singleterm, 'CatOneIMDRF')
nostop30multi = bcbcomparer.extractbestfit(nostop30, multiterm, 'CatOneIMDRF')
nostop30def = bcbcomparer.extractbestfit(nostop30, defimdrf, 'CatOneIMDRF')
nostop30defwt = bcbcomparer.extractbestfit(nostop30, defkw_wt_array, 'CatOneIMDRF')
nostop30defavg = bcbcomparer.extractbestfit(nostop30, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    97
1     3
Name: count, dtype: int64


In [177]:
f2single = bcbcomparer.extractbestfit(f2sent, singleterm, 'CatOneIMDRF')
f2multi = bcbcomparer.extractbestfit(f2sent, multiterm, 'CatOneIMDRF')
f2def = bcbcomparer.extractbestfit(f2sent, defimdrf, 'CatOneIMDRF')
f2defwt = bcbcomparer.extractbestfit(f2sent, defkw_wt_array, 'CatOneIMDRF')
f2defavg = bcbcomparer.extractbestfit(f2sent, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64


In [178]:
trimsingle = bcbcomparer.extractbestfit(trimsent, singleterm, 'CatOneIMDRF')
trimmulti = bcbcomparer.extractbestfit(trimsent, multiterm, 'CatOneIMDRF')
trimdef = bcbcomparer.extractbestfit(trimsent, defimdrf, 'CatOneIMDRF')
trimdefwt = bcbcomparer.extractbestfit(trimsent, defkw_wt_array, 'CatOneIMDRF')
trimdefavg = bcbcomparer.extractbestfit(trimsent, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64


In [179]:
cleanf2single = bcbcomparer.extractbestfit(cleanf2, singleterm, 'CatOneIMDRF')
cleanf2multi = bcbcomparer.extractbestfit(cleanf2, multiterm, 'CatOneIMDRF')
cleanf2def = bcbcomparer.extractbestfit(cleanf2, defimdrf, 'CatOneIMDRF')
cleanf2defwt = bcbcomparer.extractbestfit(cleanf2, defkw_wt_array, 'CatOneIMDRF')
cleanf2defavg = bcbcomparer.extractbestfit(cleanf2, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    92
1     8
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64


In [180]:
cleantrimsingle = bcbcomparer.extractbestfit(cleantrim, singleterm, 'CatOneIMDRF')
cleantrimmulti = bcbcomparer.extractbestfit(cleantrim, multiterm, 'CatOneIMDRF')
cleantrimdef = bcbcomparer.extractbestfit(cleantrim, defimdrf, 'CatOneIMDRF')
cleantrimdefwt = bcbcomparer.extractbestfit(cleantrim, defkw_wt_array, 'CatOneIMDRF')
cleantrimdefavg = bcbcomparer.extractbestfit(cleantrim, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    94
1     6
Name: count, dtype: int64


In [181]:
nopunf2single = bcbcomparer.extractbestfit(nopunf2, singleterm, 'CatOneIMDRF')
nopunf2multi = bcbcomparer.extractbestfit(nopunf2, multiterm, 'CatOneIMDRF')
nopunf2def = bcbcomparer.extractbestfit(nopunf2, defimdrf, 'CatOneIMDRF')
nopunf2defwt = bcbcomparer.extractbestfit(nopunf2, defkw_wt_array, 'CatOneIMDRF')
nopunf2defavg = bcbcomparer.extractbestfit(nopunf2, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    95
1     5
Name: count, dtype: int64
Correct Prediction
0    96
1     4
Name: count, dtype: int64


In [182]:
nopuntrimsingle = bcbcomparer.extractbestfit(nopuntrim, singleterm, 'CatOneIMDRF')
nopuntrimmulti = bcbcomparer.extractbestfit(nopuntrim, multiterm, 'CatOneIMDRF')
nopuntrimdef = bcbcomparer.extractbestfit(nopuntrim, defimdrf, 'CatOneIMDRF')
nopuntrimdefwt = bcbcomparer.extractbestfit(nopuntrim, defkw_wt_array, 'CatOneIMDRF')
nopuntrimdefavg = bcbcomparer.extractbestfit(nopuntrim, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64


In [183]:
kwwtsingle = bcbcomparer.extractbestfit(kw_wt_array, singleterm, 'CatOneIMDRF')
kwwtmulti = bcbcomparer.extractbestfit(kw_wt_array, multiterm, 'CatOneIMDRF')
kwwtdef = bcbcomparer.extractbestfit(kw_wt_array, defimdrf, 'CatOneIMDRF')
kwwtdefwt = bcbcomparer.extractbestfit(kw_wt_array, defkw_wt_array, 'CatOneIMDRF')
kwwtdefavg = bcbcomparer.extractbestfit(kw_wt_array, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    87
1    13
Name: count, dtype: int64
Correct Prediction
0    88
1    12
Name: count, dtype: int64


In [184]:
kwavgsingle = bcbcomparer.extractbestfit(kw_avg_array, singleterm, 'CatOneIMDRF')
kwavgmulti = bcbcomparer.extractbestfit(kw_avg_array, multiterm, 'CatOneIMDRF')
kwavgdef = bcbcomparer.extractbestfit(kw_avg_array, defimdrf, 'CatOneIMDRF')
kwavgdefwt = bcbcomparer.extractbestfit(kw_avg_array, defkw_wt_array, 'CatOneIMDRF')
kwavgdefavg = bcbcomparer.extractbestfit(kw_avg_array, defkw_avg_array, 'CatOneIMDRF')

Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    91
1     9
Name: count, dtype: int64
Correct Prediction
0    93
1     7
Name: count, dtype: int64
Correct Prediction
0    87
1    13
Name: count, dtype: int64
Correct Prediction
0    89
1    11
Name: count, dtype: int64


In [185]:
kwavgdefavg

Unnamed: 0,Original IMDRF,Predicted IMDRF,Similarity Score,Correct Prediction
432108,A07,A16,0.929283,0
145991,A09,A09,0.891885,1
259118,A07,A12,0.878803,0
502603,A07,A12,0.837709,0
173353,A07,A09,0.847493,0
...,...,...,...,...
83969,A07,A09,0.838104,0
323755,A24,A16,0.849184,0
47378,A09,A12,0.892797,0
483388,A24,A16,0.889829,0
