# Calculate finetuned model performance on the classification task

### Load in rough responses and clean them

In [26]:
N_EXAMPLES = 100 # we only haves results on 100 examples

In [38]:
import pandas as pd
import numpy as np
from sklearn import metrics

In [6]:
response_df = pd.read_csv("output_response.csv", names=["raw_response"], header=None)
response_df.shape

(100, 1)

In [7]:
response_df

Unnamed: 0,raw_response
0,yes.
1,yes.
2,yes
3,yes.
4,summary : tailored interventions are ways to c...
...,...
95,yes. cigarette taxes are passed on to consumer...
96,summary : vertical lines in distal esophageal ...
97,yes.
98,yes.


In [19]:
temp_df = response_df

In [21]:
temp_df['yes_no_maybe'] = temp_df['raw_response'].apply(lambda x: 'yes' if 'yes' in x else ('no' if 'no' in x else 'maybe'))

In [22]:
temp_df

Unnamed: 0,raw_response,yes_no_maybe
0,yes.,yes
1,yes.,yes
2,yes,yes
3,yes.,yes
4,summary : tailored interventions are ways to c...,maybe
...,...,...
95,yes. cigarette taxes are passed on to consumer...,yes
96,summary : vertical lines in distal esophageal ...,no
97,yes.,yes
98,yes.,yes


#### Understand the maybe

In [25]:
temp_df[temp_df['yes_no_maybe'] == 'maybe']

Unnamed: 0,raw_response,yes_no_maybe
4,summary : tailored interventions are ways to c...,maybe
9,summary : a short stay or 23-hour ward is a ho...,maybe
11,summary : therapeutic anticoagulation is the u...,maybe
14,summary : emergency ultrasound fellowship prog...,maybe
20,summary : israeli hospitals are well prepared ...,maybe
21,summary : acute respiratory distress syndrome ...,maybe
37,summary : a pap smear is a test to check for c...,maybe
38,summary : pollen is a fine powder that comes f...,maybe
54,summary : a sternal fracture is a break in the...,maybe
56,summary : adjuvant chemotherapy is given after...,maybe


### Load true test set and binarize

In [8]:
test_df = pd.read_csv("cleaned_pubmed_qa_all.csv")
test_df.head()

Unnamed: 0,pubid,instruction,context,long_answer,output,num_characters_instruction,num_characters_output
0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes,90,3
1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no,68,2
2,9488747,"Syncope during bathing in infants, a pediatric...",{'contexts': ['Apparent life-threatening event...,"""Aquagenic maladies"" could be a pediatric form...",yes,79,3
3,17208539,Are the long-term results of the transanal pul...,{'contexts': ['The transanal endorectal pull-t...,Our long-term study showed significantly bette...,no,106,2
4,10808977,Can tailored interventions increase mammograph...,{'contexts': ['Telephone counseling and tailor...,The effects of the intervention were most pron...,yes,68,3


In [33]:
def binarize(df, input_colname, output_colname):
  df[output_colname] = 0
  df.loc[df[input_colname] == "yes", output_colname] = 1
  print("Total rows:", df.shape[0])
  print("Total rows with yes:", df[df[output_colname]==1].shape[0])
  print("Total rows with no:", df[df[output_colname]==0].shape[0])
  return df

In [17]:
temp = test_df
temp2 = binarize(df=temp, input_colname= "output", outut_colname="y_true")
temp2

Total rows: 1000
Total rows with yes: 552
Total rows with no: 448


Unnamed: 0,pubid,instruction,context,long_answer,output,num_characters_instruction,num_characters_output,y_true
0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes,90,3,1
1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no,68,2,0
2,9488747,"Syncope during bathing in infants, a pediatric...",{'contexts': ['Apparent life-threatening event...,"""Aquagenic maladies"" could be a pediatric form...",yes,79,3,1
3,17208539,Are the long-term results of the transanal pul...,{'contexts': ['The transanal endorectal pull-t...,Our long-term study showed significantly bette...,no,106,2,0
4,10808977,Can tailored interventions increase mammograph...,{'contexts': ['Telephone counseling and tailor...,The effects of the intervention were most pron...,yes,68,3,1
...,...,...,...,...,...,...,...,...
995,8921484,Does gestational age misclassification explain...,"{'contexts': ['After 34 weeks gestation, summa...",Gestational age misclassification is an unlike...,no,115,2,0
996,16564683,Is there any interest to perform ultrasonograp...,{'contexts': ['To evaluate the accuracy of ult...,Sonography has no place in the diagnosis of un...,no,81,2,0
997,23147106,Is peak concentration needed in therapeutic dr...,{'contexts': ['We analyzed the pharmacokinetic...,These results suggest little need to use peak ...,no,74,2,0
998,21550158,Can autologous platelet-rich plasma gel enhanc...,{'contexts': ['This investigation assesses the...,"The PRP group recorded reduced pain, swelling,...",yes,109,3,1


### Line up predictions and true values

In [31]:
score_df = temp2.head(N_EXAMPLES)
score_df['model_response'] = temp_df.yes_no_maybe

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_df['model_response'] = temp_df.yes_no_maybe


In [36]:
score_df = binarize(score_df, input_colname='model_response', output_colname='y_pred')
score_df

Total rows: 100
Total rows with yes: 60
Total rows with no: 40


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[output_colname] = 0


Unnamed: 0,pubid,instruction,context,long_answer,output,num_characters_instruction,num_characters_output,y_true,model_response,y_pred
0,21645374,Do mitochondria play a role in remodelling lac...,{'contexts': ['Programmed cell death (PCD) is ...,Results depicted mitochondrial dynamics in viv...,yes,90,3,1,yes,1
1,16418930,Landolt C and snellen e acuity: differences in...,{'contexts': ['Assessment of visual acuity dep...,"Using the charts described, there was only a s...",no,68,2,0,yes,1
2,9488747,"Syncope during bathing in infants, a pediatric...",{'contexts': ['Apparent life-threatening event...,"""Aquagenic maladies"" could be a pediatric form...",yes,79,3,1,yes,1
3,17208539,Are the long-term results of the transanal pul...,{'contexts': ['The transanal endorectal pull-t...,Our long-term study showed significantly bette...,no,106,2,0,yes,1
4,10808977,Can tailored interventions increase mammograph...,{'contexts': ['Telephone counseling and tailor...,The effects of the intervention were most pron...,yes,68,3,1,maybe,0
...,...,...,...,...,...,...,...,...,...,...
95,23076787,Can increases in the cigarette tax rate be lin...,"{'contexts': [""To explain China's cigarette pr...",Numerous studies have found that taxation is o...,no,77,2,0,yes,1
96,9199905,Vertical lines in distal esophageal mucosa (VL...,{'contexts': ['We observed an endoscopic abnor...,Histology usually demonstrated moderate to sev...,yes,110,3,1,no,0
97,23999452,Does hypoglycaemia increase the risk of cardio...,{'contexts': ['Hypoglycaemia caused by glucose...,Severe hypoglycaemia is associated with an inc...,yes,62,3,1,yes,1
98,22534881,Does the radiographic transition zone correlat...,"{'contexts': [""The correlation between radiogr...",Correlation between level of radiographic tran...,no,122,2,0,yes,1


In [37]:
print(score_df.shape)
score_df = score_df[score_df['model_response'] != 'maybe']
print(score_df.shape)

(100, 10)
(83, 10)


### Score the model

In [39]:
metrics.accuracy_score(y_true=score_df['y_true'], y_pred=score_df['y_pred'])

0.5421686746987951

In [40]:
metrics.precision_score(y_true=score_df['y_true'], y_pred=score_df['y_pred'])

0.5833333333333334

In [41]:
metrics.recall_score(y_true=score_df['y_true'], y_pred=score_df['y_pred'])

0.7291666666666666