# Imports and Setup

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  """Wrap *text* in an IPython ``Markdown`` object rendered as a block quote.

  Every '•' bullet character is rewritten to an indented markdown list
  marker ('  *'), and every line (including blank ones) is prefixed
  with '> '.
  """
  with_list_markers = text.replace('•', '  *')
  quoted = textwrap.indent(
      with_list_markers,
      '> ',
      # Always true so that empty lines are quoted as well.
      predicate=lambda _line: True,
  )
  return Markdown(quoted)

from google.generativeai.types import HarmCategory, HarmBlockThreshold

import os

# Read the key from the GOOGLE_API_KEY environment variable when it is set, so
# a real credential never has to be pasted into (and saved with) the notebook.
# The original placeholder is kept as the fallback value.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY', 'YOUR API KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Model handle shared by the token-counting and generation cells.
model = genai.GenerativeModel('gemini-pro')

In [None]:
# Few-shot prompt for sentence-level classification of medical abstracts.
# Layout of the literal, top to bottom:
#   1. task description and the five allowed labels;
#   2. ten labelled example abstracts, one sentence per line, written as
#      LABEL<tab>sentence;
#   3. the question plus one unlabelled target abstract;
#   4. a closing reminder to answer with labels only.
# The format sentence used to say "LABEL,text" although every example line is
# tab-separated; it now describes the format the examples actually use.
# NOTE(review): the target abstract is embedded in this literal, so any code
# that appends more text to `prompt` places it after the closing reminder.
prompt = '''
This is a sequential classification task for medical abstracts. There are only 5 labels, namely: OBJECTIVE, BACKGROUND, METHODS, RESULTS, CONCLUSIONS. I want to classify each sentence with its corresponding label. Note that each sentence corresponds to ONE AND ONLY ONE LABEL.

  Below are 10 example abstracts. Each line has the format LABEL<TAB>text, i.e. the label, a tab character, then the sentence. Note that within one abstract the sentences are related to each other, but the abstracts are not related to each other:

  ABSTRACT 1:
  OBJECTIVE	To investigate the efficacy of 6 weeks of daily low-dose oral prednisolone in improving pain , mobility , and systemic low-grade inflammation in the short term and whether the effect would be sustained at 12 weeks in older adults with moderate to severe knee osteoarthritis ( OA ) .
  METHODS	A total of 125 patients with primary knee OA were randomized 1:1 ; 63 received 7.5 mg/day of prednisolone and 62 received placebo for 6 weeks .
  METHODS	Outcome measures included pain reduction and improvement in function scores and systemic inflammation markers .
  METHODS	Pain was assessed using the visual analog pain scale ( 0-100 mm ) .
  METHODS	Secondary outcome measures included the Western Ontario and McMaster Universities Osteoarthritis Index scores , patient global assessment ( PGA ) of the severity of knee OA , and 6-min walk distance ( 6MWD ) .
  METHODS	Serum levels of interleukin 1 ( IL-1 ) , IL-6 , tumor necrosis factor ( TNF ) - , and high-sensitivity C-reactive protein ( hsCRP ) were measured .
  RESULTS	There was a clinically relevant reduction in the intervention group compared to the placebo group for knee pain , physical function , PGA , and 6MWD at 6 weeks .
  RESULTS	The mean difference between treatment arms ( 95 % CI ) was 10.9 ( 4.8-18 .0 ) , p < 0.001 ; 9.5 ( 3.7-15 .4 ) , p < 0.05 ; 15.7 ( 5.3-26 .1 ) , p < 0.001 ; and 86.9 ( 29.8-144 .1 ) , p < 0.05 , respectively .
  RESULTS	Further , there was a clinically relevant reduction in the serum levels of IL-1 , IL-6 , TNF - , and hsCRP at 6 weeks in the intervention group when compared to the placebo group .
  RESULTS	These differences remained significant at 12 weeks .
  RESULTS	The Outcome Measures in Rheumatology Clinical Trials-Osteoarthritis Research Society International responder rate was 65 % in the intervention group and 34 % in the placebo group ( p < 0.05 ) .
  CONCLUSIONS	Low-dose oral prednisolone had both a short-term and a longer sustained effect resulting in less knee pain , better physical function , and attenuation of systemic inflammation in older patients with knee OA ( ClinicalTrials.gov identifier NCT01619163 ) .

  ABSTRACT 2:
  BACKGROUND	Emotional eating is associated with overeating and the development of obesity .
  BACKGROUND	Yet , empirical evidence for individual ( trait ) differences in emotional eating and cognitive mechanisms that contribute to eating during sad mood remain equivocal .
  OBJECTIVE	The aim of this study was to test if attention bias for food moderates the effect of self-reported emotional eating during sad mood ( vs neutral mood ) on actual food intake .
  OBJECTIVE	It was expected that emotional eating is predictive of elevated attention for food and higher food intake after an experimentally induced sad mood and that attentional maintenance on food predicts food intake during a sad versus a neutral mood .
  METHODS	Participants ( N = 85 ) were randomly assigned to one of the two experimental mood induction conditions ( sad/neutral ) .
  METHODS	Attentional biases for high caloric foods were measured by eye tracking during a visual probe task with pictorial food and neutral stimuli .
  METHODS	Self-reported emotional eating was assessed with the Dutch Eating Behavior Questionnaire ( DEBQ ) and ad libitum food intake was tested by a disguised food offer .
  RESULTS	Hierarchical multivariate regression modeling showed that self-reported emotional eating did not account for changes in attention allocation for food or food intake in either condition .
  RESULTS	Yet , attention maintenance on food cues was significantly related to increased intake specifically in the neutral condition , but not in the sad mood condition .
  CONCLUSIONS	The current findings show that self-reported emotional eating ( based on the DEBQ ) might not validly predict who overeats when sad , at least not in a laboratory setting with healthy women .
  CONCLUSIONS	Results further suggest that attention maintenance on food relates to eating motivation when in a neutral affective state , and might therefore be a cognitive mechanism contributing to increased food intake in general , but maybe not during sad mood .

  ABSTRACT 3:
  BACKGROUND	Although working smoke alarms halve deaths in residential fires , many households do not keep alarms operational .
  BACKGROUND	We tested whether theory-based education increases alarm operability .
  METHODS	Randomised multiarm trial , with a single arm randomly selected for use each day , in low-income neighbourhoods in Maryland , USA .
  METHODS	Intervention arms : ( 1 ) Full Education combining a health belief module with a social-cognitive theory module that provided hands-on practice installing alarm batteries and using the alarm 's hush button ; ( 2 ) Hands-on Practice social-cognitive module supplemented by typical fire department education ; ( 3 ) Current Norm receiving typical fire department education only .
  METHODS	Four hundred and thirty-six homes recruited through churches or by knocking on doors in 2005-2008 .
  METHODS	Follow-up visits checked alarm operability in 370 homes ( 85 % ) 1-3 .5 years after installation .
  METHODS	number of homes with working alarms defined as alarms with working batteries or hard-wired and number of working alarms per home .
  METHODS	Regressions controlled for alarm status preintervention ; demographics and beliefs about fire risks and alarm effectiveness .
  RESULTS	Homes in the Full Education and Practice arms were more likely to have a functioning smoke alarm at follow-up ( OR = 2.77 , 95 % CI 1.09 to 7.03 ) and had an average of 0.32 more working alarms per home ( 95 % CI 0.09 to 0.56 ) .
  RESULTS	Working alarms per home rose 16 % .
  RESULTS	Full Education and Practice had similar effectiveness ( p = 0.97 on both outcome measures ) .
  CONCLUSIONS	Without exceeding typical fire department installation time , installers can achieve greater smoke alarm operability .
  CONCLUSIONS	Hands-on practice is key .
  CONCLUSIONS	Two years after installation , for every three homes that received hands-on practice , one had an additional working alarm .
  BACKGROUND	http://www.clinicaltrials.gov number NCT00139126 .

  ABSTRACT 4:
  OBJECTIVE	To evaluate the performance ( efficacy , safety and acceptability ) of a new micro-adherent absorbent dressing ( UrgoClean ) compared with a hydrofiber dressing ( Aquacel ) in the local management of venous leg ulcers , in the debridement stage .
  METHODS	A non-inferiority European randomised controlled clinical trial ( RCT ) was conducted in 37 centres , on patients presenting with venous or predominantly venous , mixed aetiology leg ulcers at their sloughy stage ( with more than 70 % of the wound bed covered with slough at baseline ) .
  METHODS	Patients were followed over a 6-week period and assessed weekly .
  METHODS	The primary judgement criteria was the relative regression of the wound surface area after the 6-week treatment period .
  METHODS	Secondary endpoints were the relative reduction of sloughy tissue and the percentage of patients presenting with a debrided wound .
  RESULTS	Altogether , 159 patients were randomised to either UrgoClean ( test group ; n = 83 ) or Aquacel ( control group ; n = 76 ) dressings .
  RESULTS	Regarding the wound healing process predictive factors ( wound area , duration , ABPI value , recurrence ) , at baseline , the two groups were well balanced , for both wound and patient characteristics .
  RESULTS	Compression therapy was administered to both groups and after a median 42-day treatment period , the percentage of relative reduction of the wound surface area was very similar ( -36.9 % vs -35.4 % in the UrgoClean and control groups , respectively ) .
  RESULTS	When considering the secondary criteria at week 6 , the relative reduction of sloughy tissue was significantly higher in the UrgoClean group than in the control group ( -65.3 % vs -42,6 % ; p = 0.013 ) .
  RESULTS	The percentage of debrided wounds was also significantly higher in the test group ( 52.5 % vs 35.1 % ; p = 0.033 ) .
  CONCLUSIONS	This ` EARTH ' RCT confirmed that the UrgoClean dressing has similar efficacy and safety compared to Aquacel .
  CONCLUSIONS	However , UrgoClean also showed better autolytic properties than the control group in the management of venous leg ulcers at the sloughy stage .
  CONCLUSIONS	The new UrgoClean dressing therefore represents a promising therapeutic option within the current range of autolytic dressings available .
  BACKGROUND	This study was sponsored by a grant from the pharmaceutical company Laboratoires Urgo .
  BACKGROUND	S. Bohbot and O. Tacca are employees of Laboratoires Urgo .
  BACKGROUND	S. Meaume , J. Dissemond and G. Perceau have received monetary compensation as presenters for Laboratoires Urgo .
  BACKGROUND	Data management and statistical analyses were conducted independently by Vertical ( Paris , France ) .

  ABSTRACT 5:
  OBJECTIVE	Eye movements ( EM ) during recall of an aversive memory is a treatment element unique to Eye Movement Desensitization and Reprocessing ( EMDR ) .
  OBJECTIVE	Experimental studies have shown that EM reduce memory vividness and/or emotionality shortly after the intervention .
  OBJECTIVE	However , it is unclear whether the immediate effects of the intervention reflect actual changes in memory .
  OBJECTIVE	The aim of this study was to test whether immediate reductions in memory vividness and emotionality persist at a 24h follow up and whether the magnitude of these effects is related to the duration of the intervention .
  METHODS	Seventy-three undergraduates recalled two negative autobiographical memories , one with EM ( `` recall with EM '' ) and one without ( `` recall only '' ) .
  METHODS	Half of participants recalled each memory for four periods of 24s , the other half for eight periods of 24s .
  METHODS	Memory vividness/emotionality were self-rated at a pre-test , an immediate post-test , and a 24h follow-up test .
  RESULTS	In both duration groups , recall with EM , but not recall only , caused an immediate decrease in memory vividness .
  RESULTS	There were no immediate reductions in memory emotionality .
  RESULTS	Furthermore , only the ` eight periods ' group showed that recall with EM , but not recall only , caused a decrease in both memory emotionality and memory vividness from the pre-test to the follow-up .
  CONCLUSIONS	Only self-report measures were used .
  CONCLUSIONS	The findings suggest that recall with EM causes 24-hchanges in memory vividness/emotionality , which may explain part of the EMDR treatment effect , and these effects are related to intervention duration .

  ABSTRACT 6:
  OBJECTIVE	Few studies have tested the impact of motivational interviewing ( MI ) delivered by primary care providers on pediatric obesity .
  OBJECTIVE	This study tested the efficacy of MI delivered by providers and registered dietitians ( RDs ) to parents of overweight children aged 2 through 8 .
  METHODS	Forty-two practices from the Pediatric Research in Office Settings Network of the American Academy of Pediatrics were randomly assigned to 1 of 3 groups .
  METHODS	Group 1 ( usual care ) measured BMI percentile at baseline and 1 - and 2-year follow-up .
  METHODS	Group 2 ( provider only ) delivered 4 MI counseling sessions to parents of the index child over 2 years .
  METHODS	Group 3 ( provider + RD ) delivered 4 provider MI sessions plus 6 MI sessions from a RD. .
  METHODS	The primary outcome was child BMI percentile at 2-year follow up .
  RESULTS	At 2-year follow-up , the adjusted BMI percentile was 90.3 , 88.1 , and 87.1 for groups 1 , 2 , and 3 , respectively .
  RESULTS	The group 3 mean was significantly ( P = .02 ) lower than group 1 .
  RESULTS	Mean changes from baseline in BMI percentile were 1.8 , 3.8 , and 4.9 across groups 1 , 2 , and 3 .
  CONCLUSIONS	MI delivered by providers and RDs ( group 3 ) resulted in statistically significant reductions in BMI percentile .
  CONCLUSIONS	Research is needed to determine the clinical significance and persistence of the BMI effects observed .
  CONCLUSIONS	How the intervention can be brought to scale ( in particular , how to train physicians to use MI effectively and how best to train RDs and integrate them into primary care settings ) also merits future research .

  ABSTRACT 7:
  BACKGROUND	Antithrombin ( AT ) concentrations are reduced after cardiac surgery with cardiopulmonary bypass compared with the preoperative levels .
  BACKGROUND	Low postoperative AT is associated with worse short - and mid-term clinical outcomes .
  BACKGROUND	The aim of the study is to evaluate the effects of AT administration on activation of the coagulation and fibrinolytic systems , platelet function , and the inflammatory response in patients with low postoperative AT levels .
  METHODS	Sixty patients with postoperative AT levels of less than 65 % were randomly assigned to receive purified AT ( 5000 IU in three administrations ) or placebo in the postoperative intensive care unit .
  METHODS	Thirty patients with postoperative AT levels greater than 65 % were observed as controls .
  METHODS	Interleukin 6 ( a marker of inflammation ) , prothrombin fragment 1-2 ( a marker of thrombin generation ) , plasmin-antiplasmin complex ( a marker of fibrinolysis ) , and platelet factor 4 ( a marker of platelet activation ) were measured at six different times .
  RESULTS	Compared with the no AT group and control patients , patients receiving AT showed significantly higher AT values until 48 hours after the last administration .
  RESULTS	Analysis of variance for repeated measures showed a significant effect of study treatment in reducing prothrombin fragment 1-2 ( p = 0.009 ; interaction with time sample , p = 0.006 ) and plasmin-antiplasmin complex ( p < 0.001 ; interaction with time sample , p < 0.001 ) values but not interleukin 6 ( p = 0.877 ; interaction with time sample , p = 0.521 ) and platelet factor 4 ( p = 0.913 ; interaction with time sample , p = 0.543 ) .
  RESULTS	No difference in chest tube drainage , reopening for bleeding , and blood transfusion was observed .
  CONCLUSIONS	Antithrombin administration in patients with low AT activity after surgery with cardiopulmonary bypass reduces postoperative thrombin generation and fibrinolysis with no effects on platelet activation and inflammatory response .

  ABSTRACT 8:
  OBJECTIVE	We evaluated patients at tertiary [ both percutaneous coronary intervention ( PCI ) and coronary artery bypass grafting ( CABG ) capable ] and primary hospitals in the EARLY-ACS trial .
  BACKGROUND	Early invasive management is recommended for high-risk non-ST-segment elevation acute coronary syndromes .
  METHODS	We evaluated outcomes in 9,204 patients presenting to : tertiary sites , primary sites with transfer to tertiary sites ( `` transferred '' ) and those who remained at primary sites ( `` non-transfer '' ) .
  RESULTS	There were 348 tertiary ( n = 7,455 patients ) and 89 primary hospitals [ n = 1,749 patients ( 729 transferred ; 1,020 non-transfer ) ] .
  RESULTS	Significant delays occurred in time from symptom onset to angiography ( 49 hr ) , PCI ( 53h ) , and CABG ( 178 hr ) for transferred patients ( P < 0.001 ) .
  RESULTS	Non-transfer patients had less 30-day death/myocardial infarction [ 9.4 % vs. 11.7 % ( tertiary ) ; adjusted odds ratio ( OR ) : 0.78 ( 0.62-0 .97 ) , P = 0.026 ] ; transferred ( 14.0 % ) and tertiary patients were similar [ adjusted OR : 1.23 ( 0.98-1 .53 ) , P = 0.074 ] .
  RESULTS	Non-transfer patients had lower 1-year mortality [ 4.3 % vs. 6.3 % ( tertiary ) ; adjusted hazard ratio ( HR ) : 0.64 ( 0.47-0 .87 ) , P = 0.005 ] : there was no difference between transferred and tertiary patients [ 5.2 % vs. 6.3 % ; adjusted HR : 0.80 ( 0.58-1 .12 ) , P = 0.202 ] .
  RESULTS	Despite similar rates of catheterization , GUSTO severe/moderate bleeding within 120 hr was less in non-transfer [ 3.1 % vs. 6.7 % ( tertiary ) ; adjusted OR : 0.47 ( 0.32-0 .68 ) , P < 0.001 ] , whereas transferred ( 6.1 % ) and tertiary patients were similar [ adjusted OR : 0.94 ( 0.68-1 .30 ) , P = 0.693 ] .
  RESULTS	There was no difference in non-CABG bleeding .
  CONCLUSIONS	Timely angiography and revascularization were often not achieved in transferred patients .
  CONCLUSIONS	Non-transferred patients presenting to primary sites had the lowest event rates and the best long-term survival .

  ABSTRACT 9:
  OBJECTIVE	We hypothesized that a targeted temperature of 33 C as compared to that of 36 C would increase survival and reduce the severity of circulatory shock in patients with shock on admission after out-of-hospital cardiac arrest ( OHCA ) .
  METHODS	The recently published Target Temperature Management trial ( TTM-trial ) randomized 939 OHCA patients with no difference in outcome between groups and no difference in mortality at the end of the trial in a predefined subgroup of patients with shock at admission .
  METHODS	Shock was defined as a systolic blood pressure of < 90 mm Hg for > 30 min or the need of supportive measures to maintain a blood pressure 90 mmHg and/or clinical signs of end-organ hypoperfusion .
  METHODS	In this post hoc analysis reported here , we further analyzed the 139 patients with shock at admission ; all had been randomized to receive intervention at 33 C ( TTM33 ; n = 71 ) or 36 C ( TTM36 ; n = 68 ) .
  METHODS	Primary outcome was 180-day mortality .
  METHODS	Secondary outcomes were intensive care unit ( ICU ) and 30-day mortality , severity of circulatory shock assessed by mean arterial pressure , serum lactate , fluid balance and the extended Sequential Organ Failure assessment ( SOFA ) score .
  RESULTS	There was no significance difference between targeted temperature management at 33 C or 36 C on 180-day mortality [ log-rank test , p = 0.17 , hazard ratio 1.33 , 95 % confidence interval ( CI ) 0.88-1 .98 ] or ICU mortality ( 61 vs. 44 % , p = 0.06 ; relative risk 1.37 , 95 % CI 0.99-1 .91 ) .
  RESULTS	Serum lactate and the extended cardiovascular SOFA score were higher in the TTM33 group ( p < 0.01 ) .
  CONCLUSIONS	We found no benefit in survival or severity of circulatory shock with targeted temperature management at 33 C as compared to 36 C in patients with shock on admission after OHCA .

  ABSTRACT 10:
  BACKGROUND	Youths with a family history of alcohol and other drug use disorders ( FH + ) are at a greater risk of developing substance use disorders than their peers with no such family histories ( FH - ) , and this increased risk may be related to impaired maturation of forebrain circuitry .
  BACKGROUND	FH + individuals have shown altered forebrain activity at rest and while performing cognitive tasks .
  BACKGROUND	However , it is not fully understood how forebrain activity is altered in FH + individuals , and ultimately how these alterations may contribute to substance use disorder risk .
  METHODS	In this study , we tested 72 FH + and 32 FH - youths performing a go/no-go task and examined activations in blocks with only go trials ( Go Only ) , blocks with 50 % go and 50 % no-go trials ( Go/NoGo ) , and a contrast of those 2 blocks .
  RESULTS	FH + youths had significantly greater cerebral activations in both the Go and Go/NoGo blocks than FH - youths in regions including the posterior cingulate/precuneus , bilateral middle/superior temporal gyrus , and medial superior frontal gyrus with no significant group differences in the subtraction between Go Only and Go/NoGo blocks .
  RESULTS	Additionally , FH + youths had moderately slower reaction times on go trials in the Go Only blocks .
  CONCLUSIONS	Our findings suggest that global activation increase in FH + youths are modulated by FH density and are not specific to the inhibitory components of the task .
  CONCLUSIONS	This pattern of increased activations in FH + youths may be at least partially due to impaired forebrain white matter development leading to greater activations/less efficient neural communication during task performance .

  What are the labels for each sentence in this abstract?

  We tested the hypothesis that clevidipine , a rapidly acting dihydropyridine calcium channel blocker , is not inferior to nitroglycerin ( NTG ) in controlling blood pressure before cardiopulmonary bypass ( CPB ) during coronary artery bypass grafting ( CABG ) . 
  
  In this double-blind study from October 4 , 2003 to April 26 , 2004 , 100 patients undergoing CABG with CPB were randomized at four centres to receive intravenous infusions of clevidipine ( 0.2-8gkg ( -1 ) min ( -1 ) ) or NTG ( 0.4 gkg ( -1 ) min ( -1 ) to a clinician-determined maximum dose rate ) from induction of anesthesia through 12hr postoperatively . 
  
  The study drug was titrated in the pre-CPB period with the aim of maintaining mean arterial pressure ( MAP ) within 5mmHg of a clinician-predetermined target .
   
  The primary endpoint was the area under the curve ( AUC ) for the total time each patient 's MAP was outside the target range from drug initiation to the start of CPB , normalized per hour ( AUCMAP-D ) . 
  
  The predefined non-inferiority criterion for the primary endpoint was a 95 % confidence interval ( CI ) upper limit no greater than 1.50 for the geometric means ratio between clevidipine and NTG . 
  
  Total mean [ standard deviation ( SD ) ] dose pre-bypass was 4.5 ( 4.7 ) mg for clevidipine and 6.9 ( 5.4 ) mg for NTG ( P < 0.05 ) . 
  
  The geometric mean AUCMAP-D for clevidipine was 283mmHgminhr ( -1 ) ( n = 45 ) and for NTG was 292mmHgminhr ( -1 ) ( n = 48 ) ; the geometric means ratio was 0.97 ( 95 % CI 0.74 to 1.27 ) . 
  
  The geometric mean AUCMAP-D during aortic cannulation was 357.7 mmHgminhr ( -1 ) for clevidipine compared with 190.5 mmHgminhr ( -1 ) for NTG . Mean ( SD ) heart rate with clevidipine was 76.0 ( 13.8 ) beatsmin ( -1 ) compared with 81.5 ( 14.4 ) beatsmin ( -1 ) for NTG . 
  
  There were no clinically important differences between groups in adverse events . 
  
  During CABG , clevidipine was not inferior to NTG for blood pressure control pre-bypass .

  (Note that there are only 5 labels and nothing else, namely: OBJECTIVE, BACKGROUND, METHODS, RESULTS, CONCLUSIONS. Return ONLY ONE label for each sentence and nothing else).
  '''

In [None]:
# Ask the model how many tokens the current `prompt` string occupies; as the
# last expression in the cell, the result is shown as the cell output.
model.count_tokens(prompt)

In [None]:
# Map each of the four listed harm categories to BLOCK_NONE.
safety_settings = {
    category: HarmBlockThreshold.BLOCK_NONE
    for category in (
        HarmCategory.HARM_CATEGORY_HARASSMENT,
        HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
    )
}

# Generation parameters: one candidate, at most 20 output tokens, temperature 0.5.
# NOTE(review): 20 output tokens may be too few when a whole multi-sentence
# abstract is labelled in one response — confirm against the expected output.
generation_config = genai.types.GenerationConfig(
    candidate_count=1,
    max_output_tokens=20,
    temperature=0.5,
)


In [None]:
import pandas as pd

# Data and Inference

In [None]:
# Load the dataset to classify. The path is left blank in the source and must
# be filled in before running; the inference loop reads the 'text' column.
df = pd.read_csv(r'')

In [None]:
import tqdm

# Run the model once per row of `df` and store its answer in the 'prediction'
# column, then write the frame to CSV (destination path is blank in the source).
for index, row in tqdm.tqdm(df.iterrows(), total=len(df)):
    # Build the request from the unmodified base prompt. The previous version
    # rebound `prompt` itself, so every request also carried the text of all
    # earlier rows and the prompt grew with each iteration.
    # NOTE(review): the row text lands after everything already in `prompt`,
    # including its closing instruction — confirm this placement is intended.
    row_prompt = prompt + row['text']

    # generation_config is deliberately not passed here, as in the original
    # call where it was commented out.
    response = model.generate_content(row_prompt,
                                      safety_settings=safety_settings)

    # Store the model's raw text answer for this row.
    df.at[index, 'prediction'] = response.text

df.to_csv(r'', index=False)

In [None]:
# Write `df` to CSV without the index column. The destination path is blank
# in the source and must be filled in before running.
df.to_csv(r'', index = False)

In [None]:
# One-off request: send `prompt` as-is, using both the generation parameters
# and the safety settings defined earlier in the notebook.
response = model.generate_content(
    prompt,
    generation_config=generation_config,
    safety_settings=safety_settings,
)

In [None]:
# Show the text of the most recent response as the cell output.
response.text

# Evaluation

In [None]:
# Load the saved predictions (path is blank in the source) and encode both the
# gold labels ('type') and the model output ('prediction') as integers.
pred_df = pd.read_csv(r'')

# Single source of truth for the label -> integer encoding:
# BACKGROUND=1, METHODS=2, OBJECTIVE=3, RESULTS=4, CONCLUSIONS=5.
LABEL_TO_ID = {'BACKGROUND': 1, 'METHODS': 2, 'OBJECTIVE': 3, 'RESULTS': 4, 'CONCLUSIONS': 5}


def _normalize_label(value):
    """Strip whitespace and upper-case string labels; pass other values through."""
    return value.strip().upper() if isinstance(value, str) else value


pred_df['type'] = pred_df['type'].replace(LABEL_TO_ID)
# Model output can differ from the canonical label in case or surrounding
# whitespace (e.g. a trailing newline). Normalise first so such answers are
# still encoded instead of staying as strings and breaking a later int cast.
pred_df['prediction'] = pred_df['prediction'].map(_normalize_label).replace(LABEL_TO_ID)

In [None]:
# Save the integer-encoded labels and predictions to pred_df.csv (no index
# column); the accuracy cell reads this file back.
pred_df.to_csv('pred_df.csv',index = False)

In [None]:
from sklearn.metrics import accuracy_score

In [None]:
import numpy as np

# Reload the encoded results from pred_df.csv and print the accuracy.
pred_df = pd.read_csv('pred_df.csv')

# Predictions are cast to int first; the cast raises if a value is not
# integer-like.
y_true, y_pred = (
    np.array(pred_df['type']),
    np.array(pred_df['prediction'].astype(int)),
)
accuracy = accuracy_score(y_true, y_pred)
print('Accuracy: ', accuracy)

In [None]:
# Cross-tabulate gold labels (rows) against predictions (columns) and print it.
table = pd.crosstab(index=pred_df['type'], columns=pred_df['prediction'])
print(table)

In [None]:
# Print pandas' summary of pred_df (columns, non-null counts, dtypes).
pred_df.info()