<a href="https://colab.research.google.com/github/marco-siino/EXIST2024/blob/main/EXIST_2024_Task_1_Run1_2_Mistral7B_MSiino.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## In Run 1, the start and end "s" tags (`<s>` … `</s>`) are placed around every few-shot sample in the prompt. In Run 2, the "s" tag is used only once.

Installing dependencies. You might need to tweak the CMAKE_ARGS for the `llama-cpp-python` pip package.

In [1]:
# GPU llama-cpp-python; Starting from version llama-cpp-python==0.1.79, it supports GGUF
!CMAKE_ARGS="-DLLAMA_CUBLAS=on " pip install 'llama-cpp-python>=0.1.79' --force-reinstall --upgrade --no-cache-dir
# For downloading the models
!pip install huggingface_hub
!pip install datasets
!pip install -U deep-translator

import datasets
from datasets import load_dataset
from deep_translator import GoogleTranslator
import json
import re
import random
import numpy as np
import tqdm.notebook as tqdm

# Seed used to shuffle the JSON training set (the shuffled order decides which
# samples end up in the few-shot prompt). It also seeds Python's global
# `random` generator so the run is repeatable.
seed_value = 42
random.seed(seed_value)

Collecting llama-cpp-python>=0.1.79
  Downloading llama_cpp_python-0.2.71.tar.gz (48.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 MB[0m [31m206.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting typing-extensions>=4.5.0 (from llama-cpp-python>=0.1.79)
  Downloading typing_extensions-4.11.0-py3-none-any.whl (34 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python>=0.1.79)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m320.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.1 (from llama-cpp-python>=0.1.79)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━

Downloading an instruction-finetuned Mistral model.

In [2]:
from huggingface_hub import hf_hub_download

# Hugging Face repository and the GGUF file (Q6_K quantization) to fetch from it.
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
model_basename = "mistral-7b-instruct-v0.2.Q6_K.gguf"

# Local filesystem path of the model file returned by the Hub client.
model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)

# This config has been tested on an RTX 3080 (VRAM of 16GB).
# you might need to tweak with respect to your hardware.
from llama_cpp import Llama
# Single model instance shared by both runs below.
lcpp_llm = Llama(
    model_path=model_path,
    n_threads=4, #16, # CPU cores
    n_batch=800, #8000, # Should be between 1 and n_ctx, consider the amount of VRAM in your GPU.
    n_gpu_layers=32, # Change this value based on your model and your GPU VRAM pool.
    n_ctx=8192, # Context window
    logits_all=True  # NOTE(review): presumably required because the completion calls below pass logprobs=1 - confirm against the llama-cpp-python docs
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


mistral-7b-instruct-v0.2.Q6_K.gguf:   0%|          | 0.00/5.94G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loade

# Download the dataset for the three subtasks.

In [3]:
# Subtask 1.
!wget 'https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2023_test_clean.json'
!wget 'https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2024_training.json'
!wget 'https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2024_training_task1_gold_hard.json'


# Local file names of the JSON files downloaded above.
# train_path      : training tweets (read when building the few-shot samples)
# train_gold_path : hard gold labels for Task 1, joined to the tweets by id
# test_path       : tweets to classify in the runs below
# NOTE(review): the test file is the EXIST2023 clean test set while the
# predictions are tagged "EXIST2024" - confirm this is the intended input.
train_path = 'EXIST2024_training.json'
train_gold_path = 'EXIST2024_training_task1_gold_hard.json'
test_path = 'EXIST2023_test_clean.json'

--2024-05-09 13:46:46--  https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2023_test_clean.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2143912 (2.0M) [text/plain]
Saving to: ‘EXIST2023_test_clean.json’


2024-05-09 13:46:47 (245 MB/s) - ‘EXIST2023_test_clean.json’ saved [2143912/2143912]

--2024-05-09 13:46:47--  https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2024_training.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9397366 (9.0M) [text/plain]
Saving to: ‘EXIST20

# Create few-shot samples from training set.

In [4]:
 # Instruction text placed before every few-shot tweet. It opens the [INST]
 # block; the sample builders append "TWEET:<text> [/INST]" and the gold label.
 prompt_context = "[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.\n"


In [19]:
# Load the training set: a JSON object whose keys are later matched against
# the ids of the gold-label file. The tweets contain non-ASCII text, so the
# encoding is stated explicitly instead of relying on the platform default.
with open(train_path, 'r', encoding='utf-8') as istr:
    train_1_json = json.load(istr)

# Convert the dictionary keys into a list
keys_list = list(train_1_json.keys())

# Shuffle the list with a dedicated generator seeded from `seed_value`.
# Unlike the global `random.shuffle`, this yields the same order every time
# the cell is executed, so the few-shot samples do not change when the cell is
# re-run. On a fresh kernel the order is the same as seeding the global
# generator with `seed_value` and shuffling immediately afterwards.
random.Random(seed_value).shuffle(keys_list)

# Create a new dictionary with shuffled keys (dicts keep insertion order).
train_1_json = {key: train_1_json[key] for key in keys_list}

In [20]:
def create_few_shot_samples_run1(train_1_json,nr_samples):
  """Build the Run 1 few-shot prefix, wrapping every example in <s> ... </s>.

  Iterates over `train_1_json` in its current order. Every entry whose key has
  a gold label in the file at the module-level `train_gold_path` contributes
  one example made of `prompt_context`, the tweet and the label. Tweets whose
  'lang' is 'es' are translated to English first. Entries without a gold
  label are skipped and do not count towards `nr_samples`.

  Each selected sample is also printed as "<original tweet> // <label>".

  Args:
    train_1_json: dict mapping a sample id to a record that provides at least
      the 'tweet' and 'lang' fields.
    nr_samples: number of examples to collect; collection stops as soon as
      this many labelled samples have been added.

  Returns:
    The concatenated examples as one string ('' if none were collected).
  """
  few_shot_counter = 0
  few_shot_samples = ''

  # Explicit encoding: the JSON files are UTF-8 regardless of the locale.
  with open(train_gold_path, 'r', encoding='utf-8') as istr:
    train_gold_json = json.load(istr)
  indexed_data = {entry["id"]: entry for entry in train_gold_json}

  # Created on first use and then reused, instead of one instance per tweet.
  translator = None

  for key, record in train_1_json.items():

    if few_shot_counter == nr_samples:
      break

    # Only a missing id / missing label means "skip this sample"; any other
    # error is a real problem and must not be silently swallowed.
    try:
      label = indexed_data[key]['value']
    except KeyError:
      continue

    few_shot_counter += 1
    tweet = record['tweet']
    print(tweet + ' // ' + label)
    if record['lang'] == 'es':
      if translator is None:
        translator = GoogleTranslator(source='es', target='en')
      tweet = translator.translate(tweet)

    few_shot_samples += '<s>'+prompt_context+"TWEET:"+tweet + ' [/INST] \n ' + label + "\n</s> \n\n "

  return few_shot_samples

In [21]:
def create_few_shot_samples_run2(train_1_json,nr_samples):
  """Build the Run 2 few-shot prefix, without <s> ... </s> around the examples.

  Iterates over `train_1_json` in its current order. Every entry whose key has
  a gold label in the file at the module-level `train_gold_path` contributes
  one example made of `prompt_context`, the tweet and the label. Tweets whose
  'lang' is 'es' are translated to English first. Entries without a gold
  label are skipped and do not count towards `nr_samples`.

  Each selected sample is also printed as "<original tweet> // <label>".

  Args:
    train_1_json: dict mapping a sample id to a record that provides at least
      the 'tweet' and 'lang' fields.
    nr_samples: number of examples to collect; collection stops as soon as
      this many labelled samples have been added.

  Returns:
    The concatenated examples as one string ('' if none were collected).
  """
  few_shot_counter = 0
  few_shot_samples = ''

  # Explicit encoding: the JSON files are UTF-8 regardless of the locale.
  with open(train_gold_path, 'r', encoding='utf-8') as istr:
    train_gold_json = json.load(istr)
  indexed_data = {entry["id"]: entry for entry in train_gold_json}

  # Created on first use and then reused, instead of one instance per tweet.
  translator = None

  for key, record in train_1_json.items():

    if few_shot_counter == nr_samples:
      break

    # Only a missing id / missing label means "skip this sample"; any other
    # error is a real problem and must not be silently swallowed.
    try:
      label = indexed_data[key]['value']
    except KeyError:
      continue

    few_shot_counter += 1
    tweet = record['tweet']
    print(tweet + ' // ' + label)
    if record['lang'] == 'es':
      if translator is None:
        translator = GoogleTranslator(source='es', target='en')
      tweet = translator.translate(tweet)

    few_shot_samples += prompt_context+"TWEET:"+tweet + ' [/INST] \n ' + label + "\n \n\n "

  return few_shot_samples

In [22]:
# Run 1 prefix: the first 10 labelled samples of the shuffled training set,
# each wrapped in <s> ... </s>.
few_shot_samples_run1 = create_few_shot_samples_run1(train_1_json,10)

@JackelineWB @JohnMAckerman @lopezobrador_ tu eres el absurdo, o sea Loret si tiene derecho de violar la privacidad de personas civiles?? // NO
Yo me pregunto si que haya fortunas genera acaso un techo de cristal o algo por el estilo. Cada vez hay más ricos, y a quien le moleste lo heredado, que analice si padece de envidia. https://t.co/oesBZ03sdh // NO
No te vistas así*No uses minifalta*No se saques selfie*No salgas sola* Parecés una puta*Vos lo habrás mirado*Estás provocandoAh ! y que deprimente lo de Fablet. El señor que nos dice qué tenemos que hacer para que no nos violen. // YES
@Russtophocles1 Girls don’t count as reply guys , call me sexist but I always love compliments from da gals. Even if they are horny posting😂😂 but if you’re mean I will cry // YES
Siempre he pensado esto de todos los abogados ... Jajaja Bueno mi cuñado NO es así, me sorprende porque es el estereotipo perfecto! Costeño, ebrio y abogado https://t.co/YH6AjDlElH // NO
Es algo relacionado con la identidad de g

In [23]:
# Run 2 prefix: the same 10 labelled samples, without the <s> ... </s> wrapping.
few_shot_samples_run2 = create_few_shot_samples_run2(train_1_json,10)

@JackelineWB @JohnMAckerman @lopezobrador_ tu eres el absurdo, o sea Loret si tiene derecho de violar la privacidad de personas civiles?? // NO
Yo me pregunto si que haya fortunas genera acaso un techo de cristal o algo por el estilo. Cada vez hay más ricos, y a quien le moleste lo heredado, que analice si padece de envidia. https://t.co/oesBZ03sdh // NO
No te vistas así*No uses minifalta*No se saques selfie*No salgas sola* Parecés una puta*Vos lo habrás mirado*Estás provocandoAh ! y que deprimente lo de Fablet. El señor que nos dice qué tenemos que hacer para que no nos violen. // YES
@Russtophocles1 Girls don’t count as reply guys , call me sexist but I always love compliments from da gals. Even if they are horny posting😂😂 but if you’re mean I will cry // YES
Siempre he pensado esto de todos los abogados ... Jajaja Bueno mi cuñado NO es así, me sorprende porque es el estereotipo perfecto! Costeño, ebrio y abogado https://t.co/YH6AjDlElH // NO
Es algo relacionado con la identidad de g

In [24]:
# Show the assembled Run 1 prefix exactly as it will be sent to the model.
print(few_shot_samples_run1)

<s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:@JackelineWB @JohnMAckerman @lopezobrador_ you are absurd, that is, Loret has the right to violate the privacy of civilians?? [/INST] 
 NO
</s> 

 <s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:I wonder if the fact that there are fortunes generates a glass ceiling or something like that. There are more and more rich people, and anyone who is bothered by what they inherited should analyze whether they suffer from envy. https://t.co/oesBZ03sdh [/INST] 
 NO
</s> 

 <s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in 

In [25]:
# Show the assembled Run 2 prefix exactly as it will be sent to the model.
print(few_shot_samples_run2)

[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:@JackelineWB @JohnMAckerman @lopezobrador_ you are absurd, that is, Loret has the right to violate the privacy of civilians?? [/INST] 
 NO
 

 [INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:I wonder if the fact that there are fortunes generates a glass ceiling or something like that. There are more and more rich people, and anyone who is bothered by what they inherited should analyze whether they suffer from envy. https://t.co/oesBZ03sdh [/INST] 
 NO
 

 [INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discri

# Run!

In [None]:
# Run 1: classify every test tweet with the <s>-wrapped few-shot prefix and
# write the predictions to task1_hard_badrock_1.json.

# Allowed labels; the generated text is mapped onto one of them below.
replies_list = ['YES','NO']
counter = 0

# List to store the results.
json_result_run1 = []

# Start from an empty answer.txt (truncates any file left by a previous run).
with open('answer.txt', 'w') as f:
        f.write('')

# simple JSON loading (explicit encoding: the tweets are non-ASCII UTF-8 text)
with open(test_path, 'r', encoding='utf-8') as istr:
    test_json = json.load(istr)
num_sample = len(test_json)

# One translator instance is reused for all Spanish tweets.
es_to_en_translator = GoogleTranslator(source='es', target='en')

for key, value in test_json.items():
  counter+=1
  tweet = value['tweet']
  if value['lang']=='es':
      tweet = es_to_en_translator.translate(tweet)

  # NOTE(review): the query carries only the tweet inside [INST] ... [/INST],
  # without the instruction text and the "TWEET:" prefix used by the few-shot
  # examples - confirm this asymmetry is intentional.
  current_sample = '[INST]\n'+tweet+'\n[/INST]'
  prompt = few_shot_samples_run1+current_sample

  # max_tokens is left at the library default.
  response = lcpp_llm(
        prompt=prompt,
        temperature= 0.2,
        logprobs=1,
      )

  generated = str(response["choices"][0]["text"]).strip().upper()

  # Keep the allowed label the generated text starts with; any other output
  # (empty text, explanations, ...) falls back to 'NO'.
  answer = next((reply for reply in replies_list if generated.startswith(reply)), 'NO')

  print("GENERATED: "+ current_sample+'\n'+answer)

  current_element = {
        "id": value['id'],
        "value": answer,
        "test_case": "EXIST2024"
    }
  json_result_run1.append(current_element)

with open("task1_hard_badrock_1.json", "w", encoding='utf-8') as json_file:
    json.dump(json_result_run1, json_file, indent=4)


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      11.26 ms /    16 runs   (    0.70 ms per token,  1421.46 tokens per second)
llama_print_timings: prompt eval time =     953.35 ms /    31 tokens (   30.75 ms per token,    32.52 tokens per second)
llama_print_timings:        eval time =    1825.91 ms /    15 runs   (  121.73 ms per token,     8.22 tokens per second)
llama_print_timings:       total time =    3000.86 ms /    46 tokens


GENERATED: [INST]
@Eurogamer_es All gamergate from development to game forums, classic in the world of video games.
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       8.84 ms /    16 runs   (    0.55 ms per token,  1809.95 tokens per second)
llama_print_timings: prompt eval time =     554.04 ms /    32 tokens (   17.31 ms per token,    57.76 tokens per second)
llama_print_timings:        eval time =    1479.82 ms /    15 runs   (   98.65 ms per token,    10.14 tokens per second)
llama_print_timings:       total time =    2169.46 ms /    47 tokens


GENERATED: [INST]
@ArCaNgEl__23 @Benzenazi Man, it's not comparable, look at Gamergate.
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.09 ms /    16 runs   (    0.57 ms per token,  1760.18 tokens per second)
llama_print_timings: prompt eval time =     577.26 ms /    24 tokens (   24.05 ms per token,    41.58 tokens per second)
llama_print_timings:        eval time =    1508.51 ms /    15 runs   (  100.57 ms per token,     9.94 tokens per second)
llama_print_timings:       total time =    2204.03 ms /    39 tokens


GENERATED: [INST]
I'm looking for companies involved in gamergate for etc etc to buy all their products
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      10.42 ms /    16 runs   (    0.65 ms per token,  1535.95 tokens per second)
llama_print_timings: prompt eval time =     266.35 ms /    75 tokens (    3.55 ms per token,   281.59 tokens per second)
llama_print_timings:        eval time =    1706.46 ms /    15 runs   (  113.76 ms per token,     8.79 tokens per second)
llama_print_timings:       total time =    2234.81 ms /    90 tokens


GENERATED: [INST]
@jordirico First it was the internet, then the gamergate, the manosphere and its extreme misogyny without the platforms lifting a finger, the incel-murders, the extreme right capitalizing on hate, the mimicry of these ideas with those of a respectable debate and finally here we are
[/INST]
YES


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      11.24 ms /    16 runs   (    0.70 ms per token,  1423.11 tokens per second)
llama_print_timings: prompt eval time =     859.61 ms /    28 tokens (   30.70 ms per token,    32.57 tokens per second)
llama_print_timings:        eval time =    1912.53 ms /    15 runs   (  127.50 ms per token,     7.84 tokens per second)
llama_print_timings:       total time =    2988.06 ms /    43 tokens


GENERATED: [INST]
@AlonsoQuijano12 I was involved in gamergate, so unfortunately I know too much 😭
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       8.94 ms /    16 runs   (    0.56 ms per token,  1788.91 tokens per second)
llama_print_timings: prompt eval time =     775.03 ms /    29 tokens (   26.73 ms per token,    37.42 tokens per second)
llama_print_timings:        eval time =    1532.26 ms /    15 runs   (  102.15 ms per token,     9.79 tokens per second)
llama_print_timings:       total time =    2442.70 ms /    44 tokens


GENERATED: [INST]
@MrSandman1954 @AKN4710 Gamergate screwed the internet forever.
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.32 ms /    16 runs   (    0.58 ms per token,  1716.37 tokens per second)
llama_print_timings: prompt eval time =     299.13 ms /   102 tokens (    2.93 ms per token,   340.99 tokens per second)
llama_print_timings:        eval time =    1500.46 ms /    15 runs   (  100.03 ms per token,    10.00 tokens per second)
llama_print_timings:       total time =    2122.31 ms /   117 tokens


GENERATED: [INST]
@grupoeldeber @grupoeldeber It would be good if they also said that it was in CANADA🇨🇦 the first country where it was reported that G∆y couples who adopted children abused them in 2008. How selective are they with the crap they publish, right? https://t.co/GPAmLOfcY9 https://t.co/f36aAmxq35
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.13 ms /    16 runs   (    0.57 ms per token,  1752.85 tokens per second)
llama_print_timings: prompt eval time =     252.47 ms /    40 tokens (    6.31 ms per token,   158.43 tokens per second)
llama_print_timings:        eval time =    1557.09 ms /    15 runs   (  103.81 ms per token,     9.63 tokens per second)
llama_print_timings:       total time =    1971.28 ms /    55 tokens


GENERATED: [INST]
They, very close together and more alone than ever. The consequences of everything. Mozambique. https://t.co/5nVGTb3Ua3
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       8.99 ms /    16 runs   (    0.56 ms per token,  1779.16 tokens per second)
llama_print_timings: prompt eval time =     586.76 ms /    32 tokens (   18.34 ms per token,    54.54 tokens per second)
llama_print_timings:        eval time =    1530.86 ms /    15 runs   (  102.06 ms per token,     9.80 tokens per second)
llama_print_timings:       total time =    2260.74 ms /    47 tokens


GENERATED: [INST]
DC Children's Hospital Harassed Over Trans Youth Serviceshttps://t.co/RrEtZ5TDDM
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.33 ms /    16 runs   (    0.58 ms per token,  1714.71 tokens per second)
llama_print_timings: prompt eval time =     256.55 ms /    45 tokens (    5.70 ms per token,   175.40 tokens per second)
llama_print_timings:        eval time =    1568.85 ms /    15 runs   (  104.59 ms per token,     9.56 tokens per second)
llama_print_timings:       total time =    2011.60 ms /    60 tokens


GENERATED: [INST]
20mins deep into BeReal and I have never felt so pressured to take my phone nope I don't want to feel harassed I say goodbye 🫡
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      11.79 ms /    16 runs   (    0.74 ms per token,  1357.20 tokens per second)
llama_print_timings: prompt eval time =     271.85 ms /    61 tokens (    4.46 ms per token,   224.39 tokens per second)
llama_print_timings:        eval time =    1789.32 ms /    15 runs   (  119.29 ms per token,     8.38 tokens per second)
llama_print_timings:       total time =    2435.20 ms /    76 tokens


GENERATED: [INST]
Anthony Fauci, the leading US infectious disease expert, noted in August 2022 that he had received death threats and that his family was routinely exposed to harassment. https://t.co/EerbO2Px5j
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       8.92 ms /    16 runs   (    0.56 ms per token,  1794.12 tokens per second)
llama_print_timings: prompt eval time =     699.63 ms /    28 tokens (   24.99 ms per token,    40.02 tokens per second)
llama_print_timings:        eval time =    1435.95 ms /    15 runs   (   95.73 ms per token,    10.45 tokens per second)
llama_print_timings:       total time =    2263.21 ms /    43 tokens


GENERATED: [INST]
@dimplerrylover I know but literally the girl was being harassed like how can she smile like that
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.62 ms /    16 runs   (    0.60 ms per token,  1663.20 tokens per second)
llama_print_timings: prompt eval time =     284.15 ms /    85 tokens (    3.34 ms per token,   299.14 tokens per second)
llama_print_timings:        eval time =    1480.67 ms /    15 runs   (   98.71 ms per token,    10.13 tokens per second)
llama_print_timings:       total time =    2055.73 ms /   100 tokens


GENERATED: [INST]
@marcela_ruge @_Lamanada_ @Mitocondria1311 @karinin7986 @julibmoya @MissPoirot07 @Adrianaruiz17 @DiaAnimales @marcevalenciar @MONYRODRIGUEZOF Marce do you have daviplata?
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      11.71 ms /    16 runs   (    0.73 ms per token,  1366.47 tokens per second)
llama_print_timings: prompt eval time =     260.71 ms /    64 tokens (    4.07 ms per token,   245.48 tokens per second)
llama_print_timings:        eval time =    1804.13 ms /    15 runs   (  120.28 ms per token,     8.31 tokens per second)
llama_print_timings:       total time =    2311.64 ms /    79 tokens


GENERATED: [INST]
About to see the grand premiere of #Jauría based on the case of #LaManada @anasofihg @RobertoBeck90 @elteatrocdmx #teatromexicano https://t.co/N5Uke18ikQ
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.59 ms /    16 runs   (    0.60 ms per token,  1668.93 tokens per second)
llama_print_timings: prompt eval time =     259.75 ms /    64 tokens (    4.06 ms per token,   246.39 tokens per second)
llama_print_timings:        eval time =    1512.29 ms /    15 runs   (  100.82 ms per token,     9.92 tokens per second)
llama_print_timings:       total time =    1995.70 ms /    79 tokens


GENERATED: [INST]
What a bastard our Uncle Pablo is with his little jokes. How comfortable we are on the cool grass... mybestfriends #doglove #theyneverabandon #lodaneverythingchangedenada https://t.co/4x20oe7V8D
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      10.20 ms /    16 runs   (    0.64 ms per token,  1569.24 tokens per second)
llama_print_timings: prompt eval time =     289.27 ms /    75 tokens (    3.86 ms per token,   259.28 tokens per second)
llama_print_timings:        eval time =    1606.43 ms /    15 runs   (  107.10 ms per token,     9.34 tokens per second)
llama_print_timings:       total time =    2152.33 ms /    90 tokens


GENERATED: [INST]
@noebermudez_13 @ldacr And we are already looking forward to filling the stands with you everywhere again!!! @LaManada_CR And now distribute balls left and right except to the ball gatherers 🥺🤣🤣🤣🤣#VAMOSLEONAS #SOMOSLAMANADA
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      10.94 ms /    16 runs   (    0.68 ms per token,  1462.12 tokens per second)
llama_print_timings: prompt eval time =     311.53 ms /   116 tokens (    2.69 ms per token,   372.36 tokens per second)
llama_print_timings:        eval time =    1832.49 ms /    15 runs   (  122.17 ms per token,     8.19 tokens per second)
llama_print_timings:       total time =    2545.53 ms /   131 tokens


GENERATED: [INST]
#TBT: María Rosa, Gregorio Polanco and Ysabel Verás, back in October 2016, during one of the Kermés held for the team's fundraising. The Pack is tradition, family, unity and joy!🥎🐺🥎🐺#TBT #Auuuuu#LaManada#LobosFansClub#LobosSoftbolClub https://t.co/LwSL4npanZ
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.20 ms /    16 runs   (    0.58 ms per token,  1738.56 tokens per second)
llama_print_timings: prompt eval time =     382.56 ms /   129 tokens (    2.97 ms per token,   337.21 tokens per second)
llama_print_timings:        eval time =    1610.81 ms /    15 runs   (  107.39 ms per token,     9.31 tokens per second)
llama_print_timings:       total time =    2401.78 ms /   144 tokens


GENERATED: [INST]
AUGUST 9 Find out everything! 🎙 You can watch the entire segment of 'El Bla, Bla, Bla' with HÉCTOR JOAQUÍN in the #Podcasts section of the LaMusica app. 📲#HectorJoaquin #LaManada #ElBlaBlaBla #LaMusica @la_musica @zeta93fm https://t.co/ZDZM4Qs1zx https://t.co/zqMjKMgN4r
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       9.32 ms /    16 runs   (    0.58 ms per token,  1717.29 tokens per second)
llama_print_timings: prompt eval time =     291.88 ms /    89 tokens (    3.28 ms per token,   304.92 tokens per second)
llama_print_timings:        eval time =    1557.88 ms /    15 runs   (  103.86 ms per token,     9.63 tokens per second)
llama_print_timings:       total time =    2139.63 ms /   104 tokens


GENERATED: [INST]
@Fistroman1 @ElioGatsby Without proof?? What was it in public? What did we all see, what are you telling me? Don't mix churras with merinas. What from now until you come to me with the metoo and the notallmen there's little left and I don't feel like it end up really arguing, I see it coming.
[/INST]
NO


Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =      10.11 ms /    16 runs   (    0.63 ms per token,  1582.90 tokens per second)
llama_print_timings: prompt eval time =     281.86 ms /    83 tokens (    3.40 ms per token,   294.48 tokens per second)
llama_print_timings:        eval time =    1528.62 ms /    15 runs   (  101.91 ms per token,     9.81 tokens per second)
llama_print_timings:       total time =    2085.57 ms /    98 tokens


GENERATED: [INST]
@EstefaniaVeloz ...With the legal process, not to set a person on fire (I don't even know who they are in this case). There are women who are victims but there are also corrupt, violent and victimizers. The #MeToo movement has already fulfilled its mission. What's next? How to advance without it being a weapon of whim?
[/INST]
NO


In [None]:
# Run 2: classify every test tweet with the few-shot prefix that has no
# <s> ... </s> wrapping and write the predictions to task1_hard_badrock_2.json.

# Allowed labels; the generated text is mapped onto one of them below.
replies_list = ['YES','NO']
counter = 0

# List to store the results.
json_result_run2 = []

# simple JSON loading (explicit encoding: the tweets are non-ASCII UTF-8 text)
with open(test_path, 'r', encoding='utf-8') as istr:
    test_json = json.load(istr)
num_sample = len(test_json)

# One translator instance is reused for all Spanish tweets.
es_to_en_translator = GoogleTranslator(source='es', target='en')

for key, value in test_json.items():
  counter+=1
  tweet = value['tweet']
  if value['lang']=='es':
      tweet = es_to_en_translator.translate(tweet)

  # NOTE(review): the query carries only the tweet inside [INST] ... [/INST],
  # without the instruction text and the "TWEET:" prefix used by the few-shot
  # examples - confirm this asymmetry is intentional.
  current_sample = '[INST]\n'+tweet+'\n[/INST]'
  prompt = few_shot_samples_run2+current_sample

  # max_tokens is left at the library default.
  response = lcpp_llm(
        prompt=prompt,
        temperature= 0.2,
        logprobs=1,
      )

  generated = str(response["choices"][0]["text"]).strip().upper()

  # Keep the allowed label the generated text starts with; any other output
  # (empty text, explanations, ...) falls back to 'NO'.
  answer = next((reply for reply in replies_list if generated.startswith(reply)), 'NO')

  print("GENERATED: "+ current_sample+'\n'+answer)

  current_element = {
        "id": value['id'],
        "value": answer,
        "test_case": "EXIST2024"
    }
  json_result_run2.append(current_element)

with open("task1_hard_badrock_2.json", "w", encoding='utf-8') as json_file:
    json.dump(json_result_run2, json_file, indent=4)


In [None]:
!zip -r exist2024_badrock.zip task1_hard_badrock_1.json task1_hard_badrock_2.json

  adding: answer.txt (deflated 94%)


In [None]:
# Colab-only: hand the zipped submission to the browser for download.
from google.colab import files
files.download('exist2024_badrock.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>