<a href="https://colab.research.google.com/github/marco-siino/EXIST2024/blob/main/EXIST_2024_Task_1_Mistral7B_MSiino.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Installing dependencies. You might need to tweak the CMAKE_ARGS for the `llama-cpp-python` pip package.

In [1]:
# GPU llama-cpp-python; Starting from version llama-cpp-python==0.1.79, it supports GGUF
!CMAKE_ARGS="-DLLAMA_CUBLAS=on " pip install 'llama-cpp-python>=0.1.79' --force-reinstall --upgrade --no-cache-dir
# For download the models
!pip install huggingface_hub
!pip install datasets
!pip install -U deep-translator

import datasets
from datasets import load_dataset
from deep_translator import GoogleTranslator
import json
import re
import random
import numpy as np
import tqdm.notebook as tqdm

Collecting llama-cpp-python>=0.1.79
  Downloading llama_cpp_python-0.2.71.tar.gz (48.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.4/48.4 MB[0m [31m173.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting typing-extensions>=4.5.0 (from llama-cpp-python>=0.1.79)
  Downloading typing_extensions-4.11.0-py3-none-any.whl (34 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python>=0.1.79)
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m192.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.1 (from llama-cpp-python>=0.1.79)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━

Downloading an instruction-finetuned Mistral model.

In [2]:
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Hugging Face repository and GGUF file name of the quantized instruct model.
model_name_or_path = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"
model_basename = "mistral-7b-instruct-v0.2.Q6_K.gguf"

# Fetch the model file from the Hub; the returned value is passed to Llama below.
model_path = hf_hub_download(
    repo_id=model_name_or_path,
    filename=model_basename,
)

# Author's note: this configuration has been tested on an RTX 3080 (VRAM of 16GB).
# Adjust the values below to your own hardware.
lcpp_llm = Llama(
    model_path=model_path,
    n_ctx=8192,       # Context window.
    n_batch=800,      # Should be between 1 and n_ctx; depends on the available VRAM (8000 was also tried).
    n_gpu_layers=32,  # Depends on the model and on the GPU VRAM pool.
    n_threads=4,      # CPU cores (16 was also tried).
    logits_all=True,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


mistral-7b-instruct-v0.2.Q6_K.gguf:   0%|          | 0.00/5.94G [00:00<?, ?B/s]

llama_model_loader: loaded meta data with 24 key-value pairs and 291 tensors from /root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q6_K.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = mistralai_mistral-7b-instruct-v0.2
llama_model_loader: - kv   2:                       llama.context_length u32              = 32768
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loade

# Download the datasets for Task 1 (test set, training set and training gold labels).

In [3]:
# Subtask 1: download the test set, the training set and the hard gold labels
# of the training set from the project's GitHub repository.
!wget 'https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2023_test_clean.json'
!wget 'https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2024_training.json'
!wget 'https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2024_training_task1_gold_hard.json'


# Local file names of the JSON files downloaded above (wget saves them in the
# current working directory).
train_path = 'EXIST2024_training.json'
train_gold_path = 'EXIST2024_training_task1_gold_hard.json'
test_path = 'EXIST2023_test_clean.json'

--2024-05-09 10:33:51--  https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2023_test_clean.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2143912 (2.0M) [text/plain]
Saving to: ‘EXIST2023_test_clean.json’


2024-05-09 10:33:51 (87.2 MB/s) - ‘EXIST2023_test_clean.json’ saved [2143912/2143912]

--2024-05-09 10:33:51--  https://raw.githubusercontent.com/marco-siino/EXIST2024/main/dataset/EXIST2024_training.json
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9397366 (9.0M) [text/plain]
Saving to: ‘EXIST2

# Create few-shot samples from the training set.

In [9]:
 # Instruction text placed in front of every tweet when the few-shot examples
 # are built; it opens an "[INST]" block that is closed after the tweet text.
 prompt_context = "<s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.\n"


In [14]:
def create_few_shot_samples_task1(json_set_path,nr_samples):
  """Build the few-shot part of the prompt from labelled tweets.

  Gold labels are read from the module-level ``train_gold_path`` and tweets
  from ``json_set_path``. Tweets are visited in file order; those without a
  gold label are skipped. Spanish tweets (``lang == 'es'``) are translated to
  English with ``GoogleTranslator`` before being added. Each selected tweet
  is also printed, in its original language, together with its label.

  Args:
    json_set_path: Path to a JSON file mapping tweet ids to objects that
      contain at least the keys ``'tweet'`` and ``'lang'``.
    nr_samples: Number of examples to collect; iteration stops as soon as
      this many labelled tweets have been added.

  Returns:
    One string with all examples concatenated. Every example consists of
    the module-level ``prompt_context``, the text ``TWEET:`` followed by the
    tweet, the closing ``[/INST]`` tag, the gold label and ``</s>``.
    The string is empty when ``nr_samples`` is 0 or no tweet has a label.
  """
  # The encoding is stated explicitly so that reading the (non-ASCII) tweets
  # does not depend on the platform's default encoding.
  with open(train_gold_path, 'r', encoding='utf-8') as istr:
    gold_entries = json.load(istr)
  labels_by_id = {entry["id"]: entry for entry in gold_entries}

  with open(json_set_path, 'r', encoding='utf-8') as istr:
    tweets_by_id = json.load(istr)

  samples = []
  for key, value in tweets_by_id.items():
    if len(samples) == nr_samples:
      break

    # Only a missing id (or a gold entry without 'value') means "no label";
    # any other error is a real problem and must not be hidden.
    try:
      label = labels_by_id[key]['value']
    except KeyError:
      continue

    tweet = value['tweet']
    print(tweet + ' // ' + label)
    if value['lang']=='es':
      tweet = GoogleTranslator(source='es', target='en').translate(tweet)

    samples.append(prompt_context+"TWEET:"+tweet + ' [/INST] \n ' + label + "\n</s> \n\n ")

  return ''.join(samples)

In [15]:
# Build the few-shot prompt prefix from 10 labelled tweets of the training set.
few_shot_samples = create_few_shot_samples_task1(
    json_set_path=train_path,
    nr_samples=10,
)

@TheChiflis Ignora al otro, es un capullo.El problema con este youtuber denuncia el acoso... cuando no afecta a la gente de izquierdas. Por ejemplo, en su video sobre el gamergate presenta como "normal" el acoso que reciben Fisher, Anita o Zöey cuando hubo hasta amenazas de bomba. // YES
@ultimonomada_ Si comicsgate se parece en algo a gamergate pues muy bien por el acoso. Y si se está haciendo un sabotaje porque hay personajes que no os gustan entonces gracias por darme la razón. Sois unos lloricas ofendidos. // NO
@Steven2897 Lee sobre Gamergate, y como eso ha cambiado la manera en la cual nos comunicamos en el internet. Los fanboys de Halo están tóxicos pero los fanboys de otras comunidades/juegos también han querido coger pauta con eso 🤷🏾‍♂️ // NO
@novadragon21 @icep4ck @TvDannyZ Entonces como así es el mercado lo mejor no es hacer algo para cambiarlo y seguir alimentando el machismo en los consumidores en lugar apoyar a gente como las víctimas del gamergate.Acerca de lo otro, el "

In [16]:
# Show the assembled few-shot prompt prefix for inspection.
print(few_shot_samples)

<s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:@TheChiflis Ignore the other one, he's an idiot. The problem with this YouTuber is that he denounces harassment... when it doesn't affect people on the left. For example, in his video about gamergate he presents as "normal" the harassment that Fisher, Anita or Zöey receive when there were even bomb threats. [/INST] 
 YES
</s> 

 <s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:@ultimonomada_ If comicsgate is anything like gamergate, then good for the harassment. And if sabotage is being done because there are characters that you don't like, then thank

# Run!

In [19]:
replies_list = ['YES','NO']

# Number of test tweets to classify. The default of 1 keeps the previous
# behaviour of this cell (it stopped after the first tweet); set it to None to
# classify the whole test set.
MAX_TEST_SAMPLES = 1


def _parse_reply(raw_text, allowed_replies, default='NO'):
  """Return the first word of ``raw_text`` if it is an allowed reply, else ``default``.

  The model usually keeps generating after the label (the label is followed
  by an explanation), so only the first whitespace-separated token is used.
  It is stripped of surrounding punctuation and upper-cased before being
  compared with ``allowed_replies``.
  """
  words = str(raw_text).split()
  if not words:
    return default
  candidate = words[0].strip('.,;:!?"\'').upper()
  return candidate if candidate in allowed_replies else default


# simple JSON loading
with open(test_path, 'r', encoding='utf-8') as istr:
    test_json = json.load(istr)
num_sample = len(test_json)

# Opening in 'w' mode discards the answers of any previous run; one label is
# written per line, in the order of the test file.
with open('answer.txt', 'w') as answer_file:
  for sample_nr, (key, value) in enumerate(test_json.items()):
    if MAX_TEST_SAMPLES is not None and sample_nr >= MAX_TEST_SAMPLES:
      break

    tweet = value['tweet']
    if value['lang']=='es':
      tweet = GoogleTranslator(source='es', target='en').translate(tweet)

    # NOTE(review): unlike the few-shot examples, the query carries neither
    # the instruction text nor the "TWEET:" prefix. Kept unchanged so results
    # stay comparable with earlier runs; confirm whether this is intended.
    current_sample = '[INST]\n'+tweet+'\n[/INST]'
    prompt = few_shot_samples+current_sample

    print(prompt)

    response = lcpp_llm(
          prompt=prompt,
          temperature= 0.2,
          logprobs=1,
          #max_tokens =1
        )

    print(response)

    # Anything that is not one of the allowed replies falls back to 'NO'.
    answer = _parse_reply(response["choices"][0]["text"], replies_list)
    print(answer)

    print("GENERATED: "+ current_sample+answer)

    answer_file.write(answer+"\n")
    # Flush after every sample so partial results survive an interrupted run.
    answer_file.flush()


<s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:@TheChiflis Ignore the other one, he's an idiot. The problem with this YouTuber is that he denounces harassment... when it doesn't affect people on the left. For example, in his video about gamergate he presents as "normal" the harassment that Fisher, Anita or Zöey receive when there were even bomb threats. [/INST] 
 YES
</s> 

 <s>[INST] Is the following TWEET sexist, in any form, or does it describe situations in which such discrimination occurs (i.e., it is sexist itself, describes a sexist situation or criticizes a sexist behaviour)? Reply only with YES or NO.
TWEET:@ultimonomada_ If comicsgate is anything like gamergate, then good for the harassment. And if sabotage is being done because there are characters that you don't like, then thank

Llama.generate: prefix-match hit

llama_print_timings:        load time =    1255.65 ms
llama_print_timings:      sample time =       8.46 ms /    16 runs   (    0.53 ms per token,  1891.25 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    1872.72 ms /    16 runs   (  117.05 ms per token,     8.54 tokens per second)
llama_print_timings:       total time =    1929.83 ms /    17 tokens


{'id': 'cmpl-a16a090f-b61b-4aa9-9d0c-a70eecb9f84b', 'object': 'text_completion', 'created': 1715252188, 'model': '/root/.cache/huggingface/hub/models--TheBloke--Mistral-7B-Instruct-v0.2-GGUF/snapshots/3a6fbf4a41a1d52e415a4958cde6856d34b2db93/mistral-7b-instruct-v0.2.Q6_K.gguf', 'choices': [{'text': ' \n NO\n\nThis tweet does not contain any sexist language or', 'index': 0, 'logprobs': {'tokens': [' ', '\n', ' NO', '\n', '\n', 'This', ' twe', 'et', ' does', ' not', ' contain', ' any', ' sex', 'ist', ' language', ' or'], 'text_offset': [4897, 4898, 4899, 4902, 4903, 4904, 4908, 4912, 4914, 4919, 4923, 4931, 4935, 4939, 4942, 4951], 'token_logprobs': [-0.27291778, -0.053139508, -0.0018108174, -0.10742129, -0.00057633, -1.4971123, -0.06931656, -3.5762778e-07, -0.40130493, -6.520536e-05, -0.09280562, -0.040356763, -0.10006745, -0.0005918181, -0.19531919, -0.038368657], 'top_logprobs': [{' ': -0.27291778}, {'\n': -0.053139508}, {' NO': -0.0018108174}, {'\n': -0.10742129}, {'\n': -0.00057633}

In [None]:
# Total number of 'utterances' entries over all items of test_1_json.
# NOTE(review): `test_1_json` is not defined in the cells visible in this
# notebook and looks like a leftover from another notebook; confirm before
# running this cell.
counter = sum(
    len(test_1_json[i]['utterances'])
    for i in range(0, len(test_1_json))
)

print(counter)

1580


In [None]:
# Keep only the lines of the saved notebook that contain a 'GENERATED:' marker
# and store them in 'newfile'.
# NOTE(review): the file name below refers to a different notebook export;
# confirm that it is the file that should be parsed here.
NOTEBOOK_EXPORT_PATH = 'SemEval_2024_Task_10_Mistral7B_MSiino (1).ipynb'

# Read the export once (instead of twice) and close it deterministically.
with open(NOTEBOOK_EXPORT_PATH) as notebook_file:
    result_not_filtered = [line for line in notebook_file if 'GENERATED:' in line]

# 'newfile' is only (re)written after the export has been read successfully.
with open('newfile', 'w') as filtered_file:
    filtered_file.writelines(result_not_filtered)


In [None]:
# Create answer.txt, or empty it if it already exists.
with open('answer.txt', 'w'):
    pass

In [None]:
# Take the last whitespace-separated token of every line in 'newfile', remove
# the trailing literal `\n",` / `\n"` text (presumably left over from the
# notebook's JSON encoding — verify against the file), print it and append it
# to answer.txt, one label per line.
# Both files use distinct names and are opened once, so the output handle no
# longer shadows the file being iterated.
with open('newfile') as generated_file, open('answer.txt', 'a') as answer_file:
    for line in generated_file:
        x = line.split()
        if not x:
            # A blank line carries no label; skip it instead of failing.
            continue
        result = x[-1].replace('\\n",','')
        result = result.replace('\\n"','')
        print (result)
        answer_file.write(result+"\n")

sadness
neutral
fear
neutral
anger
disgust
neutral
neutral
neutral
neutral
neutral
neutral
disgust
fear
fear
neutral
neutral
joy
neutral
neutral
contempt
joy
neutral
neutral
sadness
disgust
fear
surprise
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
disgust
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
joy
joy
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
joy
neutral
neutral
neutral
neutral
neutral
fear
joy
neutral
neutral
neutral
joy
neutral
neutral
neutral
fear
disgust
neutral
neutral
neutral
neutral
sadness
sadness
neutral
neutral
neutral
neutral
disgust
neutral
fear
neutral
neutral
fear
neutral
neutral
neutral
neutral
anger
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
fear
neutral
fear
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
neutral
surprise
neutral
joy
neutral
neutral
fear
neutral
neutral
fear
neutral
neutral
neutral
neutral
neutral
fear


In [None]:
# Now fill the remaining lines for the other two tasks.
# Every remaining line receives a random placeholder label, '0.0' or '1.0'.
FIRST_FILL_LINE = 1581
END_FILL_LINE = 17913  # exclusive upper bound

# A dedicated, seeded generator makes the placeholder lines reproducible.
fill_rng = random.Random(0)

# The file is opened once instead of once per written line.
with open('answer.txt', 'a') as answer_file:
  for _ in range(FIRST_FILL_LINE, END_FILL_LINE):
    r_string_value = str(fill_rng.randint(0, 1))+'.0'
    answer_file.write(r_string_value+"\n")


In [None]:
# Compress answer.txt into results.zip.
!zip -r results.zip answer.txt

  adding: answer.txt (deflated 94%)


In [None]:
# Send results.zip from the Colab runtime to the browser as a download
# (only works when the notebook runs inside Google Colab).
from google.colab import files
files.download('results.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>