<a href="https://colab.research.google.com/github/milanschroeder/TRIAS-paper1/blob/main/11_digitality_zs.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Zeroshot Classification: Digital Sub-Topics

In [20]:
# make sure to run on GPU:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# load modules:
import pandas as pd
import numpy as np
import time
import os

# load data:
output_path = '/content/drive/MyDrive/TRIAS/zs_subtopics.csv'
input_path = '/content/drive/MyDrive/TRIAS/all_unique_texts.csv' # with id, text, target
df = pd.read_csv(input_path)
df['n_chars_para'] = df['text'].apply(lambda x: len(x))

### if need to restart (due to Colab Timeout):
classified_communication = pd.read_csv(output_path)
df = df[
    (~df.index.isin(classified_communication.index)) &
    ~((df['n_chars_para'] < 35) & (df['para_type'] == "paragraph"))
    ].sample(frac=1, random_state=42).sort_values(by='doc_pos')

Wed May 22 14:21:20 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA L4                      Off | 00000000:00:03.0 Off |                    0 |
| N/A   42C    P8              12W /  72W |      1MiB / 23034MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [21]:
# have a look at the data
df

Unnamed: 0,id,doc_pos,text,doc_key,n_chars_para,para_type,lang,doc_type,target_max,target_mode,coder_ID
1198277,1996417,1,Speech by Commissioner Vella at the AgriFish C...,detail/en/speech_19_7304,97,title,en,Speech,,,
1201790,1992526,1,Remarks by Commissioner Stella Kyriakides at t...,detail/en/speech_23_4003,135,title,en,Speech,,,
1203520,2045762,1,Jacques Santer President of the European Commi...,detail/en/speech_98_230,130,title,en,Speech,,,
1071202,1725971,1,Speech by Commissioner Urpilainen at the Annua...,detail/en/speech_23_1954,110,subheader,en,Speech,,,
1075509,1732933,1,High-level policy conference on,detail/en/speech_16_4684,31,subheader,en,Speech,,,
...,...,...,...,...,...,...,...,...,...,...,...
822999,1203932,3145,Net lending/borrowing vis-à-vis the rest of th...,detail/en/ip_10_288,64,paragraph,en,Press release,,,
823000,1204139,3352,"2 Based on estimated potential growth of 1.5%,...",detail/en/ip_10_288,104,paragraph,en,Press release,,,
823002,1204141,3354,4 Data for revenue and expenditure are not pro...,detail/en/ip_10_288,387,paragraph,en,Press release,,,
823003,1204142,3355,5 Cyclically-adjusted balance excluding one-of...,detail/en/ip_10_288,260,paragraph,en,Press release,,,




# Zeroshot Classification:

In [22]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
new_data = df[:0]

# Record the start time
start_time = time.time()
progress = 0

hypotheses = [
      "This text relates to digital communications.",
      "This text relates to internet technologies.",
      "This text relates to digital services.",
      "This text relates to digital algorithms.",
      "This text relates to digitized data.",
      "This text relates to digital policy."
  ]

label = [
    'digital_communications',
    'internet_technologies',
    'digital_services',
    'digital_algorithms',
    'digitized_data',
    'digital_policy'
]

for i, row in df.iterrows():

  premise = row['text']

  result = classifier(premise, hypotheses, multi_label = True)

  # Extracting the scores
  scores = result['scores']
  labels = result['labels']


  for j in range(len(hypotheses)):
    df.at[i, label[j]] = scores[labels.index(hypotheses[j])]

  new_data = pd.concat([new_data, pd.DataFrame(df.loc[i]).T])

  progress = progress + 1

  if progress % 100 == 0:
    end_time = time.time()
    mean_time = end_time - start_time
    print(progress, "after", mean_time)

    # save new_data:
    new_data.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False) #append df every 1000 classifications
    new_data = new_data[:0] # truncate again

# save last new_data:
if progress % 100 != 0:
  new_data.to_csv(output_path, mode='a', header=not os.path.exists(output_path), index=False)

end_time = time.time()
total_time = end_time - start_time

print("\nTotal time:", total_time, "\nTotal classifications:", progress, "\nmean time per classification:", total_time/progress)

100 after 99.54154968261719
200 after 195.90145921707153
300 after 292.9508571624756
400 after 389.87961769104004
500 after 488.3746449947357
600 after 585.7786979675293
700 after 683.8035454750061
800 after 781.3517699241638
900 after 877.7111203670502
1000 after 973.4547157287598
1100 after 1072.6215748786926
1200 after 1169.9864354133606
1300 after 1268.5662667751312
1400 after 1366.751939535141
1500 after 1466.6190421581268
1600 after 1564.322648525238
1700 after 1661.0268061161041
1800 after 1757.1171443462372
1900 after 1856.1201310157776
2000 after 1953.5841152668
2100 after 2049.4195351600647
2200 after 2145.1935880184174
2300 after 2242.507579803467
2400 after 2337.8787508010864
2500 after 2435.1262719631195
2600 after 2533.9115982055664
2700 after 2630.204003095627
2800 after 2728.94388628006
2900 after 2826.477199316025
3000 after 2924.480701446533
3100 after 3020.8077852725983
3200 after 3118.057915687561
3300 after 3215.425691127777
3400 after 3311.026135444641
3500 after 