In [None]:
!pip install simpletransformers transformers==4.40.2

In [1]:
# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [6]:

# Load the required packages

# Dataframes
import pandas as pd, numpy as np
import matplotlib.pyplot as plt

# Model performance scores
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# from google.colab import drive
# drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
# cd /content/drive/MyDrive/manifestos

/content/drive/MyDrive/manifestos


## Training set: Our model, keyword searches

In [8]:
td = pd.read_csv('training_data.csv')

In [8]:
# Construct binary variables for keyword subsets
td['high_kw']=[np.max([ r['kw_gw'],r['kw_cc'], r['kw_cer'] ]) for _,r in td.iterrows()]
td['any_kw']=[np.max([ r['kw_gw'],r['kw_cc'], r['kw_c'], r['kw_se'], r['kw_cer'] ]) for _,r in td.iterrows()]


In [9]:
print(accuracy_score(td['final_climate'],td['any_kw']))
print(precision_score(td['final_climate'],td['any_kw']))
print(recall_score(td['final_climate'],td['any_kw']))
print(f1_score(td['final_climate'],td['any_kw']))

0.7175305765870704
0.7311203319502074
0.845489443378119
0.7841566533155319


In [11]:
print(accuracy_score(td['final_climate'],td['high_kw']))
print(precision_score(td['final_climate'],td['high_kw']))
print(recall_score(td['final_climate'],td['high_kw']))
print(f1_score(td['final_climate'],td['high_kw']))

0.8308095515433896
0.9068760151597185
0.803742802303263
0.8522004578987535


## Validation set: Our model, keyword searches

In [21]:
# load all predictions
total = pd.read_csv('preds_23May_strict_broad_no_annot.csv')
total.shape

  total = pd.read_csv('preds_23May_strict_broad_no_annot.csv')


(1232000, 93)

In [22]:
# load manual validation annotations of posthoc sample
ph_samp = pd.read_csv('results_22May/posthoc_val_samples.csv')
ph_samp.shape

(3846, 10)

In [23]:
ph_samp = ph_samp.drop_duplicates(subset=['qs_new'])
ph_samp.shape

(3827, 10)

In [24]:
# merge
total = total.merge(ph_samp, on='qs_new',how='outer')

In [25]:
total.shape

(1232000, 102)

In [26]:
# consolidate columns
total.rename(columns={'posthoc_broad_x':'poshoc_broad_pos_only','TP_broad_x':'TP_broad_pos_only',
                      'posthoc_broad_y':'posthoc_broad','TP_broad_y':'TP_broad'},inplace=True)

In [27]:
# restrict to those with 3 or more words
total = total.query('words > 2')
total.shape

(1210046, 102)

In [37]:
# binary variables for keyword subsets
total['kw_high'] = [np.max([ r['kw_gw'],r['kw_cc'], r['kw_cer'] ]) for _,r in total.iterrows()]
total['kw_any'] = [np.max([ r['kw_gw'],r['kw_cc'], r['kw_se'], r['kw_cer'] ]) for _,r in total.iterrows()]

In [28]:
# subset to only the quasi-sentences annotated in the post-hoc validation set
samp = total.query('posthoc_broad_neg_rand == 1 | posthoc_broad_positive_rand == 1 | posthoc_broad_neg_any_kw == 1' )
samp.shape

(2103, 102)

In [39]:
# Calculate performance of predictions for climate relevance versus the true positive annotation variable
print(accuracy_score(samp[f'TP_broad'],samp[f'label_broad_23May'])) # (True positives + True Negatives)/ (True positives + True negatives + False positives + False negatives)
print(precision_score(samp[f'TP_broad'],samp[f'label_broad_23May'])) # True positives/ (True positives + False positives)
print(recall_score(samp[f'TP_broad'],samp[f'label_broad_23May'])) # True Positive (TP) / True Positive (TP) + False Negative (FN)
print(f1_score(samp[f'TP_broad'],samp[f'label_broad_23May']))

0.9567284831193533
0.9502923976608187
0.9193776520509194
0.9345794392523366


In [40]:
# # compare to KW any
print(accuracy_score(samp[f'TP_broad'],samp[f'kw_any'])) # (True positives + True Negatives)/ (True positives + True negatives + False positives + False negatives)
print(precision_score(samp[f'TP_broad'],samp[f'kw_any'])) # True positives/ (True positives + False positives)
print(recall_score(samp[f'TP_broad'],samp[f'kw_any'])) # True Positive (TP) / True Positive (TP) + False Negative (FN)
print(f1_score(samp[f'TP_broad'],samp[f'kw_any']))

0.4574417498811222
0.25452488687782804
0.31824611032531824
0.28284098051539913


In [41]:
# # compare to KW high
print(accuracy_score(samp[f'TP_broad'],samp[f'kw_high'])) # (True positives + True Negatives)/ (True positives + True negatives + False positives + False negatives)
print(precision_score(samp[f'TP_broad'],samp[f'kw_high'])) # True positives/ (True positives + False positives)
print(recall_score(samp[f'TP_broad'],samp[f'kw_high'])) # True Positive (TP) / True Positive (TP) + False Negative (FN)
print(f1_score(samp[f'TP_broad'],samp[f'kw_high']))

0.7161198288159771
0.7835051546391752
0.214992927864215
0.3374028856825749


In [42]:
# calculate language specific performance scores

for language in samp['language'].unique():
  s = samp[samp['language']==language]
  print(language, s.shape[0])
  print('Accuracy:',accuracy_score(s['TP_broad'],s['label_broad_23May']))
  print('Prec:',precision_score(s['TP_broad'],s['label_broad_23May']))
  print('Rec:',recall_score(s['TP_broad'],s['label_broad_23May']))
  print('F1:',f1_score(s['TP_broad'],s['label_broad_23May']))
  print()

swedish 90
Accuracy: 0.9666666666666667
Prec: 0.9655172413793104
Rec: 0.9333333333333333
F1: 0.9491525423728815

danish 89
Accuracy: 0.9662921348314607
Prec: 0.9655172413793104
Rec: 0.9333333333333333
F1: 0.9491525423728815

finnish 88
Accuracy: 0.9545454545454546
Prec: 0.9655172413793104
Rec: 0.9032258064516129
F1: 0.9333333333333333

french 89
Accuracy: 0.9662921348314607
Prec: 0.9333333333333333
Rec: 0.9655172413793104
F1: 0.9491525423728815

dutch 89
Accuracy: 0.9325842696629213
Prec: 0.9666666666666667
Rec: 0.8529411764705882
F1: 0.90625

german 90
Accuracy: 0.9444444444444444
Prec: 0.9666666666666667
Rec: 0.8787878787878788
F1: 0.9206349206349207

italian 90
Accuracy: 0.9111111111111111
Prec: 0.8333333333333334
Rec: 0.8928571428571429
F1: 0.8620689655172413

spanish 90
Accuracy: 0.9333333333333333
Prec: 0.9
Rec: 0.9
F1: 0.9

catalan 89
Accuracy: 0.9662921348314607
Prec: 1.0
Rec: 0.9032258064516129
F1: 0.9491525423728813

galician 89
Accuracy: 0.9662921348314607
Prec: 0.9666666666

## ClimateBert comparison in training set and post-hoc validation set

In [3]:
model_name = 'climatebert/distilroberta-base-climate-detector'
model_type = 'roberta'

In [4]:
model = ClassificationModel(model_type,model_name)



config.json:   0%|          | 0.00/887 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.48k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

### Training set comparison with ClimateBert

In [10]:
# load td
# fill EN text (wouldn't have been translated)
td['trans'].fillna(td['original_text'], inplace=True)

# make sure no NAs
td['trans'].isna().sum()


0

In [11]:
# Run climatebert over translations
preds,output = model.predict(td['trans'].tolist())


  self.pid = os.fork()


  0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/35 [00:00<?, ?it/s]

In [15]:
# calculate metrics
cb_td_predf = pd.DataFrame(zip(td['qs_new'],preds,output),columns=['qs_new','preds','output'])


In [16]:
cb_td_predf.rename(columns={'preds':'pred_cb'},inplace=True)

In [19]:
cb_td_predf = cb_td_predf.merge(td,on='qs_id')

In [20]:
print(accuracy_score(cb_td_predf['final_climate'],cb_td_predf['pred_cb']))
print(precision_score(cb_td_predf['final_climate'],cb_td_predf['pred_cb']))
print(recall_score(cb_td_predf['final_climate'],cb_td_predf['pred_cb']))
print(f1_score(cb_td_predf['final_climate'],cb_td_predf['pred_cb']))

0.8156668608037274
0.7712149532710281
0.9899232245681382
0.866988863206556


### Post-hoc validation set

In [29]:
# same samp dataframe from validation set checks above

# fill EN text (wouldn't have been translated)
samp['trans_y'].fillna(samp['original_text'], inplace=True)

# make sure no NAs
samp['trans_y'].isna().sum()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  samp['trans_y'].fillna(samp['original_text'], inplace=True)


0

In [30]:
# Run ClimateBert over translations
preds,output = model.predict(samp['trans_y'].tolist())

  self.pid = os.fork()


  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/22 [00:00<?, ?it/s]

In [31]:
cb_phv_predf = pd.DataFrame(zip(samp['qs_new'],preds,output),columns=['qs_new','preds','output'])

In [32]:
cb_phv_predf.rename(columns={'preds':'pred_cb'},inplace=True)

In [33]:
cb_phv_predf.shape

(2103, 3)

In [36]:
cb_phv_predf = cb_phv_predf.merge(samp,on='qs_new')

In [40]:
print(accuracy_score(cb_phv_predf['TP_broad'],cb_phv_predf['pred_cb']))
print(precision_score(cb_phv_predf['TP_broad'],cb_phv_predf['pred_cb']))
print(recall_score(cb_phv_predf['TP_broad'],cb_phv_predf['pred_cb']))
print(f1_score(cb_phv_predf['TP_broad'],cb_phv_predf['pred_cb']))

0.750832144555397
0.579496090356212
0.9434229137199435
0.7179763186221745
