In [73]:
import json

import pandas as pd
import torch

from development.datasets.OsdgDataset import load_osdg_data
from development.datasets.uclmodules_dataset import load_uclmodules_data
from development.datasets.videscription_dataset import load_videscription_data
from development.train_model import fine_tune_transformer
from development.models.Bert import Bert
from development.models.RobertaNER import RobertaNER
from development.pipelines import full_pipe
from development.scrape.RelxScraper import RelxScraper
from development.utils import parse_sdg_label

In [49]:
%load_ext autoreload
%autoreload 2
pd.set_option('max_colwidth', None)

In [78]:
# Load the project configuration and keep a handle on its 'development' section.
# JSON is UTF-8 by specification, so the encoding is pinned instead of being
# left to the platform's locale default.
with open('config.json', 'r', encoding='utf-8') as file:
    CONFIG = json.load(file)

# Looked up after the file is closed; only the parsed dict is needed here.
dev_config = CONFIG['development']


In [361]:
# Load the OSDG data with training=False and no agreement filtering; the next
# cell groups this object by its 'sdg' column.
osdg_data = load_osdg_data(
    dev_config['osdg_data_path'],
    training=False,
    filter_agreement=False,
)

In [367]:
# Count the non-null 'text' entries for each 'sdg' value.
osdg_data.groupby('sdg')['text'].count()

sdg
1     2734
2     2457
3     2689
4     3740
5     4338
6     2815
7     3048
8     1509
9     2105
10    2032
11    2277
12    1108
13    2102
14    1141
15    2143
16    5451
Name: text, dtype: int64

In [357]:
# Reload the OSDG data with training=True; later cells index the result with
# 'train' and 'test' keys.
osdg_data = load_osdg_data(
    dev_config['osdg_data_path'],
    training=True,
    filter_agreement=False,
)

In [332]:
# Load the UCL modules data from the path configured for development.
uclmodules_path = dev_config['uclmodules_data_path']
ucl_data = load_uclmodules_data(uclmodules_path)

In [5]:
# Load the videscription data from the path configured for development.
videscription_path = dev_config['videscription_data_path']
videscription_data = load_videscription_data(videscription_path)

In [None]:
# Instantiate the Bert wrapper with its default arguments; its .model,
# .tokenizer and .tokenizer_args attributes feed the fine-tuning call below.
bert = Bert()

In [None]:
# Fine-tune the Bert wrapper's model on the OSDG data using the development
# configuration; the return value is kept in `result`.
result = fine_tune_transformer(
    bert.model, bert.tokenizer, bert.tokenizer_args,
    data=osdg_data, dev_config=dev_config,
)

In [169]:
# Rebuild the Bert wrapper from a saved checkpoint directory.
# NOTE(review): presumably this restores the fine-tuned weights; confirm in Bert.
bert_checkpoint_dir = './development/weights/Bert/checkpoint-3000/'
bert = Bert(bert_checkpoint_dir)

In [None]:
# Evaluate the classifier on the 'test' entry of the OSDG data and print the
# resulting report.
osdg_test_split = osdg_data['test']
cls_report = bert.evaluate(osdg_test_split)
print(cls_report)

In [48]:
# Instantiate the RobertaNER wrapper with its default arguments; it is used
# below for entity printing and as an input to full_pipe.
ner_model = RobertaNER()

In [27]:
# Print the entities found in one training example (position 14 of the first
# 'train' entry).
ner_sample_text = osdg_data['train'][0][14]
ner_model.print_entities(ner_sample_text)

ORG: The World Resources Institute
ORG: World Resources Institute


In [76]:
# Run the combined classifier + NER pipeline on a single training example
# (position 7 of the first 'train' entry), passed as a one-element list.
pipeline_sample_text = osdg_data['train'][0][7]
df = full_pipe(bert, ner_model, texts=[pipeline_sample_text])

Unnamed: 0,Text,SDG,Entities,Sentiment
0,"This is why the Sustainable Development of Protected Areas System of Ethiopia was set up, with support from the Global Environment Fund and UNDP. The project is spearheading a suite of interventions, focusing on the national system in terms of capacity building and training, and integrating the protected area system into mainstream development. Since the initiation of the project in 2008, valuation exercises have found that the main value of protected areas is in the environmental services that they provide to poor rural communities, many of which are food-insecure, protected areas were incorporated into the Ethiopia Poverty Strategy, and the legal boundaries of the protected area system were strengthened by supporting the demarcation and gazettement of four areas through a highly consultative process (UNDP, n.d.).",Climate Action,ORG: the Sustainable Development of Protected Areas System of Ethiopia - ORG: the Global Environment Fund - ORG: UNDP - ORG: UNDP -,


In [138]:
# Scrape the RELX data starting from offset 0, then persist it as CSV.
relx_csv_path = './data/relx_data.csv'

relx_scraper = RelxScraper()
relx_scraper.scrape_data(start=0)
relx_scraper.save_as_csv(relx_csv_path)

In [262]:
# Classify every scraped abstract in one batch call.
relx_abstracts = relx_scraper.data['abstract'].values
predictions = bert.predict_batch(relx_abstracts)

In [369]:
# Score the classifier on the UCL modules data; the bare expression displays
# the returned value as the cell output.
# NOTE(review): add_one=True presumably offsets the label indices by one;
# confirm against Bert.evaluate_multilabel.
bert.evaluate_multilabel(ucl_data, add_one=True)

0.3105493133583021

In [70]:
from development.datasets.RelxDataset import load_relx_data
from development.train_model import fine_tune_transformer_2
from development.models.BertMultiLabel import BertMultiLabel

In [61]:
# Load the RELX data in training mode; later cells index the result with
# 'train' and 'test' keys.
relx_training_data = load_relx_data(
    data_path='./data/relx_data_2.csv',
    training=True,
)

In [48]:
# Raw view of the RELX CSV for ad-hoc inspection. It is bound to its own name
# so that it does not overwrite `relx_training_data`, which the following cells
# index with 'train' / 'test' and pass to fine_tune_transformer_2.
relx_raw_df = pd.read_csv('./data/relx_data_2.csv')

In [66]:
# Shape of the first component of the 'train' entry.
relx_train_first = relx_training_data['train'][0]
relx_train_first.shape

(825,)

In [71]:
# Instantiate the BertMultiLabel wrapper with its default arguments; its .model,
# .tokenizer and .tokenizer_args attributes feed the fine-tuning call below.
bert_multilabel = BertMultiLabel()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [80]:
# Fine-tune the multi-label wrapper's model on the RELX training data using the
# development configuration; the return value is kept in `results`.
results = fine_tune_transformer_2(
    bert_multilabel.model, bert_multilabel.tokenizer, bert_multilabel.tokenizer_args,
    data=relx_training_data, dev_config=dev_config,
)



  0%|          | 0/104 [00:00<?, ?it/s]

  key: val[idx].clone().detach() for key, val in self.__encodings.items()


{'loss': 0.5678, 'learning_rate': 5.2e-06, 'epoch': 1.0}


  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.5003253314201785, 'eval_runtime': 2.6619, 'eval_samples_per_second': 66.494, 'eval_steps_per_second': 4.508, 'epoch': 1.0}


  key: val[idx].clone().detach() for key, val in self.__encodings.items()


{'loss': 0.431, 'learning_rate': 1.04e-05, 'epoch': 2.0}


  0%|          | 0/12 [00:00<?, ?it/s]

{'eval_loss': 0.35986236015618855, 'eval_runtime': 2.7348, 'eval_samples_per_second': 64.721, 'eval_steps_per_second': 4.388, 'epoch': 2.0}
{'train_runtime': 82.7052, 'train_samples_per_second': 19.95, 'train_steps_per_second': 1.257, 'train_loss': 0.49941627795879656, 'epoch': 2.0}


In [81]:
from transformers import TextClassificationPipeline

In [82]:
# Rebuild the multi-label wrapper from a saved checkpoint directory.
# NOTE(review): presumably this restores the fine-tuned weights; confirm in
# BertMultiLabel.
multilabel_checkpoint_dir = './development/weights/Bert/checkpoint-104/'
bert_multilabel = BertMultiLabel(multilabel_checkpoint_dir)

In [83]:
# Wrap the fine-tuned multi-label model in a text-classification pipeline.
# top_k=None requests a score for every label rather than only the best one;
# inputs are truncated to 512 tokens.
# The device is picked at runtime: GPU 0 when CUDA is available, otherwise
# CPU (-1), so the cell also runs on machines without a GPU.
pipe = TextClassificationPipeline(
    model=bert_multilabel.model,
    tokenizer=bert_multilabel.tokenizer,
    top_k=None,
    device=0 if torch.cuda.is_available() else -1,
    max_length=512,
    truncation=True
)

A matching Triton is not available, some optimizations will not be enabled.
Error caught was: No module named 'triton'


In [93]:
# Display position 5 of the second 'test' component.
# NOTE(review): presumably the multi-hot SDG label vector for that example.
relx_test_second = relx_training_data['test'][1]
relx_test_second[5]

array([0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [92]:
# Display position 5 of the first 'test' component (the input text).
relx_test_first = relx_training_data['test'][0]
relx_test_first[5]

'This study aims to examine quality of diabetes care in persons with type 2 diabetes with and without severe mental illness (SMI).'

In [94]:
# Run the pipeline on the same test text (position 5 of the first 'test'
# component) and display the per-label scores.
relx_probe_text = relx_training_data['test'][0][5]
pipe(relx_probe_text)

[[{'label': 'Good Health and Well-Being', 'score': 0.5890412926673889},
  {'label': 'Climate Action', 'score': 0.3079196512699127},
  {'label': 'Zero Hunger', 'score': 0.26967760920524597},
  {'label': 'Reduced Inequalities', 'score': 0.26396042108535767},
  {'label': 'Clean Water and Sanitation', 'score': 0.24772398173809052},
  {'label': 'Life on Land', 'score': 0.23847320675849915},
  {'label': 'Industry, Innovation, and Infrastructure',
   'score': 0.22737286984920502},
  {'label': 'Sustainable Cites and Communities', 'score': 0.2155924290418625},
  {'label': 'Peace, Justice, and Strong Institutions',
   'score': 0.21529677510261536},
  {'label': 'Decent Work and Economic Growth', 'score': 0.2129502147436142},
  {'label': 'Gender Equality', 'score': 0.2066534459590912},
  {'label': 'Affordable and Clean Energy', 'score': 0.2043519765138626},
  {'label': 'Quality Education', 'score': 0.20322106778621674},
  {'label': 'No Poverty', 'score': 0.19368858635425568},
  {'label': 'Life Bel