In [None]:
# Install the `transformers` and `simpletransformers` packages via shell escapes.
# NOTE(review): no versions are pinned, so a later run may pull different releases.
!pip install transformers
!pip install simpletransformers

In [21]:
import ast
import datetime
import logging
import os
import time
# import ipdb

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from torch.utils.data import TensorDataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MultiLabelBinarizer

from simpletransformers.classification import ClassificationModel, ClassificationArgs
from simpletransformers.classification import (
    MultiLabelClassificationModel, MultiLabelClassificationArgs
)
from transformers import BertTokenizer, BertModel, BertConfig

In [None]:
# Mount Google Drive at /content/drive using the google.colab helper.
# NOTE(review): the CSV paths read later are relative ("data_round_1/..."), not under
# /content/drive — confirm the working directory when running on Colab.
from google.colab import drive
drive.mount('/content/drive')

In [2]:
# Folder that holds the round-1 CSV files; change it here to relocate the data.
DATA_DIR = "data_round_1"


def _read_split(kind, split):
    """Read ``<DATA_DIR>/<kind>_en_<split>.csv`` into a DataFrame.

    kind  -- file prefix, "documents" or "sentences"
    split -- data split suffix, "train", "val" or "test"
    """
    return pd.read_csv(f"{DATA_DIR}/{kind}_en_{split}.csv")


documents_train = _read_split("documents", "train")
sentences_train = _read_split("sentences", "train")

documents_val = _read_split("documents", "val")
sentences_val = _read_split("sentences", "val")

documents_test = _read_split("documents", "test")
sentences_test = _read_split("sentences", "test")
# NOTE(review): immap_sector_name_to_id.json is mentioned here but never loaded in this
# notebook — presumably the sector name -> id mapping; confirm before relying on it.

In [None]:
# Prefer the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Display the torch device selected in the previous cell.
device

## Investigate document data

In [None]:
# Preview the first rows of the document-level training table.
documents_train.head()

## Investigate sentence-level (full text) data

In [None]:
# Preview the first rows of the sentence-level training table.
sentences_train.head()

In [None]:
# Number of sentences in each training document (one row per doc_id).
counts = sentences_train[["sentence_id", "doc_id"]].groupby("doc_id").count()
# The statistic is the count of sentences per document, so label it as such and take
# the median from the named column instead of flattening the whole frame.
print("Median number of sentences per document:", counts["sentence_id"].median())
counts

In [None]:
# Histogram of document sizes: how many sentences each training document contains.
# (matplotlib is already imported in the notebook's import cell.)
counts = sentences_train.groupby("doc_id").count()
x = counts["sentence_id"]
fig, ax = plt.subplots()
ax.hist(x)
ax.set(
    title="Sentences per training document",
    xlabel="Number of sentences",
    ylabel="Number of documents",
)
plt.show()

In [None]:
# Flatten every sentence's sector ids into one list for the distribution plot.
# sector_ids is stored as text such as "[2, 5]"; parse it as a Python literal so that
# multi-digit (or negative) ids stay intact instead of being split into characters.
# Ids are kept as strings so the plot still treats them as categories.
all_sectors = [
    str(sector_id)
    for raw_ids in sentences_train["sector_ids"]
    for sector_id in ast.literal_eval(raw_ids)
]

In [None]:
# Plot how often each sector id occurs, with the ids in sorted order.
ordered_sectors = sorted(all_sectors)
plt.hist(ordered_sectors)
plt.show()

In [None]:
# Emit INFO-level messages in general, but only WARNING and above from "transformers".
transformers_logger = logging.getLogger("transformers")
logging.basicConfig(level=logging.INFO)
transformers_logger.setLevel(logging.WARNING)

In [None]:
# Display the sentence_text column of the training sentences.
sentences_train['sentence_text']

In [None]:
# Training frame for the binary relevance classifier: sentence text plus is_relevant.
# The target column is named "labels" because simpletransformers looks for the
# "text"/"labels" headers; any other name triggers a warning and a positional fallback.
train_df = pd.DataFrame({
    'text': sentences_train['sentence_text'],
    'labels': sentences_train["is_relevant"],
})

In [None]:
# Validation frame for the binary relevance classifier: sentence text plus is_relevant.
# The target column is named "labels" because simpletransformers looks for the
# "text"/"labels" headers; any other name triggers a warning and a positional fallback.
eval_df = pd.DataFrame({
    'text': sentences_val['sentence_text'],
    'labels': sentences_val["is_relevant"],
})

In [None]:
# Training options for the binary classifier: a single pass over the training data.
binary_training_options = {"num_train_epochs": 1}
model_args = ClassificationArgs(**binary_training_options)

In [None]:
# Create the binary relevance classifier from the pretrained DistilBERT checkpoint.
# model_args (built in the previous cell) is now actually handed to the model;
# before, it was created but never used.
# NOTE(review): use_cuda=False forces CPU even though a device is detected earlier in
# the notebook — kept as in the original; confirm whether this is intentional.
model = ClassificationModel(
    'distilbert', 'distilbert-base-uncased', args=model_args, use_cuda=False
)

In [None]:
# Train the binary relevance model.
# overwrite_output_dir is a model setting, so it has to go through `args`; extra keyword
# arguments to train_model are interpreted as additional metric functions and would not
# enable overwriting of the output directory.
model.train_model(train_df, args={"overwrite_output_dir": True})

In [None]:
# CUDA_LAUNCH_BLOCKING is read from the process environment; a plain Python variable
# of that name has no effect on CUDA, so set the environment variable explicitly.
# NOTE(review): this presumably needs to be set before CUDA is first initialised to
# take effect — consider moving it to the top of the notebook.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
CUDA_LAUNCH_BLOCKING = 1  # kept so the notebook-level name from the original cell still exists

In [None]:
# Evaluate the model on eval_df; the three returned values are unpacked into
# result, model_outputs and wrong_predictions.
result, model_outputs, wrong_predictions = model.eval_model(eval_df)

In [None]:
# Display the first value returned by eval_model.
result

In [None]:
# Predict on the test sentences; the text column is cast to str and passed as a plain list.
test_texts = sentences_test['sentence_text'].astype(str).values.tolist()
predictions, raw_outputs = model.predict(test_texts)

In [None]:
# Number of predictions returned for the test sentences.
len(predictions)

In [None]:
# Submission table: the test identifiers plus the predicted relevance flag.
submission_columns = {
    'doc_id': sentences_test['doc_id'],
    'sentence_id': sentences_test["sentence_id"],
    'is_relevant': predictions,
}
submit_df = pd.DataFrame(submission_columns)

In [None]:
# Write the baseline submission. to_csv does not create missing folders, so make sure
# the target directory exists first.
os.makedirs("submissions", exist_ok=True)
# NOTE(review): the file name has no ".csv" extension; kept unchanged in case downstream
# steps expect exactly this path.
submit_df.to_csv("submissions/submission_relevant_baseline", index=False)

### Multi-Label Classification

In [3]:
# Emit INFO-level messages in general, but only WARNING and above from "transformers".
transformers_logger = logging.getLogger("transformers")
logging.basicConfig(level=logging.INFO)
transformers_logger.setLevel(logging.WARNING)

In [4]:
# Keep only the sentences flagged as relevant (is_relevant == 1) in both splits.
train_is_relevant = sentences_train["is_relevant"].eq(1)
val_is_relevant = sentences_val["is_relevant"].eq(1)
sentences_train = sentences_train[train_is_relevant]
sentences_val = sentences_val[val_is_relevant]

In [5]:
def process_sector_ids(sentences, classes=None):
    """Turn the ``sector_ids`` column into a multi-hot label matrix.

    Each entry of ``sentences["sector_ids"]`` is either a list of sector ids or its
    text form (e.g. ``"[2, 5]"``). The text form is parsed as a Python literal, so
    multi-digit or negative ids are kept whole instead of being split into single
    characters, and no punctuation columns have to be dropped afterwards.

    Parameters
    ----------
    sentences : DataFrame-like with a ``"sector_ids"`` column.
    classes : optional sequence of sector ids fixing the column order. When None
        (the default), the sorted ids found in ``sentences`` are used. Pass the same
        value for every split to get identically laid out matrices.

    Returns
    -------
    numpy.ndarray of shape (n_sentences, n_classes + 1). The last column is the
    "no sector" indicator: 1 for rows that have none of the class columns set.
    """
    def _parse(raw):
        # Text form from the CSV -> real list of ids.
        if isinstance(raw, str):
            raw = ast.literal_eval(raw)
        return [int(sector_id) for sector_id in raw]

    parsed = [_parse(raw) for raw in sentences["sector_ids"]]

    mlb = MultiLabelBinarizer(classes=classes)
    sector_matrix = mlb.fit_transform(parsed)

    # Rows without any sector get the trailing catch-all column set; built directly
    # as an array to avoid chained assignment on a DataFrame.
    no_sector = (sector_matrix.sum(axis=1) == 0).astype(sector_matrix.dtype)
    return np.column_stack([sector_matrix, no_sector])

In [6]:
# Build the label matrices for the training and validation sentences.
# NOTE(review): process_sector_ids fits a fresh MultiLabelBinarizer on each call, so the
# two matrices only share a column layout if both splits contain the same set of
# sector ids — verify.
train_labels = process_sector_ids(sentences_train)
val_labels = process_sector_ids(sentences_val)

In [9]:
# Wrap both label matrices in DataFrames (one row per sentence).
train_labels, val_labels = (
    pd.DataFrame(matrix) for matrix in (train_labels, val_labels)
)

In [10]:
# Renumber the rows from 0; reset_index() keeps the previous index as an "index" column.
sentences_train, sentences_val = (
    frame.reset_index() for frame in (sentences_train, sentences_val)
)

In [13]:
# Store each row of the label frames as one NumPy array in a "labels" column.
sentences_train['labels'] = pd.Series(
    [np.array(row) for _, row in train_labels.iterrows()], index=train_labels.index
)
sentences_val['labels'] = pd.Series(
    [np.array(row) for _, row in val_labels.iterrows()], index=val_labels.index
)

In [14]:
# Display the training sentences, now including the new "labels" column.
sentences_train

Unnamed: 0,index,doc_id,sentence_id,sentence_text,is_relevant,sector_ids,labels
0,14,51657,14,One in five children in the country has no acc...,1,"[2, 5]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
1,15,51657,15,Many orphaned youth live in poverty and have l...,1,"[2, 5]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
2,16,51657,16,Some children remain living with a single pare...,1,"[2, 5]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
3,17,51657,17,Other youth live in shelters or on the streets.,1,"[2, 5]","[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
4,19,51542,0,WFP Colombia Country Brief December 2020 Opera...,1,[3],"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
...,...,...,...,...,...,...,...
41890,261836,34549,46,Anticipatory action is critical to safeguard t...,1,[],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
41891,261841,34549,51,Cash+ in the form of cash transfers and provis...,1,[0],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
41892,261842,34549,52,The focus of this intervention is to support v...,1,[0],"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
41893,261843,34549,53,Cash+ in the form of cash transfers and provis...,1,[8],"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"


In [16]:
# Keep only the sentence text and its label vector, as independent copies.
kept_columns = ['sentence_text', 'labels']
sentences_train = sentences_train.loc[:, kept_columns].copy()
sentences_val = sentences_val.loc[:, kept_columns].copy()

In [17]:
# Rename the two remaining columns to "text" and "labels" in both splits.
for frame in (sentences_train, sentences_val):
    frame.columns = ["text", "labels"]

In [27]:
# Training options for the multi-label classifier: a single pass over the training data.
multilabel_training_options = {"num_train_epochs": 1}
model_args = MultiLabelClassificationArgs(**multilabel_training_options)

In [30]:
# Create the multi-label sector classifier from the pretrained DistilBERT checkpoint.
# model_args (built in the previous cell) is now actually handed to the model;
# before, it was created but never used.
# num_labels=11 matches the 11-element label vectors shown in the displayed training frame.
# NOTE(review): use_cuda=False forces CPU even though a device is detected earlier in
# the notebook — kept as in the original; confirm whether this is intentional.
model = MultiLabelClassificationModel(
    "distilbert",
    "distilbert-base-uncased",
    num_labels=11,
    args=model_args,
    use_cuda=False,
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForMultiLabelSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForMultiLabelSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForMultiLabelSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForMultiLabelSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier

In [31]:
# Display the final training frame with its "text" and "labels" columns.
sentences_train

Unnamed: 0,text,labels
0,One in five children in the country has no acc...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
1,Many orphaned youth live in poverty and have l...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
2,Some children remain living with a single pare...,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
3,Other youth live in shelters or on the streets.,"[0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0]"
4,WFP Colombia Country Brief December 2020 Opera...,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
...,...,...
41890,Anticipatory action is critical to safeguard t...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
41891,Cash+ in the form of cash transfers and provis...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
41892,The focus of this intervention is to support v...,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
41893,Cash+ in the form of cash transfers and provis...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0]"


In [32]:
# Train the multi-label model.
# The binary model trained earlier in this notebook has already written to the default
# output directory, and train_model refuses to start on a non-empty directory unless
# overwrite_output_dir is enabled, so enable it through `args`.
# NOTE(review): this replaces the earlier model's saved files; pass a separate
# output_dir instead if both checkpoints must be kept.
model.train_model(sentences_train, args={"overwrite_output_dir": True})

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


  0%|          | 0/41895 [00:00<?, ?it/s]

INFO:simpletransformers.classification.classification_utils: Saving features into cached file cache_dir/cached_train_distilbert_128_0_2


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/5237 [00:00<?, ?it/s]

KeyboardInterrupt: 