In [1]:
# Useful starting lines
%matplotlib inline
import pandas as pd
%load_ext autoreload
%autoreload 2

In [2]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [3]:
# Main external library : Natural Language Toolkit (nltk)
import nltk
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

nltk.download('punkt')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/teframartin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Load tweets

In [4]:
DATA_TRAIN_PATH = '../data/cleaned_data.csv'

# Load the train dataset preprocessed
df = pd.read_csv(DATA_TRAIN_PATH, sep=',', index_col=0)
df.sample(10)

Unnamed: 0,text,labels
87913,huey duewey louie still primary school 75 yea...,1
166444,know talking remember story poor lmfao,0
143803,shouldve brought si,0
180915,mr patrick add 3 mark thing add 3 mark test 1...,0
87510,master windmill shot mini golf count,1
161173,definitely cry topic father,0
27903,nice really cool follow miss 3,1
137711,chruch today mood go,0
160100,noo forgot charge ipod cannot go day without,0
66406,get give,1


### Creation of the model

1. Initialize a task-specific model

In [5]:
model = ClassificationModel("roberta", "roberta-base", use_cuda=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

2. Train the model with train_model()

In [7]:
model_args = ClassificationArgs(num_train_epochs=1, overwrite_output_dir=True, manual_seed=42)
model = ClassificationModel(model_type='roberta', model_name='roberta-base', use_cuda=False, num_labels=2, args=model_args)
model.train_model(df)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

  0%|          | 0/181320 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/22665 [00:00<?, ?it/s]

KeyboardInterrupt: 

3. Evaluate the model

In [None]:
# sample an other part of the train full dataset and measure the accuracy
# with eval_model()
result, model_outputs, wrong_preds = model.eval_model(df.sample(1000))

  0%|          | 0/1000 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Running Evaluation:   0%|          | 0/125 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import numpy as np
from sklearn import f1_score

predictions = []
for x in model_outputs:
    predictions.append(np.argmax(x))
print('f1 score:', f1_score(df["labels"], predictions))

4. Make predictions on (unlabelled) data

In [None]:
# create the submission file on the test dataset
#  with predict()