In [1]:
!pip install simpletransformers

Collecting simpletransformers
  Downloading simpletransformers-0.63.6-py3-none-any.whl (249 kB)
[K     |████████████████████████████████| 249 kB 5.3 MB/s 
Collecting transformers>=4.6.0
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 32.3 MB/s 
[?25hCollecting wandb>=0.10.32
  Downloading wandb-0.12.14-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 36.2 MB/s 
Collecting streamlit
  Downloading streamlit-1.8.1-py2.py3-none-any.whl (10.1 MB)
[K     |████████████████████████████████| 10.1 MB 23.9 MB/s 
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 33.8 MB/s 
Collecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 45.9 MB/s 
Collecting tokenizers
  Downloading tokenizers-0.12.1-cp37-cp37m-ma

## Set up environment to process training and test data

In [1]:
# Mount Google Drive into the Colab filesystem; the CSV files read by the
# following cell live under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Labelled training messages; this file is decoded with the 'unicode_escape'
# codec.
df_training = pd.read_csv(
    '/content/drive/MyDrive/Wysdom/public_data_bathurst_cleaned.csv',
    encoding='unicode_escape',
)

# Messages to be labelled for the submission; read with pandas' default
# encoding.
df_test = pd.read_csv(
    '/content/drive/MyDrive/Wysdom/input_data.csv',
)

In [3]:
# Preview the first five training rows (id, message, label).
df_training.head(n=5)

Unnamed: 0,id,message,label
0,8793,hi i want change my address from my credit card,updateaddress
1,3083,i need 4 fruit maple oatmeal 3 cold brew froze...,orderdrinkintent
2,5932,i wish to travel next month domestic airway,bookflight
3,12077,i need reimbursement my expenses,expensereport
4,6608,i need a copy of insurance for my car,getproofofinsurance


## Preprocess data being used to fine-tune LLM

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer


In [8]:
# Start the text-cleaning chain from the raw 'message' column.
X_messages = df_training.loc[:, 'message']

In [9]:
!pip install unidecode

# remove weird characters from messages
import unidecode
X_messages = X_messages.apply(lambda x: unidecode.unidecode(x))



In [10]:
import re

# Delete every run of digits from each message (nothing is inserted in its
# place, so surrounding whitespace is left as-is).
digit_run = re.compile(r'\d+')
X_messages = X_messages.apply(lambda message: digit_run.sub('', message))

In [11]:
# Fetch the 'punkt' tokenizer data, then split each message string into a
# list of word tokens.
nltk.download('punkt')
X_messages_tokens = X_messages.apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [12]:
# remove stopwords
nltk.download('stopwords')

# Build the stop-word set once. Previously stopwords.words('english') was
# evaluated inside the comprehension, i.e. once per token, and each membership
# test was a linear search through the returned list.
english_stopwords = frozenset(stopwords.words('english'))

X_messages_tokens = X_messages_tokens.apply(
    lambda tokens: [word for word in tokens if word not in english_stopwords]
)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [14]:
# Fetch the WordNet data used by the lemmatizer.
nltk.download('wordnet')

# Lemmatize every token and glue the tokens of each message back into a
# single space-separated string.
lemmatizer = WordNetLemmatizer()
X_messages = X_messages_tokens.apply(
    lambda tokens: ' '.join(map(lemmatizer.lemmatize, tokens))
)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [15]:
# Inspect the cleaned messages: lower-noise, space-joined lemma strings
# (pandas abbreviates the display of the long Series).
X_messages

0                      hi want change address credit card
1        need fruit maple oatmeal cold brew frozen coffee
2                  wish travel next month domestic airway
3                              need reimbursement expense
4                                 need copy insurance car
                               ...                       
11618        boarding pas sent email address phone number
11619                                  need ticker bombay
11620                              want musical equipment
11621                                    need window seat
11622                       please money transfer account
Name: message, Length: 11623, dtype: object

In [None]:
# Preprocessing functions
def remove_unwanted_chars(df):
    """Return ``df`` with ``unidecode.unidecode`` applied to each element.

    Despite the parameter name, ``df`` is only used through ``.apply``;
    presumably it is a pandas Series of message strings, mirroring the inline
    cleaning cells -- verify against the caller.
    """
    return df.apply(unidecode.unidecode)

def remove_numbers(df):
    """Return ``df`` with every run of digits deleted from each element.

    Digits are removed without a replacement, so whitespace that surrounded a
    number is kept unchanged. Each element must be a string.
    """
    digit_run = re.compile(r'\d+')
    return df.apply(lambda message: digit_run.sub('', message))

def tokenize_message(df):
    """Return ``df`` with each element replaced by ``word_tokenize(element)``.

    The result therefore holds one list of tokens per input element.
    """
    return df.apply(word_tokenize)

def remove_stopwords(df):
    """Return ``df`` with English stop words filtered out of each token list.

    Each element of ``df`` is iterated as a sequence of tokens; tokens found in
    NLTK's English stop-word list are dropped and the rest are returned as a
    list, preserving order. Matching is exact (case-sensitive), as before.
    """
    # Load the stop-word list once per call and use a set for O(1) lookups;
    # the previous version called stopwords.words('english') for every token.
    english_stopwords = frozenset(stopwords.words('english'))
    return df.apply(
        lambda tokens: [word for word in tokens if word not in english_stopwords]
    )

def lemmatize_message(df):
    """Return ``df`` with each token list lemmatized and joined into a string.

    Every token is passed through the notebook-level ``lemmatizer`` and the
    results are joined with single spaces.
    """
    return df.apply(lambda tokens: ' '.join(map(lemmatizer.lemmatize, tokens)))

In [None]:
# Make FunctionTransformers for custom preprocessing functions
from sklearn.preprocessing import FunctionTransformer

# Wrap each cleaning function in a FunctionTransformer so the functions can be
# used as steps of a scikit-learn Pipeline.
remove_unwanted_transformer = FunctionTransformer(func=remove_unwanted_chars)
remove_numbers_transformer = FunctionTransformer(func=remove_numbers)
tokenize_transformer = FunctionTransformer(func=tokenize_message)
remove_stopwords_transformer = FunctionTransformer(func=remove_stopwords)
lemmatize_transformer = FunctionTransformer(func=lemmatize_message)

In [None]:
# Chain the cleaning transformers in the same order as the step-by-step cells:
# unidecode -> strip digits -> tokenize -> drop stop words -> lemmatize/join.
preprocessing_steps = [
    ('removeunwanted', remove_unwanted_transformer),
    ('removenumbers', remove_numbers_transformer),
    ('tokenize', tokenize_transformer),
    ('removestopwords', remove_stopwords_transformer),
    ('lemmatize', lemmatize_transformer),
]
preprocessing_pipe = Pipeline(steps=preprocessing_steps)

In [16]:
from sklearn.preprocessing import LabelEncoder

# Encode the label strings as integer class ids. The fitted encoder is kept so
# that predicted ids can be mapped back to label strings later.
labelEncoder = LabelEncoder()
training_labels = df_training['label']
encoded_labels = labelEncoder.fit_transform(training_labels)

In [19]:
# Assemble the two-column frame passed to train_model/eval_model later on:
# cleaned message text in 'text', integer class id in 'labels'.
text_column = pd.Series(X_messages, name='text')
label_column = pd.Series(encoded_labels, name='labels')
X = pd.concat([text_column, label_column], axis='columns')
X.head()

Unnamed: 0,text,labels
0,hi want change address credit card,38
1,need fruit maple oatmeal cold brew frozen coffee,22
2,wish travel next month domestic airway,0
3,need reimbursement expense,11
4,need copy insurance car,15


In [20]:
# Number of distinct classes present in the data; used as the model's
# num_labels.
n_labels = X['labels'].nunique()

In [21]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled rows for validation; the fixed random_state
# keeps the split the same across runs.
X_train, X_val = train_test_split(
    X,
    test_size=0.2,
    random_state=42,
)

## Fine-tune LLM with preprocessed training data

In [22]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

# Options handed to simpletransformers as a plain dict: train for 5 epochs,
# re-process the input data, and allow overwriting the output directory.
training_options = {
    'num_train_epochs': 5,
    'reprocess_input_data': True,
    'overwrite_output_dir': True,
}

# RoBERTa-base sequence classifier with one output per label; requires a GPU
# because use_cuda is forced on.
model = ClassificationModel(
    'roberta',
    'roberta-base',
    num_labels=n_labels,
    args=training_options,
    use_cuda=True,
)

Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.layer_norm.bias', 'roberta.pooler.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.out_proj.weight', 'classi

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [23]:
# Fine-tune on the training split (a frame with 'text' and 'labels' columns).
model.train_model(train_df=X_train)

  0%|          | 0/9298 [00:00<?, ?it/s]



Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 0 of 5:   0%|          | 0/1163 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/1163 [00:00<?, ?it/s]

Running Epoch 2 of 5:   0%|          | 0/1163 [00:00<?, ?it/s]

Running Epoch 3 of 5:   0%|          | 0/1163 [00:00<?, ?it/s]

Running Epoch 4 of 5:   0%|          | 0/1163 [00:00<?, ?it/s]

(5815, 0.2732951785917438)

In [24]:
from sklearn.metrics.cluster import adjusted_mutual_info_score

# Evaluate on the validation split. The extra keyword metric is reported in
# the result dict under its keyword name ('ami') next to 'mcc' and 'eval_loss'.
evaluation = model.eval_model(eval_df=X_val, ami=adjusted_mutual_info_score)
result, model_outputs, wrong_predictions = evaluation

  0%|          | 0/2325 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/291 [00:00<?, ?it/s]

In [25]:
# Show the validation metrics dict returned by eval_model.
result

{'mcc': 0.9606893391629572,
 'ami': 0.9450955302652355,
 'eval_loss': 0.2582877282936549}

## Make predictions on the test set

In [26]:
# Clean the test messages with the same preprocessing chain that produced the
# training text (unidecode, digit removal, tokenisation, stop-word removal,
# lemmatisation). The model was fine-tuned on cleaned text, so predicting on
# the raw messages would feed it inputs unlike anything it was trained on.
X_test = preprocessing_pipe.transform(df_test['message']).rename('text')

# predict() takes a plain list of strings and returns the predicted class ids
# together with the raw model outputs.
predictions, raw_outputs = model.predict(X_test.values.tolist())

  0%|          | 0/2906 [00:00<?, ?it/s]

  0%|          | 0/364 [00:00<?, ?it/s]

In [27]:
# Map predicted integer class ids back to the original label strings.
transformed_predictions = labelEncoder.inverse_transform(y=predictions)

## Write predictions to zip for submission

In [28]:
# Pair every test id with its predicted label string, then print a preview.
submission_columns = {
    'Id': df_test['id'],
    'label': transformed_predictions,
}
my_submission = pd.DataFrame(data=submission_columns)
print(my_submission.head())

      Id                 label
0  12123  reportbrokensoftware
1    244      orderpizzaintent
2   8221           replacecard
3  12856    startserviceintent
4  12108         expensereport


In [29]:
# Write the submission as a zip archive on Drive; the CSV inside the archive
# is named 'coda_submission.csv' and the row index is omitted.
compression_opts = {'method': 'zip', 'archive_name': 'coda_submission.csv'}
my_submission.to_csv(
    '/content/drive/MyDrive/Wysdom/preprocessed_robertabase5_submission.zip',
    index=False,
    compression=compression_opts,
)