In [7]:
import pandas as pd
import numpy as np

import tensorflow as tf
import transformers
from transformers import AutoTokenizer,TFBertModel
from keras_tuner.engine.hyperparameters import HyperParameters
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.utils import to_categorical

from tensorflow.keras.optimizers import AdamW
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy

from sklearn.metrics import f1_score, accuracy_score, confusion_matrix

## Importing and preparing data

In [8]:
train_data = pd.read_csv('bert_ft_preproc_train.csv')
train_data.rename(columns={'Class':'type','Commit message':'text'}, inplace=True)
train_data['text'] = train_data['text'].astype(pd.StringDtype())
test_data = pd.read_csv('bert_ft_preproc_test.csv')
test_data.rename(columns={'Class':'type','Commit message':'text'},inplace=True)
test_data['text'] = test_data['text'].astype(pd.StringDtype())
label_dict = {'extract':0,'inline':1, 'move':2, 'pull up':3, 'push down':4, 'rename':5}

## Converting labels to categorical data

In [9]:
test_data['label'] = test_data.type.map(label_dict)
train_data['label'] = train_data.type.map(label_dict)

In [10]:
train_data

Unnamed: 0,text,type,label
0,fixed bug where rulesets were not being disabl...,inline,1
1,a few minor improvements to probes,push down,4
2,rf remove unused formal parameter,inline,1
3,sgf provide support in the sdg xml namespace t...,pull up,3
4,removed usage of the deprecated method collect...,move,2
...,...,...,...
3998,completely overhaul and clean up basic archite...,push down,4
3999,rename method to match any java style guide,rename,5
4000,wagon refactor ssh tests in prep to get some s...,push down,4
4001,vishal refactoring remove redundant db calls t...,inline,1


In [11]:
y_train = to_categorical(train_data['label'])
y_test = to_categorical(test_data['label'])

## Load model and tokenizer

In [12]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert = TFBertModel.from_pretrained('bert-base-uncased')

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


## Preprocess the data into a format BERT can use

In [13]:
x_train = tokenizer(
    text=train_data['text'].tolist(),
    add_special_tokens=True,
    max_length=200,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)
x_test = tokenizer(
    text=test_data['text'].tolist(),
    add_special_tokens=True,
    max_length=200,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

## Building the model

In [14]:
max_len = 200
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
embeddings = bert(input_ids,attention_mask = input_mask)[0] 
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
y = Dense(6,activation = 'sigmoid')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

## Compiling the model

In [15]:
optimizer = AdamW(
    learning_rate=5e-05, # this learning rate is for bert model , taken from huggingface website 
    epsilon=1e-08,
    clipnorm=1.0)
# Set loss and metrics
loss =CategoricalCrossentropy()
metric = CategoricalAccuracy('balanced_accuracy'),
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

## Training the model

In [17]:
train_history = model.fit(
    x ={'input_ids':x_train['input_ids'],'attention_mask':x_train['attention_mask']} ,
    y = y_train,
    validation_data = (
    {'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']}, y_test
    ),
  epochs=5,
    batch_size=32
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Model evaluation

In [18]:
predicted_raw = model.predict({'input_ids':x_test['input_ids'],'attention_mask':x_test['attention_mask']})
predicted_raw[0]



array([0.3743481 , 0.8243222 , 0.5668551 , 0.9176102 , 0.7274624 ,
       0.24616697], dtype=float32)

In [19]:
y_predicted = np.argmax(predicted_raw, axis = 1)
y_true = test_data.label

In [20]:
test_data.label

0       1
1       4
2       4
3       1
4       0
       ..
996     3
997     0
998     0
999     5
1000    3
Name: label, Length: 1001, dtype: int64

In [21]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_predicted))

              precision    recall  f1-score   support

           0       0.80      0.59      0.68       167
           1       0.46      0.26      0.34       167
           2       0.70      0.75      0.72       166
           3       0.38      0.53      0.44       167
           4       0.38      0.47      0.42       167
           5       0.96      0.95      0.95       167

    accuracy                           0.59      1001
   macro avg       0.61      0.59      0.59      1001
weighted avg       0.61      0.59      0.59      1001



In [22]:
confusion_matrix(y_true,y_predicted)
pd.crosstab(y_true, y_predicted, rownames = ['Actual'], colnames =['Predicted'], margins = True)

Predicted,0,1,2,3,4,5,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,99,10,4,26,28,0,167
1,5,44,12,51,52,3,167
2,9,2,124,26,4,1,166
3,6,10,19,88,42,2,167
4,5,29,11,42,79,1,167
5,0,0,7,1,1,158,167
All,124,95,177,234,206,165,1001


In [23]:
actual = pd.DataFrame(y_true)
pred = pd.DataFrame(y_predicted)

In [24]:
pred.rename(columns={0:'pred_label'}, inplace=True)

In [25]:
text = pd.DataFrame(test_data['text'])

In [26]:
bert_eval = pd.concat([text,actual,pred], axis=1)

In [27]:
num_to_label = {0:'extract',1:'inline', 2:'move', 3:'pull-up', 4:'push-down', 5:'rename'}

In [28]:
bert_eval['pred_label'] = bert_eval.pred_label.map(num_to_label)
bert_eval['label'] = bert_eval.label.map(num_to_label)

In [29]:
bert_eval.to_csv('bert_validation_report.csv', index=False)

In [30]:
# model.save('bert_model.keras')