In [33]:
import pandas as pd
import numpy as np

#text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
#from textblob import TextBlob

#model-building
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier


In [34]:
#nltk.download('punkt')
#nltk.download('averaged_perceptron_tagger')
#nltk.download('wordnet')

In [35]:
df = pd.read_csv('raw_commits.csv')
commits = df['Commit message']
ref_type = df['Class']

In [36]:
#lowercase, strip, remove punctuation
def preprocess(text):
    text = text.lower() 
    text= text.strip()  
    text= re.compile('<.*?>').sub('', text) 
    text = re.compile('[%s]' % re.escape(string.punctuation)).sub(' ', text)  
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 
    return text

In [37]:
#stopword removal
def stopword(string):
    a= [i for i in string.split() if i not in stopwords.words('english')]
    return ' '.join(a)

In [38]:
#final preprocessing
def finalpreprocess(string):
    return stopword(preprocess(string))
commits_pr = commits.apply(lambda x: finalpreprocess(x))

In [39]:
commits_pr = commits_pr.apply(lambda x: x.split()[:32])
commits_pr = pd.Series([' '.join(map(str, l)) for l in commits_pr])

In [40]:
commits_pr

0                                          extract method
1       minor tweaks following review extraction metho...
2       extract stuff method git p depot paths coremed...
3                          extract methods doiserviceimpl
4       refactoring getmenuspace navigation extract is...
                              ...                        
4999           rename getprotocol getmechanism testclient
5000    rename mapping methods mapfrom mapto signed lu...
5001    renaming refactor deserialization related code...
5002    renamed usage description match name used comm...
5003    renamed isoccupied point point hasroaduseron p...
Length: 5004, dtype: object

In [41]:
labeled_commits_preproc = pd.concat([commits_pr, ref_type], axis=1, join='inner')
labeled_commits_preproc.rename(columns={0: 'Commit message'}, inplace=True)
labeled_commits_preproc.head()
df = labeled_commits_preproc

In [42]:
encoded_dict = {'extract':0,'inline':1, 'move':2, 'pull up':3, 'push down':4, 'rename':5}
df['Class_Num'] = df.Class.map(encoded_dict)


In [43]:
from tensorflow.keras.utils import to_categorical

In [44]:
import transformers

In [45]:
X = df['Commit message']
y = df['Class_Num']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=99, stratify=y)

In [46]:
from tensorflow.keras.utils import to_categorical

y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

In [47]:
from transformers import AutoTokenizer,TFBertModel
tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
bert = TFBertModel.from_pretrained('bert-base-cased')

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [48]:
X_train = tokenizer(
    text=X_train.tolist(),
    add_special_tokens=True,
    max_length=32,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)
X_test = tokenizer(
    text=X_test.tolist(),
    add_special_tokens=True,
    max_length=32,
    truncation=True,
    padding=True, 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [49]:
X_train

{'input_ids': <tf.Tensor: shape=(4003, 32), dtype=int32, numpy=
array([[  101,  4275, 15430, ..., 14467, 12272,   102],
       [  101,  3137,  8313, ...,     0,     0,     0],
       [  101,   187,  2087, ...,     0,     0,     0],
       ...,
       [  101, 10625,  1231, ...,  1116,  1112,   102],
       [  101,   191,  2944, ...,     0,     0,     0],
       [  101,  8130,  4684, ...,     0,     0,     0]])>, 'attention_mask': <tf.Tensor: shape=(4003, 32), dtype=int32, numpy=
array([[1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 1, 1, 1],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])>}

In [50]:
input_ids = X_train['input_ids']
attention_mask = X_train['attention_mask']

In [51]:
X_test['input_ids']

<tf.Tensor: shape=(1001, 32), dtype=int32, numpy=
array([[  101,  1231,  8057, ...,     0,     0,     0],
       [  101,   179, 15677, ...,     0,     0,     0],
       [  101,  1815,  2373, ...,     0,     0,     0],
       ...,
       [  101,  4275,  9726, ...,  1179, 25021,   102],
       [  101,  1231, 16124, ...,     0,     0,     0],
       [  101,  1231,  8057, ...,     0,     0,     0]])>

In [52]:
import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

In [53]:
max_len = 32
input_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")
embeddings = bert(input_ids,attention_mask = input_mask)[0] 
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
y = Dense(6,activation = 'sigmoid')(out)
model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=y)
model.layers[2].trainable = True

In [54]:
optimizer = Adam(
    learning_rate=5e-5,
    epsilon=1e-08,
    clipnorm=1.0,
    weight_decay=0.01)
# Set loss and metrics
loss = CategoricalCrossentropy(from_logits = False)
metric = CategoricalAccuracy('balanced_accuracy'),
# Compile the model
model.compile(
    optimizer = optimizer,
    loss = loss, 
    metrics = metric)

In [55]:

train_history = model.fit(
    x ={'input_ids':X_train['input_ids'],'attention_mask':X_train['attention_mask']} ,
    y = y_train,
    validation_data = (
    {'input_ids':X_test['input_ids'],'attention_mask':X_test['attention_mask']}, y_test
    ),
  epochs=5,
    batch_size=24
)



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [56]:
predicted_raw = model.predict({'input_ids':X_test['input_ids'],'attention_mask':X_test['attention_mask']})
predicted_raw[0]



array([0.5435681 , 0.83902055, 0.8576314 , 0.7128423 , 0.65960366,
       0.22812688], dtype=float32)

In [57]:
df_test = pd.DataFrame(y_test, columns = [0,1,2,3,4,5])

In [58]:
import pandas as pd
import numpy as np

df_test['Class'] = np.where(df_test[0]==1.0, 0,
                           np.where(df_test[1]==1.0, 1,
                                   np.where(df_test[2]==1.0, 2,
                                           np.where(df_test[3]==1.0, 3,
                                                    np.where(df_test[4]==1.0, 4,5)))))

df_test_class = df_test['Class']
df_test_class = df_test_class.to_numpy()
df_test_class


array([1, 4, 4, ..., 0, 5, 3])

In [59]:
y_predicted = np.argmax(predicted_raw, axis = 1)
y_true = df_test_class

In [60]:
from sklearn.metrics import classification_report
print(classification_report(y_true, y_predicted))

              precision    recall  f1-score   support

           0       0.80      0.61      0.69       167
           1       0.42      0.50      0.45       167
           2       0.49      0.70      0.57       166
           3       0.49      0.42      0.45       167
           4       0.45      0.37      0.40       167
           5       0.94      0.88      0.91       167

    accuracy                           0.58      1001
   macro avg       0.60      0.58      0.58      1001
weighted avg       0.60      0.58      0.58      1001



In [None]:
#https://www.analyticsvidhya.com/blog/2021/12/multiclass-classification-using-transformers/

In [66]:
confusion_matrix(y_true,y_predicted)
pd.crosstab(y_true, y_predicted, rownames = ['Actual'], colnames =['Predicted'], margins = True)

Predicted,0,1,2,3,4,5,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,102,23,16,9,17,0,167
1,9,83,26,25,21,3,167
2,7,11,116,13,14,5,166
3,5,29,39,70,23,1,167
4,3,49,27,27,61,0,167
5,1,4,14,0,1,147,167
All,127,199,238,144,137,156,1001
