In [1]:
import fasttext
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

## Creating and prepping data files
Run the cells in this section if the fastText data files are not already in the working directory.

In [6]:

def _to_fasttext_label(raw_label):
    """Turn a raw class value into a fastText label token.

    Spaces are replaced with '-' and the '__label__' marker is prepended.
    """
    return '__label__' + str(raw_label).replace(' ', '-')


# Read the preprocessed train/test splits.
train_data = pd.read_csv('bert_ft_preproc_train.csv')
test_data = pd.read_csv('bert_ft_preproc_test.csv')

# Rewrite the second column of each frame in the fastText label format.
for frame in (train_data, test_data):
    frame.iloc[:, 1] = frame.iloc[:, 1].map(_to_fasttext_label)



In [7]:

def _write_fasttext_file(path, labels, texts):
    """Write one '<label> <text>' sample per line to `path`.

    Parameters
    ----------
    path : str
        Destination text file; overwritten if it exists.
    labels, texts : list of str
        Parallel lists; `labels[i]` is written in front of `texts[i]`.

    Returns
    -------
    list of str
        The lines that were written (each ends with a newline).
    """
    lines = [label + ' ' + text + '\n' for label, text in zip(labels, texts)]
    # fastText expects UTF-8 input, so do not rely on the platform default
    # encoding. The `with` block closes the file; no explicit close is needed.
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(lines)
    return lines


# saving train data to text file for model training and initial validation
train_texts = train_data['Commit message'].tolist()
train_labels = train_data['Class'].tolist()
prepped_train_data = _write_fasttext_file('fasttext_train_data.txt', train_labels, train_texts)

# saving test data
test_texts = test_data['Commit message'].tolist()
test_labels = test_data['Class'].tolist()
prepped_test_data = _write_fasttext_file('fasttext_test_data.txt', test_labels, test_texts)


## Training the model
The `train_supervised` method trains on 'fasttext_train_data.txt', with hyperparameters autotuned against the test data ('fasttext_test_data.txt'). The `dim` parameter is set to 100, as hardware limitations appear to prevent anything higher. The loss is set to 'ova' (one-versus-all), as this is more suited to multi-class problems like this one. Finally, we use the pretrained vectors obtained from the corpus creation to help improve the model's accuracy.

In [2]:
# Arguments for the supervised fastText run, collected in one place so they
# are easy to scan and tweak.
train_args = {
    'input': 'fasttext_train_data.txt',
    'dim': 100,
    'wordNgrams': 2,
    'loss': 'ova',
    # NOTE(review): autotune validates against the same file that is later
    # passed to model.test(), so the reported test metrics may be optimistic.
    'autotuneValidationFile': 'fasttext_test_data.txt',
    'pretrainedVectors': 'fasttext_embeds.vec',
    'thread': 6,
}
model = fasttext.train_supervised(**train_args)

Progress: 100.0% Trials:   57 Best score:  0.621379 ETA:   0h 0m 0s
Training again with best arguments
Read 0M words
Number of words:  12345
Number of labels: 6
Progress: 100.0% words/sec/thread:   80753 lr:  0.000000 avg.loss:  2.190625 ETA:   0h 0m 0s


In [3]:
# NOTE(review): saving is disabled, and the path below is absolute and machine-specific.
# It also differs from 'fasttext_model.bin', the file the validation section loads.
#model.save_model('/media/sf_VM-ubuntu-shared/fasttext_model.bin')

## Validation

In [7]:
# Load a saved model from the working directory. This rebinds `model`, replacing the
# object returned by train_supervised earlier in the notebook.
# NOTE(review): the only save_model call visible in this notebook is commented out and
# targets a different path — confirm where 'fasttext_model.bin' comes from.
model = fasttext.load_model('fasttext_model.bin')



In [4]:
# Evaluate on the test file. The recorded output is a 3-tuple; per the fastText docs
# this should be (number of samples, precision@1, recall@1) — TODO confirm.
model.test('fasttext_test_data.txt')

(1001, 0.6123876123876124, 0.6123876123876124)

In [25]:
def _top_label(message):
    """Return the first (top-ranked) label fastText predicts for one message."""
    labels, _scores = model.predict(message)
    return labels[0]


# Column 1 supplies the reference labels; column 0 is the text fed to the model.
y_true = test_data.iloc[:, 1]
y_pred = test_data.iloc[:, 0].map(_top_label)

In [26]:
# Cross-tabulate actual vs. predicted labels, with row/column totals in the margins.
# The earlier bare confusion_matrix(y_true, y_pred) call was dropped: it was not the
# last expression of the cell, so its result was computed and then discarded.
pd.crosstab(y_true, y_pred, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,__label__extract,__label__inline,__label__move,__label__pull-up,__label__push-down,__label__rename,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
__label__extract,131,8,8,11,9,0,167
__label__inline,12,68,25,35,23,4,167
__label__move,12,3,120,20,8,3,166
__label__pull-up,16,20,31,68,30,2,167
__label__push-down,12,32,24,33,64,2,167
__label__rename,1,0,4,0,0,162,167
All,184,131,212,167,134,173,1001


In [27]:
# Per-class precision / recall / F1 of the predictions against the reference labels.
# classification_report is imported in the notebook's top import cell.
print(classification_report(y_true, y_pred))

                    precision    recall  f1-score   support

  __label__extract       0.71      0.78      0.75       167
   __label__inline       0.52      0.41      0.46       167
     __label__move       0.57      0.72      0.63       166
  __label__pull-up       0.41      0.41      0.41       167
__label__push-down       0.48      0.38      0.43       167
   __label__rename       0.94      0.97      0.95       167

          accuracy                           0.61      1001
         macro avg       0.60      0.61      0.60      1001
      weighted avg       0.60      0.61      0.60      1001



In [18]:
# Display the test frame; its 'Class' column carries the '__label__' prefix that was
# added when the data was loaded.
test_data

Unnamed: 0,Commit message,Class
0,refactoring xobjectpagetypeprovider remove dep...,__label__inline
1,java typeres fix pr comments,__label__push-down
2,move readonly command to clustercommands moved...,__label__push-down
3,score constructor cleanup,__label__inline
4,sgf modify the gemfiredatasourcepostprocessor ...,__label__extract
...,...,...
996,remove subclasses of vertxhandler and only kee...,__label__pull-up
997,splitting record scanner rest tests up into di...,__label__extract
998,fixed layout in gmobjectframe and timelinefram...,__label__extract
999,rename metrictype to metricattribute this is a...,__label__rename


In [33]:
# Display the predicted labels (one per test row).
y_pred

0         __label__pull-up
1       __label__push-down
2            __label__move
3         __label__pull-up
4       __label__push-down
               ...        
996     __label__push-down
997        __label__inline
998       __label__extract
999        __label__rename
1000         __label__move
Name: Commit message, Length: 1001, dtype: object

In [36]:
# Wrap the predictions in a one-column frame and relabel that column as
# 'predicted', returning a new frame rather than renaming in place.
pred = pd.DataFrame(y_pred).rename(columns={'Commit message': 'predicted'})

In [38]:
# Place the predictions next to the test rows, aligned on the shared index.
val = pd.concat(objs=[test_data, pred], axis='columns')

In [39]:
# Strip the fastText '__label__' marker so the report shows plain class names.
# An anchored pattern replaces the previous "drop the first 9 characters" slice:
# it states what is being removed and leaves already-stripped values untouched,
# so re-running this cell no longer eats into the class names.
label_prefix_pattern = r'^__label__'
for column in ('Class', 'predicted'):
    val[column] = val[column].str.replace(label_prefix_pattern, '', regex=True)

In [40]:
# Preview the first rows of the actual-vs-predicted comparison.
val.head()

Unnamed: 0,Commit message,Class,predicted
0,refactoring xobjectpagetypeprovider remove dep...,inline,pull-up
1,java typeres fix pr comments,push-down,push-down
2,move readonly command to clustercommands moved...,push-down,move
3,score constructor cleanup,inline,pull-up
4,sgf modify the gemfiredatasourcepostprocessor ...,extract,push-down


In [41]:
# Persist the per-sample comparison; index=False leaves the row index out of the CSV.
val.to_csv('fasttext_validation_report.csv', index=False)