In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.models import Word2Vec
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression

In [3]:
# Read the preprocessed dataset, then split out the commit-message column and
# the class-label column for use by the cells below.
df = pd.read_csv('preprocessed.csv')
commits, ref_type = df['Commit message'], df['Class']

## scikit-learn Replication of Previous Study's Classification with Logistic Regression 

In [7]:
# Two-step scikit-learn pipeline:
#   'tfidf' - Tf-Idf features over unigrams and bigrams (ngram_range=(1, 2))
#   'clf'   - one-vs-rest Logistic Regression
# NOTE(review): newer scikit-learn releases deprecate the `multi_class`
# argument -- confirm the installed version still accepts it.
pipeline = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer(ngram_range=(1, 2))),
        ('clf', LogisticRegression(multi_class='ovr')),
    ]
)

In [8]:
# 10-fold stratified cross-validation of the pipeline.
# One classification report (as a DataFrame) is collected per fold in
# `report_list` so the folds can be averaged afterwards.
X, y = commits, ref_type

# Shuffling without a seed gives different folds on every run; fixing
# random_state makes the fold assignment (and the averaged scores) reproducible.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

report_list = list()
for train_index, test_index in skf.split(X, y):
    # split() yields positional indices, so select rows with .iloc; plain
    # X[train_index] is label-based and only works on a default RangeIndex.
    x_train_fold, x_test_fold = X.iloc[train_index], X.iloc[test_index]
    y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

    # fit() re-trains the whole pipeline from scratch on this fold's training part.
    pipeline.fit(x_train_fold, y_train_fold)
    y_pred = pipeline.predict(x_test_fold)

    # output_dict=True returns nested dicts; transpose so each row is a
    # class / aggregate and each column is a metric.
    report = classification_report(y_test_fold, y_pred, output_dict=True)
    report_list.append(pd.DataFrame(report).transpose())

In [9]:
# Average the per-fold classification reports element-wise.
# stack() flattens each report into a Series keyed by (row label, metric), so
# the folds can be summed with index alignment via Series.add.
stacked_reports = [fold_report.stack() for fold_report in report_list]
total = stacked_reports[0]
for stacked in stacked_reports[1:]:
    total = total.add(stacked)

# Divide by the number of collected folds instead of a hard-coded 10, so the
# average stays correct if n_splits is changed.
avgs = total / len(report_list)

# Display only the per-class rows: drop the aggregate rows by name rather than
# slicing a fixed number of rows, so the table does not depend on the class count.
avgs.unstack().drop(index=['accuracy', 'macro avg', 'weighted avg'], errors='ignore')

Unnamed: 0,precision,recall,f1-score,support
extract,0.641274,0.682143,0.659832,83.4
inline,0.488306,0.501119,0.493682,83.4
move,0.58719,0.708649,0.641557,83.4
pull up,0.430316,0.402912,0.414974,83.4
push down,0.478898,0.366767,0.415008,83.4
rename,0.929049,0.912507,0.92015,83.4


## Tf-Idf and Logistic Regression Classifier with main train/test split

In [20]:
# Load the fixed train and test CSV files used for the single-split evaluation.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('replication_preproc_train.csv', 'replication_preproc_test.csv')
)

In [21]:
# Pull the commit-message and class columns out of each split as plain Python lists.
X_train, y_train = (train_data[column].tolist() for column in ('Commit message', 'Class'))
X_test, y_test = (test_data[column].tolist() for column in ('Commit message', 'Class'))

In [22]:
# Re-fit the pipeline on the training split, predict the test split, and print
# the plain-text classification report. fit() returns the fitted pipeline, so
# the prediction can be chained onto it.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

     extract       0.64      0.73      0.68       167
      inline       0.48      0.50      0.49       167
        move       0.61      0.73      0.66       166
     pull up       0.50      0.46      0.48       167
   push down       0.50      0.36      0.42       167
      rename       0.94      0.92      0.93       167

    accuracy                           0.62      1001
   macro avg       0.61      0.62      0.61      1001
weighted avg       0.61      0.62      0.61      1001



In [23]:
# Confusion matrix as a labelled table (rows = actual class, columns =
# predicted class) with row/column totals in the 'All' margins.
# The earlier bare confusion_matrix(...) call was dropped: its result was
# neither stored nor displayed, because only the last expression of a cell is shown.
# y_test is a plain Python list here, and pd.crosstab interprets a list as a
# *list of key arrays*, so wrap both inputs in Series to pass them as single
# label arrays.
actual = pd.Series(y_test, name='Actual')
predicted = pd.Series(y_pred, name='Predicted')
pd.crosstab(actual, predicted, rownames=['Actual'], colnames=['Predicted'], margins=True)

Predicted,extract,inline,move,pull up,push down,rename,All
Actual,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
extract,122,12,4,18,11,0,167
inline,16,83,20,23,22,3,167
move,17,7,121,11,7,3,166
pull up,16,24,30,76,20,1,167
push down,17,43,20,24,60,3,167
rename,4,5,4,0,0,154,167
All,192,174,199,152,120,164,1001


In [28]:
# Put the predictions in a one-column frame and attach it to the test data
# column-wise; the concat aligns the two frames on their index.
pred = pd.DataFrame({'predicted': y_pred})
model_eval = pd.concat([test_data, pred], axis='columns')

In [29]:
# model_eval.to_csv('replication_evaluation_report.csv',index=False)