## Importing data

In [1]:
import time
start_time = time.time()

In [2]:
import pandas as pd

In [3]:
refactoring = pd.read_csv('refactoring.csv',sep='|')

In [4]:
satd = pd.read_excel('satd_v1.xlsx')

In [5]:
dataFrame = satd.merge(refactoring, how='inner',left_on=['Commit_id'],right_on=['Commit'])

In [6]:
dataFrame.head()

Unnamed: 0,SATD_id,Commit_id,Project_id,Satd_type,v1_comment,v2_comment,Refactoring_id,Commit,Type
0,1751,'696b51abc1e5f38b1c636710af552cafcc2e14fc',11,'SATD_CHANGED','TODO: Load analysis data\nFIXME\nresults\n.cs...,'TODO: Load analysis data,1245,'696b51abc1e5f38b1c636710af552cafcc2e14fc','Rename Attribute
1,1751,'696b51abc1e5f38b1c636710af552cafcc2e14fc',11,'SATD_CHANGED','TODO: Load analysis data\nFIXME\nresults\n.cs...,'TODO: Load analysis data,1246,'696b51abc1e5f38b1c636710af552cafcc2e14fc','Rename Method
2,1752,'696b51abc1e5f38b1c636710af552cafcc2e14fc',11,'SATD_REMOVED','TODO: Load analysis data\nFIXME\nresults\n.cs...,'results\n.csv\nfrom\na\nconstant,1245,'696b51abc1e5f38b1c636710af552cafcc2e14fc','Rename Attribute
3,1752,'696b51abc1e5f38b1c636710af552cafcc2e14fc',11,'SATD_REMOVED','TODO: Load analysis data\nFIXME\nresults\n.cs...,'results\n.csv\nfrom\na\nconstant,1246,'696b51abc1e5f38b1c636710af552cafcc2e14fc','Rename Method
4,1753,'696b51abc1e5f38b1c636710af552cafcc2e14fc',11,'SATD_REMOVED','FIXME: use analysis name','None,1245,'696b51abc1e5f38b1c636710af552cafcc2e14fc','Rename Attribute


## Data Preprocessing

In [7]:
refactoringTypes = dataFrame['Type'].unique()

### Refactoring types

In [8]:
refactoringTypes

array(["'Rename Attribute", "'Rename Method", "'Change Variable Type",
       "'Rename Variable", "'Change Attribute Type",
       "'Change Parameter Type", "'Change Return Type",
       "'Rename Parameter", "'Rename Class", "'Move Class"], dtype=object)

In [9]:
dataFrameCopy = dataFrame.copy(deep=True)

### Combining refactoring types

In [10]:
dataFrame.loc[dataFrame["Type"] == "'Rename Attribute", "Type"] = 'Rename'

dataFrame.loc[dataFrame["Type"] == "'Rename Method", "Type"] = 'Rename'

dataFrame.loc[dataFrame["Type"] == "'Rename Variable", "Type"] = 'Rename'

dataFrame.loc[dataFrame["Type"] == "'Rename Parameter", "Type"] = 'Rename'

dataFrame.loc[dataFrame["Type"] == "'Rename Class", "Type"] = 'Rename'

dataFrame.loc[dataFrame["Type"] == "'Change Variable Type", "Type"] = 'Change'

dataFrame.loc[dataFrame["Type"] == "'Change Attribute Type", "Type"] = 'Change'

dataFrame.loc[dataFrame["Type"] == "'Change Parameter Type", "Type"] = 'Change'
              
dataFrame.loc[dataFrame["Type"] == "'Change Return Type", "Type"] = 'Change'

dataFrame.loc[dataFrame["Type"] == "'Move Class", "Type"] = 'Move'



### Grouping data by SATD_id, v1_comment and refactoring type

In [11]:
dataFrame = dataFrame.groupby(["SATD_id", "v1_comment","Type"]).sum().reset_index()

In [12]:
dataFrame.head()

Unnamed: 0,SATD_id,v1_comment,Type,Project_id,Refactoring_id
0,1751,'TODO: Load analysis data\nFIXME\nresults\n.cs...,Rename,22,2491
1,1752,'TODO: Load analysis data\nFIXME\nresults\n.cs...,Rename,22,2491
2,1753,'FIXME: use analysis name',Rename,22,2491
3,1760,'FIXME:',Change,11,2187
4,1760,'FIXME:',Rename,33,6567


In [13]:
# here to drop duplicate values of text column
dataFrame.dropna(subset=["v1_comment"], inplace=True)

### Onehot encoding the refactoring type

In [14]:
onehotencoded_refactoring_type = pd.get_dummies(dataFrame["Type"])

In [15]:
dataFrame = pd.concat([dataFrame, onehotencoded_refactoring_type], axis=1)
dataFrame.head()

Unnamed: 0,SATD_id,v1_comment,Type,Project_id,Refactoring_id,Change,Move,Rename
0,1751,'TODO: Load analysis data\nFIXME\nresults\n.cs...,Rename,22,2491,0,0,1
1,1752,'TODO: Load analysis data\nFIXME\nresults\n.cs...,Rename,22,2491,0,0,1
2,1753,'FIXME: use analysis name',Rename,22,2491,0,0,1
3,1760,'FIXME:',Change,11,2187,1,0,0
4,1760,'FIXME:',Rename,33,6567,0,0,1


In [16]:
dataFrame.dropna(subset=["v1_comment"], inplace=True)
dataFrame.head()

Unnamed: 0,SATD_id,v1_comment,Type,Project_id,Refactoring_id,Change,Move,Rename
0,1751,'TODO: Load analysis data\nFIXME\nresults\n.cs...,Rename,22,2491,0,0,1
1,1752,'TODO: Load analysis data\nFIXME\nresults\n.cs...,Rename,22,2491,0,0,1
2,1753,'FIXME: use analysis name',Rename,22,2491,0,0,1
3,1760,'FIXME:',Change,11,2187,1,0,0
4,1760,'FIXME:',Rename,33,6567,0,0,1


In [17]:
dataFrame = dataFrame.drop(columns=["SATD_id","Type","Project_id","Refactoring_id"]).reset_index()
dataFrame.head()

Unnamed: 0,index,v1_comment,Change,Move,Rename
0,0,'TODO: Load analysis data\nFIXME\nresults\n.cs...,0,0,1
1,1,'TODO: Load analysis data\nFIXME\nresults\n.cs...,0,0,1
2,2,'FIXME: use analysis name',0,0,1
3,3,'FIXME:',1,0,0
4,4,'FIXME:',0,0,1


### Vectorizing the v1_comments using CountVectorizer

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

# apply Vectorization CountVectorizer function
vectorzier = CountVectorizer(min_df=0, lowercase=False)
vectorzier.fit(dataFrame['v1_comment'])
vectorOfFeatures = vectorzier.transform(dataFrame['v1_comment'])

### Removing stopwords, punctuations, lemmatizing, other preprocessing on v1_comments

In [19]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

nltk.download('stopwords')
nltk.download('wordnet')

def preprocess(x):
    try:
        stop_words = stopwords.words('english')
        lemmatizer = WordNetLemmatizer()
        x = x.replace(r'\n'," ")
        x = x.replace(':'," ")
        x = x.replace('\\'," ")
        x = x.replace('/'," ")
        x = x.replace('.'," ")
        x = x.replace('_'," ")
        x = x.replace('='," ")
        x = x.lower()
        x = x.translate(str.maketrans('', '', string.punctuation))
        x = x.split()
        x = [word for word in x if word not in stop_words]
        x = [lemmatizer.lemmatize(word) for word in x]
        x = str(x).replace(',', ' ').replace("'", "")[1:-1]
        return x
    except Exception as e:
        return 'empty'

dataFrame['v1_comment'] = dataFrame['v1_comment'].apply(preprocess)
dataFrame = pd.concat([pd.DataFrame(vectorOfFeatures.toarray()), dataFrame], axis=1)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\msgam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\msgam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
dataFrame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1891,1892,1893,1894,1895,index,v1_comment,Change,Move,Rename
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,todo load analysis data fixme result csv...,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,todo load analysis data fixme result csv...,0,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,fixme use analysis name,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,3,fixme,1,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,fixme,0,0,1


## Splitting the dataset into test and train

In [21]:
from sklearn.model_selection import train_test_split

columns = list(dataFrame.columns)
X_train, X_test, y_train, y_test = train_test_split(dataFrame[columns[:-3]], dataFrame[columns[-3:]], test_size=0.30,
                                                    random_state=42)
X_train = X_train.drop(columns=['v1_comment'])
X_test = X_test.drop(columns=['v1_comment'])

categories = list(y_train.columns)
for x in categories:
    y_train.loc[y_train[x] > 0, x] = 1
    y_test.loc[y_test[x] > 0, x] = 1


## Training the model

### Random Forest

In [22]:
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

In [23]:
approach = "Combined refactoring Types"
vectorizer = "Count Vectorizer"

In [24]:
columns = list(dataFrame.columns)
target_classes =columns[-3:]

In [25]:
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

classifierChain = ClassifierChain(RandomForestClassifier(class_weight='balanced'))
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)

print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))
modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Random Forest"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = pd.DataFrame(modeleval,index=[0])

print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
              precision    recall  f1-score   support

      Change       1.00      1.00      1.00       354
        Move       1.00      1.00      1.00       101
      Rename       1.00      1.00      1.00       442

   micro avg       1.00      1.00      1.00       897
   macro avg       1.00      1.00      1.00       897
weighted avg       1.00      1.00      1.00       897
 samples avg       1.00      1.00      1.00       897

Done with the training 
****************************************
test data results
              precision    recall  f1-score   support

      Change       0.10      0.13      0.11       125
        Move       0.00      0.00      0.00        45
      Rename       0.36      0.29      0.32       215

   micro avg       0.20      0.20      0.20       385
   macro avg       0.15      0.14      0.14       385
weighted avg       0.23      0.20      0.21       385
 samples avg       0.20      0.20      0.20       385

0.2025974025974026
***********

In [26]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882


### Logistic Regression

In [27]:
from sklearn.linear_model import LogisticRegression

classifierChain = ClassifierChain(LogisticRegression())
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

print(accuracy_score(y_pred,y_test))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Logistic Regression"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)

print("*"*40)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


training results
              precision    recall  f1-score   support

      Change       0.43      0.68      0.53       223
        Move       0.29      0.69      0.41        42
      Rename       0.84      0.59      0.69       632

   micro avg       0.62      0.62      0.62       897
   macro avg       0.52      0.65      0.54       897
weighted avg       0.71      0.62      0.64       897
 samples avg       0.62      0.62      0.62       897

Done with the training 
****************************************
test data results
              precision    recall  f1-score   support

      Change       0.16      0.24      0.20       112
        Move       0.04      0.11      0.06        18
      Rename       0.53      0.36      0.43       255

   micro avg       0.31      0.31      0.31       385
   macro avg       0.24      0.24      0.23       385
weighted avg       0.40      0.31      0.34       385
 samples avg       0.31      0.31      0.31       385

0.3116883116883117
***********

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [28]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016


## Approach : Not combining classes and using fastText vectorizer

### Using the backup dataframe

In [29]:
dataFrame = dataFrameCopy.copy(deep=True)

### Grouping data by SATD_id, v1_comment and refactoring type

In [30]:
dataFrame.dropna(subset=["v1_comment"], inplace=True)

In [31]:
dataFrame = dataFrame.groupby(["SATD_id", "v1_comment","Type"]).sum().reset_index()

In [32]:
onehotencoded_refactoring_type = pd.get_dummies(dataFrame["Type"])

In [33]:
dataFrame = pd.concat([dataFrame, onehotencoded_refactoring_type], axis=1)

In [34]:
dataFrame = dataFrame.groupby(["SATD_id", "v1_comment"]).sum().reset_index()

In [35]:
dataFrame = dataFrame.drop_duplicates(subset=['SATD_id'])

In [36]:
dataFrame.dropna(subset=["v1_comment"], inplace=True)

In [37]:
dataFrame = dataFrame.drop(columns=["SATD_id"]).reset_index()

In [38]:
dataFrame = dataFrame.drop(columns=["index"])

In [39]:
dataFrame.head()

Unnamed: 0,v1_comment,Project_id,Refactoring_id,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,'TODO: Load analysis data\nFIXME\nresults\n.cs...,22,2491,0,0,0,0,0,1,0,1,0,0
1,'TODO: Load analysis data\nFIXME\nresults\n.cs...,22,2491,0,0,0,0,0,1,0,1,0,0
2,'FIXME: use analysis name',22,2491,0,0,0,0,0,1,0,1,0,0
3,'FIXME:',44,8754,0,0,0,1,0,1,0,1,0,1
4,'TODO: sort func must be independent of param ...,44,8754,0,0,0,1,0,1,0,1,0,1


### Removing stopwords, punctuations, lemmatizing, other preprocessing on v1_comments

In [40]:
dataFrame['v1_comment'] = dataFrame['v1_comment'].apply(preprocess)

In [41]:
# here to drop duplicate values of text column
dataFrame.dropna(subset=["v1_comment"], inplace=True)

### Vectorizing the v1_comments using CountVectorizer

In [42]:
# apply Vectorization CountVectorizer function
vectorzier = CountVectorizer(min_df=0, lowercase=False)
vectorzier.fit(dataFrame['v1_comment'])
vectorOfFeatures = vectorzier.transform(dataFrame['v1_comment'])

In [43]:
dataFrame.head()

Unnamed: 0,v1_comment,Project_id,Refactoring_id,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,todo load analysis data fixme result csv...,22,2491,0,0,0,0,0,1,0,1,0,0
1,todo load analysis data fixme result csv...,22,2491,0,0,0,0,0,1,0,1,0,0
2,fixme use analysis name,22,2491,0,0,0,0,0,1,0,1,0,0
3,fixme,44,8754,0,0,0,1,0,1,0,1,0,1
4,todo sort func must independent param index,44,8754,0,0,0,1,0,1,0,1,0,1


### Creating corpus for fastText

In [44]:
dataFrame['v1_comment'].tolist()

['todo  load  analysis  data  fixme  result  csv  constant',
 'todo  load  analysis  data  fixme  result  csv  constant',
 'fixme  use  analysis  name',
 'fixme',
 'todo  sort  func  must  independent  param  index',
 'fixme  review',
 'todo  order  sum  tag',
 'fixme  really  needed',
 'fixme  really  needed',
 'fixme  ask  christian  maybe  go  figure  fixme  lo  mejor  con  conservar  un  decorador  especifico  para  la  data  matrix  ya  e  suficiente  este  no  puede  proporcionar  el  tipo  de  dato',
 'workaround  bug  swing  http  bug  sun  com  view  bug  dobug  id  4473075',
 'workaround  bug  swing  http  bug  sun  com  view  bug  dobug  id  4473075',
 'fixme  improve  efficency  dont  use  map  set',
 'fixme  de  donde  puedo  sacar  el  resource  para  guardar  la  matriz',
 'fixme  el  baseresource  e  necesario  siempre  hay  porque  arrastralo',
 'fixme',
 'fixme',
 'fixme  could  transient',
 'fixme  de  donde  puedo  sacar  el  resource  para  guardar  la  matriz',
 '

#### Writing all comments to file, it will be used to train fastText vectorizer

In [45]:
with open('data.txt','w') as f:
    f.write(" ".join(dataFrame['v1_comment'].tolist()))

In [46]:
import fasttext
model = fasttext.train_unsupervised("data.txt")

### Converting word vectors to bag of words representation by averaging all words in sentence

In [47]:
import numpy as np
def make_feature_vec(words, model, num_features):
    """
    Average the word vectors for a set of words
    """
    feature_vec = np.zeros((num_features,),dtype="float32")  # pre-initialize (for speed)
    nwords = 0
    index2word_set = set(model.words)  # words known to the model

    for word in words.split("  "):
        if word in index2word_set: 
            nwords = nwords + 1
            feature_vec = np.add(feature_vec,model[word])
    if nwords == 0:
        nwords = 1
    
    feature_vec = np.divide(feature_vec, nwords)
    return feature_vec


def get_avg_feature_vecs(reviews, model, num_features):
    """
    Calculate average feature vectors for all reviews
    """
    counter = 0
    review_feature_vecs = np.zeros((len(reviews),num_features), dtype='float32')  # pre-initialize (for speed)
    
    for review in reviews:
        review_feature_vecs[counter] = make_feature_vec(review, model, num_features)
        counter = counter + 1
    return review_feature_vecs

In [48]:
temp = []
len(dataFrame['v1_comment'])
temp.append(get_avg_feature_vecs(dataFrame['v1_comment'],model,100))

In [49]:
df = pd.concat([pd.DataFrame(np.concatenate(temp)), dataFrame], axis=1)

In [50]:
dataFrame = df
dataFrame = dataFrame.drop(columns=["Project_id","Refactoring_id"])
dataFrame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,0.035253,-0.221339,-0.098625,-0.236461,0.263577,0.066604,0.16775,0.066086,0.165482,0.104146,...,0,0,0,0,0,1,0,1,0,0
1,0.035253,-0.221339,-0.098625,-0.236461,0.263577,0.066604,0.16775,0.066086,0.165482,0.104146,...,0,0,0,0,0,1,0,1,0,0
2,0.039602,-0.235574,-0.105093,-0.251194,0.28383,0.071217,0.182953,0.072194,0.178029,0.108141,...,0,0,0,0,0,1,0,1,0,0
3,0.042217,-0.254851,-0.106125,-0.260302,0.306699,0.075431,0.198612,0.085364,0.18826,0.107281,...,0,0,0,1,0,1,0,1,0,1
4,0.036319,-0.215901,-0.097643,-0.232728,0.2619,0.064633,0.168823,0.064793,0.16168,0.103497,...,0,0,0,1,0,1,0,1,0,1


## Splitting the dataset into test and train

In [51]:
columns = list(dataFrame.columns)
X_train, X_test, y_train, y_test = train_test_split(dataFrame[columns[:-10]], dataFrame[columns[-10:]], test_size=0.30,
                                                    random_state=42)
X_train = X_train.drop(columns=['v1_comment'])
X_test = X_test.drop(columns=['v1_comment'])

categories = list(y_train.columns)
for x in categories:
    y_train.loc[y_train[x] > 0, x] = 1
    y_test.loc[y_test[x] > 0, x] = 1


## Training the model

### Random Forest

In [52]:
approach = "Without combined refactoring types"
vectorizer = "FastText"

In [53]:
columns = list(dataFrame.columns)
target_classes =columns[-10:]

In [54]:
classifierChain = ClassifierChain(RandomForestClassifier(class_weight='balanced'))
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Random Forest"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)
print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.97      0.85      0.91       255
'Change Parameter Type       0.94      0.86      0.90       228
   'Change Return Type       0.93      0.82      0.88       200
 'Change Variable Type       0.97      0.88      0.92       262
           'Move Class       0.88      0.88      0.88       111
     'Rename Attribute       0.90      0.82      0.86       194
         'Rename Class       0.80      0.70      0.74       115
        'Rename Method       0.97      0.87      0.92       287
     'Rename Parameter       0.90      0.86      0.88       208
      'Rename Variable       0.90      0.87      0.88       252

             micro avg       0.93      0.85      0.89      2112
             macro avg       0.92      0.84      0.88      2112
          weighted avg       0.93      0.85      0.89      2112
           samples avg       0.92      0.89      0.89      2112

Done with the traini

In [55]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658


### Logistic Regression

In [56]:
classifierChain = ClassifierChain(LogisticRegression())
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Logistic Regression"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)

print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.08      0.90      0.15        20
'Change Parameter Type       0.08      0.85      0.15        20
   'Change Return Type       0.02      0.22      0.04        18
 'Change Variable Type       0.02      0.22      0.03        18
           'Move Class       0.00      0.00      0.00         0
     'Rename Attribute       0.02      0.22      0.04        18
         'Rename Class       0.04      0.22      0.07        18
        'Rename Method       0.02      0.22      0.03        18
     'Rename Parameter       0.00      0.00      0.00        18
      'Rename Variable       0.02      0.28      0.04        18

             micro avg       0.03      0.36      0.06       166
             macro avg       0.03      0.31      0.05       166
          weighted avg       0.03      0.36      0.06       166
           samples avg       0.04      0.01      0.02       166

Done with the traini

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [57]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946


### Hypertuning Random Forest

In [58]:
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Impurity measure
criterion = ['gini','entropy']
n_estimators = [int(x) for x in np.linspace(10, 110, num = 11)]
# Create the params grid
params_random_forest = {'n_estimators' : n_estimators,
                        'max_depth': max_depth,
                        'criterion': criterion,
                       'class_weight':['balanced']}

param_grid = {
    'bootstrap': [True],
    'criterion': criterion,
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200,300],
    'class_weight':['balanced']
}

In [59]:
from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(RandomForestClassifier(class_weight='balanced'), params_random_forest, cv=5,
                               return_train_score=True,
                               n_jobs=-1,
                               refit="accuracy")
    
# Fitting training data on the gridsearch object
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print("-"*40)

{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 50, 'n_estimators': 70}
----------------------------------------


In [60]:
hyp_rf = grid_search.best_estimator_
hyp_rf = ClassifierChain(grid_search.best_estimator_)
hyp_rf.fit(X_train, y_train)

ClassifierChain(classifier=RandomForestClassifier(class_weight='balanced',
                                                  max_depth=50,
                                                  n_estimators=70),
                require_dense=[True, True])

In [61]:
y_pred = hyp_rf.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = hyp_rf.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Hypertuned Random Forest"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)

print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.96      0.86      0.91       247
'Change Parameter Type       0.93      0.87      0.90       224
   'Change Return Type       0.92      0.83      0.87       196
 'Change Variable Type       0.97      0.88      0.92       262
           'Move Class       0.88      0.88      0.88       111
     'Rename Attribute       0.88      0.83      0.85       188
         'Rename Class       0.80      0.70      0.74       115
        'Rename Method       0.95      0.88      0.92       279
     'Rename Parameter       0.91      0.85      0.87       214
      'Rename Variable       0.89      0.88      0.89       246

             micro avg       0.92      0.85      0.89      2082
             macro avg       0.91      0.85      0.88      2082
          weighted avg       0.92      0.85      0.89      2082
           samples avg       0.92      0.89      0.89      2082

Done with the traini

In [62]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946
4,Without combined refactoring types,Hypertuned Random Forest,FastText,0.191781,0.616457,0.518386,0.560757


## Approach: Using TF-IDF Vectorizer

### Using Backup dataframe

In [63]:
dataFrame = dataFrameCopy.copy(deep=True)

### Getting the data ready for carrying out natural language processing

In [64]:
# Drop the nan values from the dataset
dataFrame.dropna(subset=["v1_comment"], inplace=True)
# On hot encoding the values for the Type column
onehotencoded_refactoring_type = pd.get_dummies(dataFrame["Type"])
# Joining two dataframes together for preprocessing 
dataFrame = pd.concat([dataFrame, onehotencoded_refactoring_type], axis=1)
# Grouping by the STAD_ID and V1 comments and then reset the index
dataFrame = dataFrame.groupby(["SATD_id", "v1_comment"]).sum().reset_index()
# Dropping duplicates
dataFrame = dataFrame.drop_duplicates(subset=['SATD_id'])
# Dropping any Na value in the v1 comment
dataFrame.dropna(subset=["v1_comment"], inplace=True)
# Dropping SATD_ID column as V1_comment is the only input column needed
dataFrame = dataFrame.drop(columns=["SATD_id"]).reset_index()
# Dropping index column as V1_comment is the only input column needed
dataFrame = dataFrame.drop(columns=["index"])
# Having a look at the dataset
dataFrame.head()

Unnamed: 0,v1_comment,Project_id,Refactoring_id,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,'TODO: Load analysis data\nFIXME\nresults\n.cs...,22,2491,0,0,0,0,0,1,0,1,0,0
1,'TODO: Load analysis data\nFIXME\nresults\n.cs...,22,2491,0,0,0,0,0,1,0,1,0,0
2,'FIXME: use analysis name',22,2491,0,0,0,0,0,1,0,1,0,0
3,'FIXME:',44,8754,0,0,0,1,0,1,0,1,0,1
4,'TODO: sort func must be independent of param ...,44,8754,0,0,0,1,0,1,0,1,0,1


In [65]:
dataFrame.dropna(subset=["v1_comment"], inplace=True)

dataFrame['v1_comment'] = dataFrame['v1_comment'].apply(preprocess)

In [66]:
dataFrame = dataFrame.drop(columns=["Project_id","Refactoring_id"])

In [67]:
dataFrame.head()

Unnamed: 0,v1_comment,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,todo load analysis data fixme result csv...,0,0,0,0,0,1,0,1,0,0
1,todo load analysis data fixme result csv...,0,0,0,0,0,1,0,1,0,0
2,fixme use analysis name,0,0,0,0,0,1,0,1,0,0
3,fixme,0,0,0,1,0,1,0,1,0,1
4,todo sort func must independent param index,0,0,0,1,0,1,0,1,0,1


### Using TfidfVectorizer for vectorizing

In [68]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Apply Vectorization - TfidfVectorizer function
vectorzier = TfidfVectorizer(min_df=0, lowercase=False)
# Fitting the vectorizer on data 
vectorzier.fit(dataFrame['v1_comment'])
# Transforming the vectorizer on the dataset
vectorOfFeatures = vectorzier.transform(dataFrame['v1_comment'])
# Concatenating the features
dataFrame = pd.concat([pd.DataFrame(vectorOfFeatures.toarray()), dataFrame], axis=1)
# Havnig a look at the dataset
dataFrame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,1,0,1,0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,1,0,1,0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,1,0,1,0,1


In [69]:
columns = list(dataFrame.columns)
X_train, X_test, y_train, y_test = train_test_split(dataFrame[columns[:-10]], dataFrame[columns[-10:]], test_size=0.30,
                                                    random_state=42)
X_train = X_train.drop(columns=['v1_comment'])
X_test = X_test.drop(columns=['v1_comment'])

categories = list(y_train.columns)
for x in categories:
    y_train.loc[y_train[x] > 0, x] = 1
    y_test.loc[y_test[x] > 0, x] = 1

### Training the model Random Forest

In [70]:
vectorizer = "TF-IDF"

columns = list(dataFrame.columns)
target_classes =columns[-10:]

In [71]:
classifierChain = ClassifierChain(RandomForestClassifier(class_weight='balanced'))
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Random Forest"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)

print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.96      0.93      0.94       229
'Change Parameter Type       0.96      0.94      0.95       214
   'Change Return Type       0.93      0.95      0.94       174
 'Change Variable Type       0.97      0.96      0.96       239
           'Move Class       0.96      0.91      0.94       119
     'Rename Attribute       0.90      0.94      0.92       170
         'Rename Class       0.84      0.88      0.86        95
        'Rename Method       0.99      0.94      0.96       271
     'Rename Parameter       0.91      0.94      0.93       193
      'Rename Variable       0.93      0.96      0.95       235

             micro avg       0.94      0.94      0.94      1939
             macro avg       0.94      0.94      0.94      1939
          weighted avg       0.94      0.94      0.94      1939
           samples avg       0.94      0.94      0.94      1939

Done with the traini

In [72]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946
4,Without combined refactoring types,Hypertuned Random Forest,FastText,0.191781,0.616457,0.518386,0.560757
5,Without combined refactoring types,Random Forest,TF-IDF,0.219178,0.649701,0.505386,0.567259


### Hypertuning Random Forest

In [73]:
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Impurity measure
criterion = ['gini','entropy']
n_estimators = [int(x) for x in np.linspace(10, 110, num = 11)]
# Create the params grid
params_random_forest = {'n_estimators' : n_estimators,
                        'max_depth': max_depth,
                        'criterion': criterion,
                       'class_weight':['balanced']}

param_grid = {
    'bootstrap': [True],
    'criterion': criterion,
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300],
    'class_weight':['balanced']
}

In [74]:
grid_search = GridSearchCV(RandomForestClassifier(class_weight='balanced'), param_grid, cv=5,
                               return_train_score=True,
                               n_jobs=-1,
                               refit="f1")
    

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print("-"*40)

{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 110, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 300}
----------------------------------------


In [75]:
hyp_rf = grid_search.best_estimator_

In [76]:
y_pred = hyp_rf.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = hyp_rf.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Hypertuned Random Forest"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)

print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       1.00      0.45      0.62       496
'Change Parameter Type       1.00      0.41      0.59       505
   'Change Return Type       0.99      0.38      0.55       460
 'Change Variable Type       0.99      0.48      0.65       485
           'Move Class       0.47      0.79      0.59        67
     'Rename Attribute       1.00      0.38      0.55       467
         'Rename Class       0.94      0.31      0.47       302
        'Rename Method       1.00      0.54      0.70       480
     'Rename Parameter       0.98      0.42      0.59       471
      'Rename Variable       0.99      0.53      0.69       456

             micro avg       0.96      0.44      0.61      4189
             macro avg       0.94      0.47      0.60      4189
          weighted avg       0.98      0.44      0.60      4189
           samples avg       0.92      0.46      0.56      4189

Done with the traini

In [77]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946
4,Without combined refactoring types,Hypertuned Random Forest,FastText,0.191781,0.616457,0.518386,0.560757
5,Without combined refactoring types,Random Forest,TF-IDF,0.219178,0.649701,0.505386,0.567259
6,Without combined refactoring types,Hypertuned Random Forest,TF-IDF,0.09589,0.440642,0.969936,0.602275


## Logistic Regression

In [78]:
classifierChain = ClassifierChain(LogisticRegression())
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Logistics Regression"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)
print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.72      0.93      0.81       172
'Change Parameter Type       0.55      0.79      0.65       145
   'Change Return Type       0.48      0.73      0.58       117
 'Change Variable Type       0.46      0.72      0.56       151
           'Move Class       0.19      0.72      0.30        29
     'Rename Attribute       0.53      0.66      0.59       142
         'Rename Class       0.35      0.83      0.49        42
        'Rename Method       0.53      0.76      0.63       180
     'Rename Parameter       0.38      0.63      0.47       121
      'Rename Variable       0.37      0.66      0.48       138

             micro avg       0.48      0.75      0.58      1237
             macro avg       0.46      0.74      0.56      1237
          weighted avg       0.50      0.75      0.59      1237
           samples avg       0.36      0.34      0.33      1237

Done with the traini

  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946
4,Without combined refactoring types,Hypertuned Random Forest,FastText,0.191781,0.616457,0.518386,0.560757
5,Without combined refactoring types,Random Forest,TF-IDF,0.219178,0.649701,0.505386,0.567259
6,Without combined refactoring types,Hypertuned Random Forest,TF-IDF,0.09589,0.440642,0.969936,0.602275
7,Without combined refactoring types,Logistics Regression,TF-IDF,0.054795,0.679293,0.32286,0.434882


## Approach: Using count vectorizer

### Using Backup dataframe

In [80]:
dataFrame = dataFrameCopy.copy(deep=True)

### Getting the data ready for carrying out natural language processing

In [81]:
# Drop the nan values from the dataset
dataFrame.dropna(subset=["v1_comment"], inplace=True)
# On hot encoding the values for the Type column
onehotencoded_refactoring_type = pd.get_dummies(dataFrame["Type"])
# Joining two dataframes together for preprocessing 
dataFrame = pd.concat([dataFrame, onehotencoded_refactoring_type], axis=1)
# Grouping by the STAD_ID and V1 comments and then reset the index
dataFrame = dataFrame.groupby(["SATD_id", "v1_comment"]).sum().reset_index()
# Dropping duplicates
dataFrame = dataFrame.drop_duplicates(subset=['SATD_id'])
# Dropping any Na value in the v1 comment
dataFrame.dropna(subset=["v1_comment"], inplace=True)
# Dropping SATD_ID column as V1_comment is the only input column needed
dataFrame = dataFrame.drop(columns=["SATD_id"]).reset_index()
# Dropping index column as V1_comment is the only input column needed
dataFrame = dataFrame.drop(columns=["index"])
# Having a look at the dataset
dataFrame.head()

Unnamed: 0,v1_comment,Project_id,Refactoring_id,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,'TODO: Load analysis data\nFIXME\nresults\n.cs...,22,2491,0,0,0,0,0,1,0,1,0,0
1,'TODO: Load analysis data\nFIXME\nresults\n.cs...,22,2491,0,0,0,0,0,1,0,1,0,0
2,'FIXME: use analysis name',22,2491,0,0,0,0,0,1,0,1,0,0
3,'FIXME:',44,8754,0,0,0,1,0,1,0,1,0,1
4,'TODO: sort func must be independent of param ...,44,8754,0,0,0,1,0,1,0,1,0,1


In [82]:
dataFrame.dropna(subset=["v1_comment"], inplace=True)

dataFrame['v1_comment'] = dataFrame['v1_comment'].apply(preprocess)

In [83]:
dataFrame = dataFrame.drop(columns=["Project_id","Refactoring_id"])

In [84]:
dataFrame.head()

Unnamed: 0,v1_comment,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,todo load analysis data fixme result csv...,0,0,0,0,0,1,0,1,0,0
1,todo load analysis data fixme result csv...,0,0,0,0,0,1,0,1,0,0
2,fixme use analysis name,0,0,0,0,0,1,0,1,0,0
3,fixme,0,0,0,1,0,1,0,1,0,1
4,todo sort func must independent param index,0,0,0,1,0,1,0,1,0,1


### Using Count Vectorizer

In [85]:
# Apply Vectorization - CountVectorizer function
vectorzier = CountVectorizer(min_df=0, lowercase=False)
# Fitting the vectorizer on data 
vectorzier.fit(dataFrame['v1_comment'])
# Transforming the vectorizer on the dataset
vectorOfFeatures = vectorzier.transform(dataFrame['v1_comment'])
# Concatenating the features
dataFrame = pd.concat([pd.DataFrame(vectorOfFeatures.toarray()), dataFrame], axis=1)
# Havnig a look at the dataset
dataFrame.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,'Change Attribute Type,'Change Parameter Type,'Change Return Type,'Change Variable Type,'Move Class,'Rename Attribute,'Rename Class,'Rename Method,'Rename Parameter,'Rename Variable
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,1


In [86]:
# split the data set
columns = list(dataFrame.columns)
X_train, X_test, y_train, y_test = train_test_split(dataFrame[columns[:-10]], dataFrame[columns[-10:]], test_size=0.30,
                                                    random_state=42)
X_train = X_train.drop(columns=['v1_comment'])
X_test = X_test.drop(columns=['v1_comment'])

categories = list(y_train.columns)
for x in categories:
    y_train.loc[y_train[x] > 0, x] = 1
    y_test.loc[y_test[x] > 0, x] = 1

### Training the model with Random Forest

In [87]:
vectorizer = "Count Vectorizer"

columns = list(dataFrame.columns)
target_classes =columns[-10:]

classifierChain = ClassifierChain(RandomForestClassifier(class_weight='balanced'))
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Random Forest"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)

print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.96      0.92      0.94       233
'Change Parameter Type       0.96      0.94      0.95       214
   'Change Return Type       0.94      0.93      0.94       179
 'Change Variable Type       0.98      0.94      0.96       246
           'Move Class       0.94      0.93      0.93       113
     'Rename Attribute       0.92      0.93      0.92       174
         'Rename Class       0.84      0.88      0.86        96
        'Rename Method       0.98      0.95      0.96       267
     'Rename Parameter       0.92      0.94      0.93       195
      'Rename Variable       0.93      0.96      0.95       235

             micro avg       0.94      0.94      0.94      1952
             macro avg       0.94      0.93      0.93      1952
          weighted avg       0.95      0.94      0.94      1952
           samples avg       0.94      0.94      0.94      1952

Done with the traini

In [88]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946
4,Without combined refactoring types,Hypertuned Random Forest,FastText,0.191781,0.616457,0.518386,0.560757
5,Without combined refactoring types,Random Forest,TF-IDF,0.219178,0.649701,0.505386,0.567259
6,Without combined refactoring types,Hypertuned Random Forest,TF-IDF,0.09589,0.440642,0.969936,0.602275
7,Without combined refactoring types,Logistics Regression,TF-IDF,0.054795,0.679293,0.32286,0.434882
8,Without combined refactoring types,Random Forest,Count Vectorizer,0.255708,0.59697,0.688918,0.637337


### Hypertuning Random Forest

In [89]:
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Impurity measure
criterion = ['gini','entropy']
n_estimators = [int(x) for x in np.linspace(10, 110, num = 11)]
# Create the params grid
params_random_forest = {'n_estimators' : n_estimators,
                        'max_depth': max_depth,
                        'criterion': criterion,
                       'class_weight':['balanced']}

param_grid = {
    'bootstrap': [True],
    'criterion': criterion,
    'max_depth': [80, 90, 100, 110],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200],
    'class_weight':['balanced']
}

In [90]:
grid_search = GridSearchCV(RandomForestClassifier(class_weight='balanced'), param_grid, cv=5,
                               return_train_score=True,
                               n_jobs=-1,
                               refit="f1")
    
grid_search.fit(X_train, y_train)

print(grid_search.best_params_)
print("-"*40)

{'bootstrap': True, 'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 90, 'min_samples_leaf': 3, 'min_samples_split': 10, 'n_estimators': 200}
----------------------------------------


In [91]:
hyp_rf = grid_search.best_estimator_

In [92]:
y_pred = hyp_rf.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = hyp_rf.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))


modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Hypertuned Random Forest"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)

print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.99      0.44      0.61       504
'Change Parameter Type       1.00      0.41      0.58       510
   'Change Return Type       0.99      0.37      0.54       474
 'Change Variable Type       1.00      0.48      0.64       496
           'Move Class       0.42      0.72      0.53        65
     'Rename Attribute       1.00      0.36      0.53       487
         'Rename Class       0.94      0.25      0.39       377
        'Rename Method       1.00      0.52      0.69       490
     'Rename Parameter       0.98      0.41      0.58       478
      'Rename Variable       0.99      0.50      0.66       485

             micro avg       0.96      0.42      0.59      4366
             macro avg       0.93      0.45      0.58      4366
          weighted avg       0.98      0.42      0.59      4366
           samples avg       0.92      0.44      0.54      4366

Done with the traini

In [93]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946
4,Without combined refactoring types,Hypertuned Random Forest,FastText,0.191781,0.616457,0.518386,0.560757
5,Without combined refactoring types,Random Forest,TF-IDF,0.219178,0.649701,0.505386,0.567259
6,Without combined refactoring types,Hypertuned Random Forest,TF-IDF,0.09589,0.440642,0.969936,0.602275
7,Without combined refactoring types,Logistics Regression,TF-IDF,0.054795,0.679293,0.32286,0.434882
8,Without combined refactoring types,Random Forest,Count Vectorizer,0.255708,0.59697,0.688918,0.637337
9,Without combined refactoring types,Hypertuned Random Forest,Count Vectorizer,0.082192,0.433108,0.978155,0.595496


## Logistic Regression

In [94]:
classifierChain = ClassifierChain(LogisticRegression())
classifierChain.fit(X_train, y_train)
y_pred = classifierChain.predict(X_train)
print("training results")
print(classification_report(y_pred, y_train,target_names=target_classes))
print('Done with the training ')
y_pred = classifierChain.predict(X_test)
print("*"*40)
print("test data results")
print(classification_report(y_pred,y_test,target_names=target_classes,zero_division=0))

modeleval = {}
modeleval["Approach"] = approach
modeleval["Model"] = "Logistic Regression"
modeleval["Vectorizer"] = vectorizer
modeleval["Accuracy"] = accuracy_score(y_pred,y_test)
modeleval["Recall"] = recall_score(y_pred,y_test,average='weighted')
modeleval["Precision"] =precision_score(y_pred,y_test,average='weighted')
modeleval["F1 Score"] = f1_score(y_pred,y_test,average='weighted')

models_performance = models_performance.append(modeleval,ignore_index=True)


print(accuracy_score(y_pred,y_test))
print("*"*40)

training results
                        precision    recall  f1-score   support

'Change Attribute Type       0.88      0.92      0.90       213
'Change Parameter Type       0.79      0.87      0.83       190
   'Change Return Type       0.77      0.86      0.81       159
 'Change Variable Type       0.79      0.89      0.84       209
           'Move Class       0.61      0.97      0.75        70
     'Rename Attribute       0.79      0.86      0.82       162
         'Rename Class       0.61      0.78      0.69        78
        'Rename Method       0.84      0.90      0.87       240
     'Rename Parameter       0.72      0.85      0.78       170
      'Rename Variable       0.79      0.88      0.83       218

             micro avg       0.78      0.88      0.83      1709
             macro avg       0.76      0.88      0.81      1709
          weighted avg       0.78      0.88      0.83      1709
           samples avg       0.76      0.79      0.75      1709

Done with the traini

  _warn_prf(average, modifier, msg_start, len(result))


In [95]:
models_performance

Unnamed: 0,Approach,Model,Vectorizer,Accuracy,Recall,Precision,F1 Score
0,Combined refactoring Types,Random Forest,Count Vectorizer,0.202597,0.202597,0.232974,0.214882
1,Combined refactoring Types,Logistic Regression,Count Vectorizer,0.311688,0.311688,0.400225,0.342016
2,Without combined refactoring types,Random Forest,FastText,0.210046,0.623641,0.537827,0.575658
3,Without combined refactoring types,Logistic Regression,FastText,0.0,0.388889,0.017789,0.033946
4,Without combined refactoring types,Hypertuned Random Forest,FastText,0.191781,0.616457,0.518386,0.560757
5,Without combined refactoring types,Random Forest,TF-IDF,0.219178,0.649701,0.505386,0.567259
6,Without combined refactoring types,Hypertuned Random Forest,TF-IDF,0.09589,0.440642,0.969936,0.602275
7,Without combined refactoring types,Logistics Regression,TF-IDF,0.054795,0.679293,0.32286,0.434882
8,Without combined refactoring types,Random Forest,Count Vectorizer,0.255708,0.59697,0.688918,0.637337
9,Without combined refactoring types,Hypertuned Random Forest,Count Vectorizer,0.082192,0.433108,0.978155,0.595496


In [96]:
print("--- Notebook took %s minutes to complete ---" % ((time.time() - start_time)/60))

--- Notebook took 3.4123854517936705 minutes to complete ---
