In [2]:
import pandas as pd
import numpy as np

#text pre-processing
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
#from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

#model-building
import sklearn
from sklearn.model_selection import train_test_split


## Importing data

In [3]:
# Read the raw dataset, then pull out the message text and the class label
# as two separate Series (both keep the DataFrame's index).
df = pd.read_csv('raw_commits.csv')
commits, ref_type = df['Commit message'], df['Class']

## Splitting into training and test data to evaluate each method

In [4]:
X, y = commits, ref_type

# Hold out 20% of the rows for testing. The split is stratified on the class
# label and uses a fixed random_state so it is reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=99,
    stratify=y,
)

## Defining a function for lower-casing and stripping noise and punctuation
Used for all datasets/models.

In [5]:
#lowercase, strip, remove punctuation
def preprocess(text):
    """Normalise a raw commit message into lower-case, space-separated words.

    Steps, in order:
      1. lower-case and strip surrounding whitespace;
      2. drop anything between ``<`` and ``>`` on one line (markup tags);
      3. replace ``http://`` / ``https://`` links with a space;
      4. replace bracketed numeric references such as ``[12]`` with a space;
      5. replace ASCII punctuation with a space;
      6. delete any remaining character that is neither a word character
         nor whitespace;
      7. replace digits with a space;
      8. collapse whitespace runs to one space and strip the ends.

    Parameters
    ----------
    text : str
        The raw message. Must be a ``str``; other types raise
        ``AttributeError`` from ``.lower()``.

    Returns
    -------
    str
        The cleaned text, with no leading/trailing whitespace and no
        consecutive spaces. May be the empty string.
    """
    text = text.lower().strip()

    # Markup tags are removed without inserting a space.
    # NOTE: this also removes generic-type text such as "list<string>".
    text = re.sub(r'<.*?>', '', text)

    # Only real URLs are removed. The scheme separator is required so that
    # ordinary identifiers starting with "http" (e.g. "httpclient") survive.
    text = re.sub(r'https?://\S+', ' ', text)

    # Bracketed numeric references must go before punctuation stripping,
    # otherwise the brackets are already gone and the pattern never matches.
    text = re.sub(r'\[[0-9]*\]', ' ', text)

    # ASCII punctuation (including "_") becomes a word separator.
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)

    # Whatever non-word, non-space characters remain are deleted outright.
    text = re.sub(r'[^\w\s]', '', text)

    text = re.sub(r'\d', ' ', text)

    # Final whitespace normalisation; the strip removes the space left behind
    # when the message started or ended with digits or punctuation.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

## Defining methods for further preprocessing: stop-word removal and lemmatization
To be used with datasets/models for replication of AlOmar et al.'s models and with Word2Vec

In [6]:
#stopword removal
def stopword(string):
    """Remove English stop words from a whitespace-delimited string.

    Parameters
    ----------
    string : str
        Text whose tokens are separated by whitespace.

    Returns
    -------
    str
        The tokens that are not in NLTK's English stop-word list, joined
        with single spaces. Matching is exact (case-sensitive).
    """
    # Load the stop-word list once per call and keep it in a set; evaluating
    # stopwords.words('english') inside the filter re-reads it for every token.
    english_stopwords = set(stopwords.words('english'))
    kept = [token for token in string.split() if token not in english_stopwords]
    return ' '.join(kept)

In [7]:
#Lemmatization
# Single lemmatizer instance shared by lemmatizer() below.
wl = WordNetLemmatizer()


def get_wordnet_pos(tag):
    """Translate a POS tag string into the corresponding WordNet POS constant.

    The decision is made on the tag's leading letter:
    ``J`` -> adjective, ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb.
    Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def lemmatizer(string):
    """Lemmatize every token of ``string`` using its part-of-speech tag.

    The text is tokenized, each token is POS-tagged, the tag is mapped to a
    WordNet POS via ``get_wordnet_pos``, and the resulting lemmas are joined
    back together with single spaces.
    """
    tagged_tokens = nltk.pos_tag(word_tokenize(string))  # (token, POS tag) pairs
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

## Preprocessing dataset for replication and Word2Vec

In [8]:
#final preprocessing
def finalpreprocess(string):
    """Run the full pipeline: preprocess, then stop-word removal, then lemmatization."""
    cleaned = preprocess(string)
    without_stopwords = stopword(cleaned)
    return lemmatizer(without_stopwords)
# Run the full pipeline over each split independently.
rep_train = X_train.apply(finalpreprocess)
rep_test = X_test.apply(finalpreprocess)

In [9]:
# Put each preprocessed message next to its label, aligned on the row index.
replic_preproc_train = pd.concat(
    [rep_train, y_train],
    axis=1,
    join='inner',
)
replic_preproc_test = pd.concat(
    [rep_test, y_test],
    axis=1,
    join='inner',
)
replic_preproc_train.head()

Unnamed: 0,Commit message,Class
1855,fix bug rulesets disable properly also add com...,inline
3788,minor improvement probe,push down
2336,rf remove unused formal parameter,inline
2819,sgf provide support sdg xml namespace load pre...,pull up
835,remove usage deprecate method collection key,move


In [10]:
# replic_preproc_train.to_csv('replication_preproc_train.csv', index=False)
# replic_preproc_test.to_csv('replication_preproc_test.csv',index=False)

## Preprocessing dataset for BERT and fastText

In [11]:
def bert_ft_finalpreprocess(string):
    """Apply only ``preprocess`` — no stop-word removal and no lemmatization."""
    cleaned = preprocess(string)
    return cleaned

# Run the light-weight cleaning over each split independently.
bert_ft_train = X_train.apply(bert_ft_finalpreprocess)
bert_ft_test = X_test.apply(bert_ft_finalpreprocess)

In [12]:
# Put each cleaned message next to its label, aligned on the row index.
bert_ft_preproc_train = pd.concat(
    [bert_ft_train, y_train],
    axis=1,
    join='inner',
)
bert_ft_preproc_test = pd.concat(
    [bert_ft_test, y_test],
    axis=1,
    join='inner',
)
bert_ft_preproc_train.head()

Unnamed: 0,Commit message,Class
1855,fixed bug where rulesets were not being disabl...,inline
3788,a few minor improvements to probes,push down
2336,rf remove unused formal parameter,inline
2819,sgf provide support in the sdg xml namespace t...,pull up
835,removed usage of the deprecated method collect...,move


In [13]:
# bert_ft_preproc_train.to_csv('bert_ft_preproc_train.csv',index=False)
# bert_ft_preproc_test.to_csv('bert_ft_preproc_test.csv',index=False)