# Code featurizer

This notebook turns each line of code into a bag-of-words feature vector.

In [35]:
# configuration
from sklearn.feature_extraction.text import CountVectorizer
import re
import pandas as pd

# Regular expression listing the delimiters in the code (extra stop words):
# whitespace, brackets, operators, quotes and punctuation.
# source: most common programming languages
# The character class sits inside a capturing group, so re.split() returns the
# matched delimiters alongside the text between them.
# Written as a raw string so that regex escapes such as \s, \( or \[ reach the
# `re` module untouched instead of being invalid Python string escapes.
CODE_STOP_DELIM = r"([\s\t\(\)\[\]{}!@#$%^&*\/\+\-=;:\\|`'\"~,.<>/?\n])"

# number of words in the resulting BoW dictionary
number_of_words = 1000

# input filename
input_filename = './gerrit_review_comments_dictionary_sentiment_wireshark.csv'

# save filename
save_filename = './gerrit_review_for_classifier_wireshark.csv'

In [16]:
# BoW tokenizer
def code_stop_words_tokenizer(line):
    """Split one line of code into bag-of-words tokens.

    The line is split with the module-level ``CODE_STOP_DELIM`` regular
    expression. Empty strings produced by the split are dropped, and every
    remaining token that consists only of digits is replaced by ``"0"``.

    Parameters
    ----------
    line : str
        The text to tokenize.

    Returns
    -------
    list of str
        The tokens, in the order in which they appear in ``line``.
    """
    return [
        "0" if piece.isdigit() else piece
        for piece in re.split(CODE_STOP_DELIM, line)
        if piece != ''
    ]

In [19]:
# small hand-written sample of code lines; no code visible in this file reads it
lines = [
    "temp = a;",
    "a = b;",
    "b = temp;",
]

# load the '$'-separated input file into a data frame
inputDF = pd.read_csv(input_filename, sep="$")

# take the 'LOC' column, replace missing values (np.nan) with '' and
# materialize the result as a plain list
linesList = list(inputDF["LOC"].fillna(value=""))

In [33]:
# vectorizer that counts unigram tokens produced by the custom tokenizer and
# keeps at most `number_of_words` of them in its vocabulary
count_vect = CountVectorizer(max_features=number_of_words,
                             tokenizer=code_stop_words_tokenizer,
                             ngram_range=(1, 1))

# creating the bag of words vectors: one row per input line, one column per token.
# toarray() yields a plain ndarray; todense() would return the np.matrix type,
# which NumPy no longer recommends.
bag_of_words = count_vect.fit_transform(linesList).toarray()

# column names are the BoW tokens, ordered by the column index that the
# vectorizer assigned to each token (vocabulary_ maps token -> column index),
# so every name is guaranteed to label its own column
colnames = sorted(count_vect.vocabulary_, key=count_vect.vocabulary_.get)

# create a data frame from the BoW matrix, with one named column per token
lines_bow = pd.DataFrame(bag_of_words, columns=colnames)

# concatenate the lines and their corresponding BoW representation into a single data frame
linesCommentsDF = pd.concat([inputDF, lines_bow], axis=1)

In [34]:
# preview the first rows of the combined data frame
linesCommentsDF.head()

Unnamed: 0,filename,LOC,class_value,\t,Unnamed: 5,!,"""",#,$,%,...,wsapp,wsdg_html_chunked,www,xml,y,you,zstd,{,|,}
0,.github/workflows/close_pr.yml,"comment: ""We do not accept PRs. Patche...",0,0,21,0,2,0,0,0,...,0,1,1,0,0,0,0,0,0,0
1,epan/dissectors/packet-tls-utils.c,"tvb, offset, next_offset -...",0,0,25,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,epan/dissectors/packet-tls-utils.c,"tvb, offset, next_offset -...",1,0,25,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,epan/dissectors/packet-tls-utils.c,"tvb, offset, next_offset -...",0,0,25,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,epan/dissectors/packet-tls-utils.c,"tvb, offset, next_offset -...",0,0,25,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [37]:
# write the combined data frame to `save_filename` as a .csv file,
# using $ as the separator and leaving out the index column
linesCommentsDF.to_csv(save_filename, sep="$", index=False)