# Dataset creation from the CodeSearchNet Challenge dataset (Microsoft)

In [None]:
# install dependencies
# install Pytorch for neural network model
! pip install -q torch==1.4.0 -f https://download.pytorch.org/whl/cu101/torch_stable.html

In [None]:
# Download the CodeSearchNet Challenge dataset for the Java programming language.
# wget runs quietly (-q) and stores the archive as java.zip in the working directory.
! wget -q https://s3.amazonaws.com/code-search-net/CodeSearchNet/v2/java.zip

In [None]:
# Unzip the dataset archive quietly (-qq); later cells read the files under java/final/jsonl/.
! unzip -qq java.zip

In [None]:
# decompress this gzip file
!gzip -d java/final/jsonl/train/java_train_0.jsonl.gz

In [None]:
# Read the decompressed shard (one JSON record per line) and show its first record.
# The encoding is passed explicitly because open() otherwise falls back to the
# platform's locale encoding, which is not UTF-8 everywhere.
with open('java/final/jsonl/train/java_train_0.jsonl', 'r', encoding='utf-8') as f:
    sample_file = f.readlines()
sample_file[0]

'{"repo": "spring-projects/spring-boot", "path": "spring-boot-project/spring-boot/src/main/java/org/springframework/boot/context/properties/bind/IndexedElementsBinder.java", "func_name": "IndexedElementsBinder.bindIndexed", "original_string": "protected final void bindIndexed(ConfigurationPropertyName name, Bindable<?> target,\\n\\t\\t\\tAggregateElementBinder elementBinder, ResolvableType aggregateType,\\n\\t\\t\\tResolvableType elementType, IndexedCollectionSupplier result) {\\n\\t\\tfor (ConfigurationPropertySource source : getContext().getSources()) {\\n\\t\\t\\tbindIndexed(source, name, target, elementBinder, result, aggregateType,\\n\\t\\t\\t\\t\\telementType);\\n\\t\\t\\tif (result.wasSupplied() && result.get() != null) {\\n\\t\\t\\t\\treturn;\\n\\t\\t\\t}\\n\\t\\t}\\n\\t}", "language": "java", "code": "protected final void bindIndexed(ConfigurationPropertyName name, Bindable<?> target,\\n\\t\\t\\tAggregateElementBinder elementBinder, ResolvableType aggregateType,\\n\\t\\t\\tRes

In [None]:
import json
from pprint import pprint
# Parse the first JSON line and pretty-print it to inspect the fields of a record.
pprint(json.loads(sample_file[0]))

{'code': 'protected final void bindIndexed(ConfigurationPropertyName name, '
         'Bindable<?> target,\n'
         '\t\t\tAggregateElementBinder elementBinder, ResolvableType '
         'aggregateType,\n'
         '\t\t\tResolvableType elementType, IndexedCollectionSupplier result) '
         '{\n'
         '\t\tfor (ConfigurationPropertySource source : '
         'getContext().getSources()) {\n'
         '\t\t\tbindIndexed(source, name, target, elementBinder, result, '
         'aggregateType,\n'
         '\t\t\t\t\telementType);\n'
         '\t\t\tif (result.wasSupplied() && result.get() != null) {\n'
         '\t\t\t\treturn;\n'
         '\t\t\t}\n'
         '\t\t}\n'
         '\t}',
 'code_tokens': ['protected',
                 'final',
                 'void',
                 'bindIndexed',
                 '(',
                 'ConfigurationPropertyName',
                 'name',
                 ',',
                 'Bindable',
                 '<',
                 '?',

In [None]:
# add additional libraries
import pandas as pd

from pathlib import Path
from typing import List, Optional

In [None]:
# Dataset creation from the CodeSearchNet Challenge dataset

# Convert a list of JSON-lines files into a DataFrame:
# load a list of jsonl.gz files into a single pandas DataFrame

def jsonl_list_to_dataframe(file_list, columns=None):
    """Load gzip-compressed JSON-lines files into one DataFrame.

    :param file_list: iterable of paths to ``.jsonl.gz`` files; every file must
        provide all of the requested columns
    :param columns: column names to keep from each file; defaults to
        ``['code', 'docstring']``
    :returns: the selected columns of all files, concatenated in ``file_list``
        order. Each file keeps its own row index, so index values can repeat.
    :raises ValueError: if ``file_list`` is empty
    """
    # A None sentinel avoids sharing one mutable default list between calls.
    if columns is None:
        columns = ['code', 'docstring']

    files = list(file_list)
    if not files:
        # pd.concat would raise a ValueError here as well, but without saying
        # which input was empty.
        raise ValueError("file_list is empty: no jsonl.gz files to load")

    frames = [
        pd.read_json(f, orient='records', compression='gzip', lines=True)[columns]
        for f in files
    ]
    return pd.concat(frames, sort=False)

In [None]:
# Build the DataFrames
# Load each data split (train/valid/test) and convert it to a DataFrame

def get_dfs(path: Path) -> List[pd.DataFrame]:
    """Load the train, valid and test splits stored below ``path``.

    For each split, every ``*.gz`` file found recursively under
    ``path/<split>`` is loaded (in sorted order) and the ``code`` and
    ``docstring`` columns are renamed to ``mthd`` and ``cmt``.

    :param path: directory holding the ``train``, ``valid`` and ``test`` folders
    :returns: list ``[train, valid, test]`` of DataFrames
    """
    renames = {'code': 'mthd', 'docstring': 'cmt'}

    def load_split(split_name):
        gz_files = sorted((path / split_name).glob("**/*.gz"))
        return jsonl_list_to_dataframe(gz_files).rename(columns=renames)

    return [load_split(split_name) for split_name in ("train", "valid", "test")]

In [None]:
# Load the training, validation and test data as three separate DataFrames
path = Path('.')
df_trn, df_val, df_tst = get_dfs(path/"java/final/jsonl")
df_trn.head()

Unnamed: 0,mthd,cmt
0,public final void deleteSnapshot(ProjectSnapsh...,Removes an existing snapshot. Snapshots are us...
1,public final void deleteSnapshot(String snapsh...,Removes an existing snapshot. Snapshots are us...
2,public Table reloadTableWithFields(TableField ...,[VARIABLE TableField.NUM_ROWS]
3,public Table update() {\n // [START ]\n ...,[TARGET update(TableOption...)]
4,"public InsertAllResponse insert(String rowId1,...","[VARIABLE ""rowId2""]"


In [None]:
# Shuffle every split. `sample` is the fraction of rows to keep (1 keeps all of
# them); lower it to experiment on a subset.
# The fixed seed makes the shuffle reproducible, and with it every positional
# lookup further down that depends on the row order (e.g. df_trn["mthd"].iloc[2]).
sample = 1
seed = 42
df_trn = df_trn.sample(frac = sample, random_state = seed)
df_val = df_val.sample(frac = sample, random_state = seed)
df_tst = df_tst.sample(frac = sample, random_state = seed)

len(df_trn), len(df_val), len(df_tst)

(424451, 15328, 26909)

In [None]:
df_trn.head(10)

Unnamed: 0,mthd,cmt
24742,private void repeatConfigUntilUnsubscribed(fin...,Helper method to push configs until unsubscrib...
9385,"public Map<String, String> getControls(Version...","Returns the control settings, adjusted for the..."
24465,public static MimeMessage createMimeMessage(Se...,"Creates a mime-message, multipart if attacheme..."
4573,public static String getApplicationHashKey(Str...,"Your app key hash is required for example, for..."
8562,public void handleException(Throwable e) {\n ...,/////////////////////////////
302,private void mapClassResourceEntry(String file...,Maps (class) resources to files.\n\n@param fil...
3593,public ArrayNode filter(ComparisonExpression e...,Allows filtering values in a ArrayNode as per ...
2772,protected void fireFrameReceived(final CEMI fr...,Fires a frame received event ({@link KNXListen...
19134,public static final <T extends Date> Function<...,<p>\nIt creates an {@link Interval} from the i...
24045,public void removeAllQueriesAndGroups() {\n\t\...,Removes all the elements from the vector Query...


# Extract inline code and comments from the methods to create a new dataset of inline code/comment pairs

In [None]:
#get inline pairs

from tqdm.auto import tqdm

def get_inline_pairs(mthd):
    """Extract (code snippet, inline comment) pairs from the text of one method.

    The text is scanned line by line. A line containing ``//`` starts (or
    extends) a group of lines; the following lines are added to that group
    until a terminating line is reached (see the comments in the loop).
    Every non-empty group with fewer than 5 lines is then split into its
    ``//`` lines (the comment) and its remaining lines (the code).

    :param mthd: source code of a method as one newline-separated string
    :returns: DataFrame with the columns ``mthd`` (code snippet) and ``cmt``
        (inline comment), one row per extracted pair; lines keep their
        original whitespace and are joined with newlines
    """
    # List of line groups; the last entry is the group currently being built.
    pairs = [[]]

    comment = False   # True while lines are being collected for a group
    bracket = False   # True once a '{' line was seen after a comment line
    indent_lvl = -1   # indentation measured on the latest comment line; -1 = none yet
    lines = mthd.split("\n")
    for line in lines:
        # Plain substring test: trailing comments after code and "//" inside
        # string literals match as well.
        if "//" in line and not bracket:
            pairs[-1].append(line)
            # Indentation = number of tabs anywhere in the line, or, for a line
            # without tabs, number of spaces in the text before the first "//".
            if '\t' in line:
                indent_lvl = line.count('\t')
            else:
                indent_lvl = line.split("//")[0].count(' ')
            comment = True
            # No effect: this branch is only reached while bracket is already False.
            bracket = False
        elif comment:
            if '{' in line and not bracket:
                # First opening brace after the comment: keep the line and
                # remember that a block was opened.
                bracket = True
                pairs[-1].append(line)
            elif '}' in line:
                # Measure the indentation the same way as for comment lines.
                line_indent = -1
                if '\t' in line:
                    line_indent = line.count('\t')
                else:
                    line_indent = line.split("//")[0].count(' ')
                # The closing-brace line is kept only when it is indented like the comment.
                if indent_lvl == line_indent:
                    # print("indent lvl:", indent_lvl)
                    # print(pairs[-1], line)
                    pairs[-1].append(line)
                # A '}' with no block opened after the comment ends the group.
                # NOTE(review): when bracket is True nothing in this function sets
                # it back to False, so after the first '{' that follows a comment
                # no later "//" line can start a group - confirm this is intended.
                if not bracket:
                    # print("Hit bracket, terminating pair", line)
                    pairs.append([])
                    comment = False
                    bracket = False
            # NOTE(review): `and` binds tighter than `or`, so this reads
            # `line.isspace() or (line == '' and not bracket)`: a whitespace-only
            # line always ends the group, an empty line only outside a block.
            # Confirm whether `(... or ...) and not bracket` was meant.
            elif line.isspace() or line == '' and not bracket:
                pairs.append([])
                comment = False
            # elif "//" in line and not bracket:
            #     indent_lvl = line.count('\t')
            #     pairs.append([line])
            else:
                # Any other line belongs to the current group.
                pairs[-1].append(line)



    # Convert pairs into proper format of (code snippet, inline comment) dataframe
    code_snippets   = []
    comments        = []
    for pair in pairs:
        # Skip empty groups and groups of 5 or more lines (comment lines included).
        if pair and len(pair) < 5:
            code    = []
            comment = []   # reuses the name of the loop flag above, now as a list of lines
            skip = False   # never read
            for line in pair:
                # A TODO line stops the scan of this group; lines collected
                # before it are still used below.
                if "TODO" in line: break
                if "//" in line:
                    comment.append(line)
                else:
                    code.append(line)
            # Keep the group only when it has both code and comment lines.
            if len(code) > 0 and len(comment) > 0:
                code_snippets.append('\n'.join(code))
                comments.append('\n'.join(comment))

    pairs = pd.DataFrame(zip(code_snippets, comments), columns = ["mthd", "cmt"])
    return pairs

In [None]:
# Print the positions, among the first 100 training rows, of the methods that contain "//"

# NOTE: this rebinds the name `tqdm`, which an earlier cell imported from tqdm.auto.
from tqdm.notebook import tqdm
for i in tqdm(range(100)):
    if "//" in df_trn["mthd"].iloc[i]: print(i)

  0%|          | 0/100 [00:00<?, ?it/s]

0
2
4
5
18
19
28
30
32
36
37
43
44
46
48
49
56
60
62
66
70
74
77
80
95
98


In [None]:
# Extract the inline (code, comment) pairs of the third training method and count them
pairs = get_inline_pairs(df_trn["mthd"].iloc[2])
len(pairs)

3

In [None]:
# Print the source of the third training method (the input of the pair extraction)
print(df_trn["mthd"].iloc[2])

public static MimeMessage createMimeMessage(Session session, String from, String[] to, String subject, String content, DataSource[] attachments) {

		logger.debug("Creates a mime message with {} attachments", (attachments == null) ? 0 : attachments.length);
		
		try {
			MimeMessage message = new MimeMessage(session);  
			
			if (from != null) {
				message.setSender(new InternetAddress(from));  
			}
			
			if (subject != null) {
				message.setSubject(subject, "UTF-8");  
			}
			
			if (to != null) {
				for (String toAdr : to) {
					message.addRecipient(Message.RecipientType.TO, new InternetAddress(toAdr));  
				}
			}

			if (attachments == null || attachments.length == 0) {
				// Setup a plain text message
				message.setContent(content, "text/plain; charset=UTF-8");			
				
			} else {
				// Setup a multipart message 
				Multipart multipart = new MimeMultipart(); 
				message.setContent(multipart);

				// Create the message part 
				BodyPart messageBodyPart = new MimeBod

In [None]:
# First extracted pair as (code snippet, inline comment)
pairs["mthd"].iloc[0], pairs["cmt"].iloc[0]

('\t\t\t\tmessage.setContent(content, "text/plain; charset=UTF-8");\t\t\t', '\t\t\t\t// Setup a plain text message')

In [None]:
# Second extracted pair as (code snippet, inline comment)
pairs["mthd"].iloc[1], pairs["cmt"].iloc[1]

('\t\t\t\tMultipart multipart = new MimeMultipart(); \n\t\t\t\tmessage.setContent(multipart);', '\t\t\t\t// Setup a multipart message ')

In [None]:
# Third extracted pair as (code snippet, inline comment)
pairs["mthd"].iloc[2], pairs["cmt"].iloc[2]

('\t\t\t\tBodyPart messageBodyPart = new MimeBodyPart();\n\t\t\t\tmessageBodyPart.setContent(content, "text/plain; charset=UTF-8");\t\t\t\n\t\t\t\tmultipart.addBodyPart(messageBodyPart);', '\t\t\t\t// Create the message part ')

Concatenate the original dataset with the extracted inline pairs

In [None]:
# add inline comment functions
def add_inline(df):
    """Append the inline (code, comment) pairs mined from ``df`` to ``df``.

    Every method whose text contains ``//`` is passed to ``get_inline_pairs``
    and the pairs found are added below the original rows.

    :param df: DataFrame with a string column ``mthd``
    :returns: new DataFrame with the rows of ``df`` followed by the extracted
        pairs; ``df`` itself is not modified. The index is not reset, so index
        values can repeat.
    """
    # regex=False: "//" is a literal marker, not a pattern.
    with_inline = df[df['mthd'].str.contains("//", regex=False)]
    pair_frames = [get_inline_pairs(mthd) for mthd in tqdm(list(with_inline['mthd']))]
    # A single concat that includes df also covers the case where no method
    # contains "//": concatenating an empty list of frames raises ValueError.
    return pd.concat([df] + pair_frames)

In [None]:
# Append the extracted inline pairs to every split.
# NOTE(review): each name is rebound to its own augmented version, so running
# this cell a second time extracts and appends the pairs again.
df_trn = add_inline(df_trn)
df_val = add_inline(df_val)
df_tst = add_inline(df_tst)

len(df_trn), len(df_val), len(df_tst)

  0%|          | 0/72060 [00:00<?, ?it/s]

  0%|          | 0/2037 [00:00<?, ?it/s]

  0%|          | 0/5199 [00:00<?, ?it/s]

(324029, 11370, 23074)

In [None]:
# Show the last rows of the training split, where add_inline appends the inline pairs
df_trn.tail(10)

Unnamed: 0,mthd,cmt
434534,adserviceinterface adservice = adwordsservices...,// get the adservice.
434535,adoperation operation = new adoperation(); ope...,// creates ad group ad operation and adds it t...
434536,"extractfields(entity, factoryproperty, list, p...",// extract fields directly from class
434537,try { zeroargumentconstructor = getclassofobje...,//no public zero argument constructor availabl...
434539,return;,// ignore the unique index created for the pri...
434541,for (resourceevents resourcelistener : _listen...,// deliver the onrelease event to the register...
434542,"_containercontext.removeresourcecontext(this, ...",// unregister this resourcecontext with associ...
434543,_hasacquired = false;,// reset the flag to indicate resources have b...
434545,t retvalue = data; data = null; return retvalue;,// return and clear current
434546,return typeframe.gettoptype();,// this probably means that we're looking at a...


# Data cleaning with Preprocessing

**1.remove any examples whose comments contain non-ascii characters to simplify the problem**

In [None]:
def is_ascii(s):
    '''
    Determines if the given string contains only ascii characters

    :param s: the string to check
    :returns: whether or not the given string contains only ascii characters
    '''
    try:
        # Any character outside ASCII makes encode() raise UnicodeEncodeError.
        # That includes lone surrogates, which cannot be encoded to UTF-8 either.
        s.encode('ascii')
    except UnicodeEncodeError:
        return False
    return True

# Keep, in every split, only the rows whose comment is pure ASCII.
df_trn, df_val, df_tst = [
    split[split['cmt'].apply(is_ascii)]
    for split in (df_trn, df_val, df_tst)
]

len(df_trn), len(df_val), len(df_tst)

(463292, 15792, 30322)

**2.remove any examples that have the special `<code>` tag**

In [None]:
def has_code(cmt: str) -> bool:
    """Tell whether the comment contains the literal ``<code>`` opening tag."""
    return '<code>' in cmt

# Drop, in every split, the rows whose comment contains a <code> tag.
df_trn, df_val, df_tst = [
    split[~split['cmt'].apply(has_code)]
    for split in (df_trn, df_val, df_tst)
]

len(df_trn), len(df_val), len(df_tst)

(434548, 14836, 28664)

**3.remove the JavaDoc parts of the comments**

In [None]:
import re

def remove_jdocs(df: pd.DataFrame) -> pd.DataFrame:
    """Strip JavaDoc markup from the ``cmt`` column.

    For every comment, each span that opens with ``{`` or ``[`` and closes
    with the nearest ``)`` or ``}`` on the same line is deleted. The comment
    is then cut at the first line containing ``@``: that line and everything
    after it are dropped.

    :param df: DataFrame with the string columns ``mthd`` and ``cmt``
    :returns: new DataFrame with the columns ``mthd`` (copied unchanged) and
        ``cmt`` (cleaned), indexed 0..n-1
    """
    # Raw string: in a plain literal "\{", "\[", "\)" and "\}" are invalid
    # escape sequences, which Python reports with a warning.
    # NOTE(review): a span may open with "[" but only closes on ")" or "}",
    # never on "]" - confirm that this is intended.
    markup = re.compile(r"([\{\[]).*?([\)\}])")

    methods = []
    comments = []
    # Iterating the two columns directly avoids building one Series per row.
    for mthd, comment in tqdm(list(zip(df["mthd"], df["cmt"]))):
        comment = markup.sub('', comment)

        kept = []
        for line in comment.split('\n'):
            if "@" in line: break
            kept.append(line)
        comments.append('\n'.join(kept))
        methods.append(mthd)

    return pd.DataFrame({"mthd": methods, "cmt": comments})

# Run the JavaDoc clean-up on every split; each call returns a newly built DataFrame.
df_trn, df_val, df_tst = [remove_jdocs(split) for split in (df_trn, df_val, df_tst)]

len(df_trn), len(df_val), len(df_tst)

  0%|          | 0/434548 [00:00<?, ?it/s]

  0%|          | 0/14836 [00:00<?, ?it/s]

  0%|          | 0/28664 [00:00<?, ?it/s]

(434548, 14836, 28664)

**4.remove any HTML tags from the comments**

In [None]:
# Opening or closing form of the HTML tags removed by clean_html.
# "/?" accepts only an optional slash, and "\b" ends the tag name, so that
# look-alikes such as <params> or <Map> are left alone. Matching is case-sensitive.
_HTML_TAG_RE = re.compile(
    r"</?(?:span|code|p|hr|h[1-3]|a|b|blockquote|del|dd|dl|dt|em|i|img|kbd|li|ol"
    r"|pre|s|sup|sub|strong|strike|ul|br)\b[^>]*>"
)


def clean_html(cmt: str) -> str:
    """Remove the known HTML tags from a comment, keeping the text between them.

    :param cmt: comment text
    :returns: ``cmt`` without the opening, closing and self-closing forms of
        the tags listed in ``_HTML_TAG_RE`` (attributes included); all other
        text, including unknown ``<...>`` sequences, is left untouched
    """
    return _HTML_TAG_RE.sub("", cmt)

# Strip the HTML tags from the comment column of every split, in place.
for split in (df_trn, df_val, df_tst):
    split['cmt'] = split['cmt'].apply(clean_html)

len(df_trn), len(df_val), len(df_tst)

(434548, 14836, 28664)

**5.make everything lower case, remove extra whitespace, remove empty comments, and remove duplicates**

In [None]:
def _normalize_split(df):
    """Lower-case every cell and collapse its whitespace, then drop the rows
    whose comment is empty or repeats the comment of an earlier row.

    :param df: DataFrame of strings with a ``cmt`` column
    :returns: the normalized, filtered DataFrame (original index values kept)
    """
    def squash(text):
        # split() without arguments splits on any run of whitespace
        return ' '.join(text.split()).lower()

    # Column-wise Series.map does the same element-wise work as
    # DataFrame.applymap, which newer pandas versions deprecate.
    df = df.apply(lambda col: col.map(squash))
    df = df[df['cmt'] != '']
    # duplicated() marks every repeat after the first occurrence
    return df[~df['cmt'].duplicated()]

df_trn = _normalize_split(df_trn)
df_val = _normalize_split(df_val)
df_tst = _normalize_split(df_tst)

len(df_trn), len(df_val), len(df_tst)

(324029, 11370, 23074)

In [None]:
# Show the first rows of the cleaned training split
df_trn.head(10)

Unnamed: 0,mthd,cmt
0,private void repeatconfiguntilunsubscribed(fin...,helper method to push configs until unsubscrib...
1,"public map<string, string> getcontrols(version...","returns the control settings, adjusted for the..."
2,public static mimemessage createmimemessage(se...,"creates a mime-message, multipart if attacheme..."
3,public static string getapplicationhashkey(str...,"your app key hash is required for example, for..."
4,public void handleexception(throwable e) { whi...,/////////////////////////////
5,private void mapclassresourceentry(string file...,maps (class) resources to files.
6,public arraynode filter(comparisonexpression e...,allows filtering values in a arraynode as per ...
7,public static final <t extends date> function<...,it creates an from the input elements. the wil...
8,public void removeallqueriesandgroups() { list...,removes all the elements from the vector query...
9,"private response addcontent(string spaceid, st...",see contentresource.addcontent()


In [None]:
# Show the last rows of the cleaned training split
df_trn.tail(5)

Unnamed: 0,mthd,cmt
434541,for (resourceevents resourcelistener : _listen...,// deliver the onrelease event to the register...
434542,"_containercontext.removeresourcecontext(this, ...",// unregister this resourcecontext with associ...
434543,_hasacquired = false;,// reset the flag to indicate resources have b...
434545,t retvalue = data; data = null; return retvalue;,// return and clear current
434546,return typeframe.gettoptype();,// this probably means that we're looking at a...


In [None]:
# Write the cleaned training split to the Excel workbook A4.xlsx in the working directory.
# NOTE(review): only df_trn is exported here; df_val and df_tst are not saved by this cell.
df_trn.to_excel("A4.xlsx")