In [20]:
import pandas as pd
import numpy as np

# 1. Removing empty (NaN) values

In [2]:
# Load the raw commit data and check it for missing values.
df = pd.read_csv("data/commits.csv")
# Print the per-column NaN counts explicitly: a notebook cell only displays its
# last expression, so a bare `df.isnull().sum()` in the middle of the cell is
# computed and then silently discarded.
print(df.isnull().sum())
# Number of rows loaded (displayed as the cell result).
len(df)

87071

In [3]:
# Remove every row that has a missing value in any column. The data is not
# expected to contain any, but dropping anyway is cheap and keeps later steps safe.
df = df.dropna(axis=0, how="any")
# Disabled: drop the sha/remote_url columns and export the result to data/message.csv.
# df = df.drop(columns=['sha', 'remote_url'], axis=1)
# df.to_csv('data/message.csv', index=False)

# Lowercasing all messages & dropping rows whose message is longer than 512 characters

In [4]:
# Longest commit message to keep, measured in characters (not tokens).
# Adjust it with your model's max context length in mind.
MAX_MESSAGE_CHARS = 512

# Lower-case every message with the vectorised .str accessor rather than a
# per-row apply().
df['message'] = df['message'].str.lower()
# Keep only the rows whose message fits within the character limit.
df = df[df['message'].str.len() <= MAX_MESSAGE_CHARS]

# Debug aid (disabled): show the longest remaining message and its length.
# longest_string = max(df['message'], key=len)
# print(longest_string, len(longest_string))

# Number of rows left after filtering (displayed as the cell result).
len(df)

82937

# Proportionate Stratified Sampling

In [5]:
# Print how many rows belong to each repository URL, in sorted URL order.
repo_column = df['remote_url']
repo_types = np.unique(repo_column.tolist())
for repo in repo_types:
    matching_rows = df[repo_column == repo]
    print(f"{repo}: {len(matching_rows)}")

https://github.com/apache/httpd: 32270
https://github.com/nginx/nginx: 7621
https://github.com/openssh/openssh-portable: 11964
https://github.com/openssl/openssl: 31082


# Formula: (sample size/population size) × stratum size

In [8]:
# Split the data into one stratum (sub-frame) per repository.
remote_urls = df['remote_url']
httpd_sample = df.loc[remote_urls.eq("https://github.com/apache/httpd")]
nginx_sample = df.loc[remote_urls.eq("https://github.com/nginx/nginx")]
openssh_sample = df.loc[remote_urls.eq("https://github.com/openssh/openssh-portable")]
openssl_sample = df.loc[remote_urls.eq("https://github.com/openssl/openssl")]

# Each stratum contributes 20% of its rows, rounded to a whole number.
(
    httpd_sample_size,
    nginx_sample_size,
    openssh_sample_size,
    openssl_sample_size,
) = [
    round(0.2 * len(stratum))
    for stratum in (httpd_sample, nginx_sample, openssh_sample, openssl_sample)
]

# Report the httpd stratum's share and the combined sample size.
print(httpd_sample_size)
total_sample_size = sum(
    (httpd_sample_size, nginx_sample_size, openssh_sample_size, openssl_sample_size)
)
print("Total Sampling Size: ", total_sample_size)

6454
Total Sampling Size:  16587


In [29]:
# Proportionate stratified sample: draw 20% of the rows from each repository,
# with a fixed seed so the draw is reproducible.
sample_df = df.groupby("remote_url", group_keys=False).apply(
    lambda stratum: stratum.sample(frac=0.2, random_state=1)
)
# Build the mask from sample_df itself. A mask computed on `df` only works
# through implicit index alignment and breaks as soon as the two frames'
# indexes diverge (e.g. after a reset_index on the sample).
httpd_sampless = sample_df.loc[sample_df['remote_url'] == "https://github.com/apache/httpd"]
httpd_sampless

Unnamed: 0,message,sha,remote_url
64232,* modules/ldap/util_ldap_cache.c (modldap_shme...,54b8395a53f884e60bb9661012890782f9deaa5b,https://github.com/apache/httpd
50547,aplogno tagging\n\ngit-svn-id: https://svn.apa...,aca09955d164fe203d15e03662183d455e9ed92c,https://github.com/apache/httpd
75743,"update the ""bind"" docs, remove the stuff from ...",180ef301d6d26602bb0a59871a8818c14e62968f,https://github.com/apache/httpd
60633,roll on to 2.3.2.\n\n\ngit-svn-id: https://svn...,ba2883e9b20ab32f47f87e04449b20023a4400e8,https://github.com/apache/httpd
48229,rebuild\n\ngit-svn-id: https://svn.apache.org/...,b59b7bdfa6f6d4248169597cea493c7fa8554f4b,https://github.com/apache/httpd
...,...,...,...
72685,fix typos...\n\n\ngit-svn-id: https://svn.apac...,d5fe728b25b8030a7afc93f6d166ce851788640f,https://github.com/apache/httpd
67009,style\n\n\ngit-svn-id: https://svn.apache.org/...,0b7de2e55c33b41e05b74e0f605446bc03b85602,https://github.com/apache/httpd
58092,remove those files and enter the html transfor...,f3ac1d322f1ad754228190849fd30e164d276cdb,https://github.com/apache/httpd
68221,"ah, we don't even need the 'ls -1' - 'ls' suff...",f51a3c83cc106cc2de3beadeb2dcf1f27f7a206f,https://github.com/apache/httpd


In [None]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling
import torch
from datasets import Dataset

# Load the dataset with only the message

In [None]:
# Build the Hugging Face dataset from the commit messages only.
# Selecting the `message` column up front replaces the separate
# remove_columns(['sha','remote_url']) step. preserve_index=False stops a
# non-default pandas index (e.g. the one left after filtering rows) from being
# stored as an extra `__index_level_0__` feature.
dataset = Dataset.from_pandas(df[['message']], preserve_index=False)
dataset

Dataset({
    features: ['message'],
    num_rows: 120637
})

# Tokenizer

In [None]:
# Checkpoint whose tokenizer is loaded; replace with your own model's name.
model_name = "microsoft/codebert-base"
# Token length every example is padded/truncated to; set to your model's max context length.
context_length = 512
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_pad_and_truncate(texts):
    """Tokenize the 'message' entries of a batch to a fixed length.

    Args:
        texts: Batch mapping that provides the commit messages under 'message'.

    Returns:
        The tokenizer output, truncated and padded to `context_length` tokens.
    """
    encoded = tokenizer(
        texts['message'],
        max_length=context_length,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer batch-wise over the whole dataset, then show the first record.
tokenized_dataset = dataset.map(tokenize_pad_and_truncate, batched=True)
tokenized_dataset[0]


# inputs = tokenizer('Fix: bug racing event', return_tensors='pt', max_length=5, truncation=True, padding='max_length')
# print(inputs)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'message': 'log an error if bio_write(3) fails\n\n\ngit-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1910269 13f79535-47bb-0310-9956-ffa450edef68\n',
 'input_ids': [0,
  12376,
  41,
  5849,
  114,
  10709,
  1215,
  29631,
  1640,
  246,
  43,
  10578,
  50140,
  50118,
  20901,
  12,
  36245,
  282,
  12,
  808,
  35,
  1205,
  640,
  36245,
  282,
  4,
  48530,
  4,
  1957,
  73,
  241,
  11474,
  73,
  281,
  506,
  73,
  8166,
  417,
  73,
  8166,
  417,
  73,
  4328,
  6435,
  1039,
  1646,
  698,
  31416,
  508,
  506,
  36346,
  2022,
  12,
  3706,
  14141,
  12,
  3933,
  698,
  12,
  2831,
  4419,
  12,
  3145,
  102,
  13872,
  196,
  4550,
  4671,
  50118,
  2,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'attention_

In [None]:
# Rebuild `dataset` from `df` and show its first record.
# preserve_index=False keeps a non-default pandas index (e.g. the one left after
# filtering rows) from being added as an extra `__index_level_0__` feature.
dataset = Dataset.from_pandas(df, preserve_index=False)
dataset[0]

{'message': 'log an error if BIO_write(3) fails\n\n\ngit-svn-id: https://svn.apache.org/repos/asf/httpd/httpd/trunk@1910269 13f79535-47bb-0310-9956-ffa450edef68\n'}

In [None]:
model_name = "microsoft/codebert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)